-- SimMatchImpl.mesa
-- Implements a similarity matching routine for PatternMatch by calling out to C code.
-- The module tokenizes the message text, splits the threshold off the front of the
-- query, asks the C routine "similarity" for a score and reports the outcome.

DIRECTORY
  Ascii USING [Letter, Lower, SP],
  BlackCherry USING [Report],
  IO USING [EndOfStream, EndOf, Error, GetChar, GetInt, GetLineRope, int, PutChar, RIS, rope, ROS, RopeFromROS, STREAM],
  PatternMatch USING [Register, MatchProc],
  PFS USING [PathFromRope],
  PFSExtras USING [PFSNameToUnixName],
  Rope USING [ROPE],
  SimMatch,
  UnixTypes USING [CHARPtr],
  UserProfile USING [Token],
  UXStrings USING [Create];

SimMatchImpl: CEDAR PROGRAM
  IMPORTS Ascii, BlackCherry, IO, PatternMatch, PFS, PFSExtras, UserProfile, UXStrings
  EXPORTS SimMatch
~ BEGIN

  ROPE: TYPE ~ Rope.ROPE;

  -- The following path name must be in lower case.
  -- It can be overridden by the user profile key "Tapestry.DocFreqName" (see GetProfileInfo).
  docFreqFileName: ROPE _ ".DFs.tapestry";

  -- Stoplist: file of common words that are removed from queries and document vectors.
  stoplistName: ROPE _ "/pcedar2.0/famousfiles/stoplist.tapestry";

  -- Lexicon: dictionary used by the stemmer when reducing words to their root (stem) form.
  stemmerName: ROPE _ "/pcedar2.0/famousfiles/englishlexicon.tapestry";

  -- TRUE until the user profile has been consulted once.
  checkProfile: BOOLEAN _ TRUE;

  -- Unix forms of stoplistName and stemmerName, computed in the start code below.
  newStoplistName, newStemmerName: ROPE;

  -- "similarity" is the name of the procedure in the C code; it is invoked with three arguments.
  CSimilarityCode: PROC [query, doc: UnixTypes.CHARPtr, pparsed: REF ANY _ NIL] RETURNS [INT] = TRUSTED MACHINE CODE {
    "similarity"
    };

  -- C entry points that load the document frequency file, the stoplist and the stemmer lexicon.
  CDocFreqCode: PROC [filename: UnixTypes.CHARPtr] = TRUSTED MACHINE CODE {
    "docfreq_readDFs"
    };

  CCreateStoplist: PROC [filename: UnixTypes.CHARPtr] = TRUSTED MACHINE CODE {
    "create_stoplist"
    };

  CCreateStemmer: PROC [filename: UnixTypes.CHARPtr] = TRUSTED MACHINE CODE {
    "create_stemmer"
    };

  Similar: PUBLIC PatternMatch.MatchProc = {
    -- [value: ROPE, pattern: ROPE, pparsed: REF _ NIL]
    --   RETURNS [match: BOOLEAN, nothingGreater: BOOLEAN _ FALSE, pparsedNew: REF _ NIL]
    -- Compares the value and pattern for similarity.
    -- The pattern is "<threshold> <query text>"; match is TRUE when the score computed
    -- by the C routine is at least the threshold. A pattern without a readable
    -- threshold or without query text is reported as a problem and does not match.
    score: INT _ 0;
    stream: IO.STREAM;
    threshold: INT _ 0;

    -- Tokenize the value string (the pattern is already tokenized).
    stream _ IO.RIS[value];
    stream _ Tokenize[stream];
    value _ IO.RopeFromROS[stream];

    -- The threshold was prepended to the query text (pattern), so remove it here.
    -- IO.Error is caught as well as IO.EndOfStream so that a pattern that does not
    -- start with an integer takes the "Problem with query!" exit instead of
    -- propagating an uncaught error to the caller.
    stream _ IO.RIS[pattern];
    threshold _ IO.GetInt[stream ! IO.EndOfStream, IO.Error => GOTO End];
    pattern _ IO.GetLineRope[stream ! IO.EndOfStream => GOTO End];

    score _ CSimilarityCode[query: UXStrings.Create[pattern], doc: UXStrings.Create[value], pparsed: pparsed]; -- Call C routine

    IF score >= threshold THEN {
      BlackCherry.Report["\nMatched! Metric %g >= threshold %g\n", IO.int[score], IO.int[threshold]];
      BlackCherry.Report["\tMessage: %g\n", IO.rope[value]];
      BlackCherry.Report["\tPattern: %g\n", IO.rope[pattern]];
      match _ TRUE;
      }
    ELSE {
      -- NOTE: the literal below contains a raw line break, exactly as in the original text.
      BlackCherry.Report["\nMessage not interesting! 
Metric %g < threshold %g\n", IO.int[score], IO.int[threshold]];
      match _ FALSE;
      };

    -- Return the named results. A bare RETURN is used on purpose: RETURN [match]
    -- would build a new result record in which pparsedNew takes its declared
    -- default (NIL), discarding the assignment below.
    pparsedNew _ pparsed;
    RETURN;

    EXITS
      End => {BlackCherry.Report["Problem with query!\n"]; RETURN [FALSE];};
    };

  Tokenize: PUBLIC PROC [text: IO.STREAM] RETURNS [tokens: IO.STREAM] ~ {
    -- Effects: Given input stream "text" consisting of characters, returns a new
    -- output stream (created with IO.ROS) holding the chunks of alphabetic characters
    -- of "text" in lower case. Non-alphabetic characters are skipped, and each chunk
    -- is followed by exactly one space character.
    -- Every letter of the input is emitted, including a letter that is the very last
    -- character of the stream (the previous two-helper scanner tested for end of
    -- stream before emitting the character it already held, and so lost it).
    char: CHAR;
    inChunk: BOOLEAN _ FALSE; -- TRUE while the most recently read character was a letter
    tokens _ IO.ROS[];
    WHILE NOT IO.EndOf[text] DO
      char _ IO.GetChar[text];
      IF Ascii.Letter[char] THEN {
        IO.PutChar[tokens, Ascii.Lower[char]];
        inChunk _ TRUE;
        }
      ELSE IF inChunk THEN {
        -- First non-letter after a chunk: close the chunk with a single separator.
        IO.PutChar[tokens, Ascii.SP];
        inChunk _ FALSE;
        };
      ENDLOOP;
    -- Close a chunk that runs up to the end of the input.
    IF inChunk THEN IO.PutChar[tokens, Ascii.SP];
    };

  GetProfileInfo: PROC [] ~ {
    -- Per-user profile information.
    -- Reads "Tapestry.DocFreqName" from the user profile the first time it is called,
    -- keeping the current docFreqFileName as the default; later calls do nothing.
    IF checkProfile THEN {
      docFreqFileName _ UserProfile.Token[key: "Tapestry.DocFreqName", default: docFreqFileName];
      checkProfile _ FALSE;
      };
    };

  -- Start code.
  GetProfileInfo[];

  -- Must use unix path name for file because these famousfiles are stored in a vux directory.
  newStoplistName _ PFSExtras.PFSNameToUnixName[PFS.PathFromRope[stoplistName]];
  newStemmerName _ PFSExtras.PFSNameToUnixName[PFS.PathFromRope[stemmerName]];
  CCreateStoplist[filename: UXStrings.Create[newStoplistName]];
  CCreateStemmer[filename: UXStrings.Create[newStemmerName]];

  CDocFreqCode[filename: UXStrings.Create[docFreqFileName]];

  -- Registration
  PatternMatch.Register["sim", Similar];

END.
’ SimMatchImpl.mesa Copyright Σ 1990 by Xerox Corporation. All rights reserved. Sabel, August 23, 1990 3:26 pm PDT Brian Oki, April 16, 1991 9:05 am PDT Implements a similarity routine, based on TDB's. Following path name must be in lower case. 
Stoplist.tapestry is a file containing all the common words that should be removed from queries and document vectors, since they contribute nothing and don't allow you to distinguish one document from another EnglishLexicon.tapestry is a file containing the dictionary used by the stemmer when reducing words to their root, or stem, form. "similarity" is the name of the procedure in the C code invoked with three arguments. [value: ROPE, pattern: ROPE, pparsed: REF _ NIL] RETURNS [match: BOOLEAN, nothingGreater: BOOLEAN _ FALSE, pparsedNew: REF _ NIL] Compares the value and pattern for similarity. Tokenize value string (pattern is already tokenized) Threshold value was prepended to query text (pattern), so remove it. Later, when the similarity score is calculated, we compare the score with the threshold. Effects: Given input stream "text" consisting of characters, returns a new stream consisting of chunks of alphabetic characters in lower case, each chunk separated by a space character. Skip over non-alphabetic characters. Per-user profile information Must use unix path name for file because these famousfiles are stored in a vux directory. Registration Κz•NewlineDelimiter – "cedar" style™šœ™Icodešœ<™K˜KšœkΟc˜|šœœ˜Kšœ=œ œ˜_K˜Kšœ&œ˜6Kšœ&œ˜8K˜Kšœœ˜ K˜—šœ˜KšœLœ œ˜nKšœœ˜K˜K˜—Kšœ˜Kšœ˜K˜š˜Kšœ5œœ˜F—Kšœ˜K˜—šŸœœœ œœœ œœ˜IK™ΊK˜šŸ œœ œœ œœœœœ˜^Kšœ œ˜š œœœœœ˜EKšœ+˜-Kšœ œ˜ Kšœ˜—K˜K˜—šŸ œœ œœœœœ˜JK™%Kšœ œ˜š œœœœœ˜FKšœ œ˜ Kšœ˜—K˜K˜—K˜Kšœœ˜ Kš œœœœœ˜šœœœ˜Kšœœ˜šœœœ˜#K˜%K˜—K˜ Kšœœ˜šœœœœ˜ Kšœ$˜&K˜—Kšœ˜—K˜K˜K˜—head™šŸœœ˜•StartOfExpansion‚[filterDB: ROPE, user: ROPE, filterName: ROPE, query: ROPE, annot: TapFilter.Annotation, agent: TapFilter.Agent _ NIL]šœœ˜Kšœ[˜[Kšœœ˜K˜—K˜K˜——K˜J˜J™YJšœ.œ˜NJšœ-œ˜LKšœ=˜=Kšœ;˜;K˜Kšœ:˜:K˜L™ K˜Kšœ&˜&—Kšœ˜K˜K˜K™K™—…—Žš