<<SimMatchImpl: implementation of the SimMatch interface.>>
<<Registers a pattern matcher named "sim" with PatternMatch. The matcher scores a message against a query by calling external C routines and compares the score with a threshold taken from the pattern.>>

DIRECTORY
  Ascii USING [Letter, Lower, SP],
  BlackCherry USING [Report],
  IO USING [EndOfStream, EndOf, GetChar, GetInt, GetLineRope, int, PutChar, RIS, rope, ROS, RopeFromROS, STREAM],
  PatternMatch USING [Register, MatchProc],
  PFS USING [PathFromRope],
  PFSExtras USING [PFSNameToUnixName],
  Rope USING [ROPE],
  SimMatch,
  UnixTypes USING [CHARPtr],
  UserProfile USING [Token],
  UXStrings USING [Create];

SimMatchImpl: CEDAR PROGRAM
  IMPORTS Ascii, BlackCherry, IO, PatternMatch, PFS, PFSExtras, UserProfile, UXStrings
  EXPORTS SimMatch
~ BEGIN

  ROPE: TYPE ~ Rope.ROPE;

  <<File name handed to the C routine "docfreq_readDFs". This is only the default: GetProfileInfo replaces it with the user profile entry "Tapestry.DocFreqName" when one is present.>>
  docFreqFileName: ROPE _ ".DFs.tapestry";

  <<File name handed (after PFS-to-Unix name translation) to the C routine "create_stoplist".>>
  stoplistName: ROPE _ "/pcedar2.0/famousfiles/stoplist.tapestry";

  <<File name handed (after PFS-to-Unix name translation) to the C routine "create_stemmer".>>
  stemmerName: ROPE _ "/pcedar2.0/famousfiles/englishlexicon.tapestry";

  <<TRUE until GetProfileInfo has consulted the user profile once.>>
  checkProfile: BOOLEAN _ TRUE;

  <<Unix-style forms of stoplistName and stemmerName, filled in by the start code below.>>
  newStoplistName, newStemmerName: ROPE;

  <<"similarity" is the name of the procedure in the C code invoked with three arguments.>>
  CSimilarityCode: PROC [query, doc: UnixTypes.CHARPtr, pparsed: REF ANY _ NIL] RETURNS [INT] = TRUSTED MACHINE CODE {
    "similarity"
    };

  <<Binds to the C procedure "docfreq_readDFs".>>
  CDocFreqCode: PROC [filename: UnixTypes.CHARPtr] = TRUSTED MACHINE CODE {
    "docfreq_readDFs"
    };

  <<Binds to the C procedure "create_stoplist".>>
  CCreateStoplist: PROC [filename: UnixTypes.CHARPtr] = TRUSTED MACHINE CODE {
    "create_stoplist"
    };

  <<Binds to the C procedure "create_stemmer".>>
  CCreateStemmer: PROC [filename: UnixTypes.CHARPtr] = TRUSTED MACHINE CODE {
    "create_stemmer"
    };

  Similar: PUBLIC PatternMatch.MatchProc = {
    <<[value: ROPE, pattern: ROPE, pparsed: REF _ NIL] RETURNS [match: BOOLEAN, nothingGreater: BOOLEAN _ FALSE, pparsedNew: REF _ NIL]>>
    <<value is the message text; it is run through Tokenize before scoring.>>
    <<pattern must start with an integer threshold; the remainder of that line is the query text.>>
    <<match is TRUE when the score returned by the C routine is at least the threshold. If the pattern runs out before both parts have been read, "Problem with query!" is reported and FALSE is returned.>>
    score: INT _ 0;
    stream: IO.STREAM;
    threshold: INT _ 0;

    -- Normalise the message: lower-cased words separated by single spaces.
    stream _ IO.RIS[value];
    stream _ Tokenize[stream];
    value _ IO.RopeFromROS[stream];

    -- Split the pattern into its numeric threshold and the query text.
    stream _ IO.RIS[pattern];
    threshold _ IO.GetInt[stream ! IO.EndOfStream => GOTO End];
    pattern _ IO.GetLineRope[stream ! IO.EndOfStream => GOTO End];

    score _ CSimilarityCode[query: UXStrings.Create[pattern], doc: UXStrings.Create[value], pparsed: pparsed]; -- Call C routine

    IF score >= threshold THEN {
      BlackCherry.Report["\nMatched! Metric %g >= threshold %g\n", IO.int[score], IO.int[threshold]];
      BlackCherry.Report["\tMessage: %g\n", IO.rope[value]];
      BlackCherry.Report["\tPattern: %g\n", IO.rope[pattern]];
      match _ TRUE;
      }
    ELSE {
      BlackCherry.Report["\nMessage not interesting! Metric %g < threshold %g\n", IO.int[score], IO.int[threshold]];
      match _ FALSE;
      };
    pparsedNew _ pparsed;
    -- NOTE(review): only the first result is listed in this RETURN; confirm that the assignment to pparsedNew above still reaches the caller rather than the declared default.
    RETURN[match];
    EXITS
      End => {BlackCherry.Report["Problem with query!\n"]; RETURN [FALSE];};
    };

  Tokenize: PUBLIC PROC [text: IO.STREAM] RETURNS [tokens: IO.STREAM] ~ {
    <<Reads text to its end and returns a rope output stream (IO.ROS) holding the normalised form; callers extract the result with IO.RopeFromROS.>>
    <<Every letter is copied in lower case. Every maximal run of non-letters, including one at the very start of the input, becomes a single space. A word that ends the input is followed by one space. Empty input produces empty output.>>
    <<Every character is classified before end-of-stream is tested again, so a letter that is the last character of the input is emitted like any other.>>
    output: IO.STREAM _ IO.ROS[];
    char: CHAR;
    atStart: BOOLEAN _ TRUE; -- no character has been consumed yet
    inWord: BOOLEAN _ FALSE; -- the previous character was a letter
    WHILE NOT IO.EndOf[text] DO
      char _ IO.GetChar[text];
      IF Ascii.Letter[char] THEN {
        IO.PutChar[output, Ascii.Lower[char]];
        inWord _ TRUE;
        }
      ELSE {
        -- Emit one separator per run of non-letters: at the start of input or right after a word.
        IF atStart OR inWord THEN IO.PutChar[output, Ascii.SP];
        inWord _ FALSE;
        };
      atStart _ FALSE;
      ENDLOOP;
    -- Terminate a word that runs up to the end of the input.
    IF inWord THEN IO.PutChar[output, Ascii.SP];
    tokens _ output;
    };

  <<Consults the user profile once (guarded by checkProfile) for "Tapestry.DocFreqName"; the current docFreqFileName is used as the default when the profile has no such entry.>>
  GetProfileInfo: PROC [] ~ {
    IF checkProfile THEN {
      docFreqFileName _ UserProfile.Token[key: "Tapestry.DocFreqName", default: docFreqFileName];
      checkProfile _ FALSE;
      };
    };

  <<Start code: runs when the module is started.>>
  GetProfileInfo[];

  -- Translate the PFS path names to Unix names before passing them to the C code.
  newStoplistName _ PFSExtras.PFSNameToUnixName[PFS.PathFromRope[stoplistName]];
  newStemmerName _ PFSExtras.PFSNameToUnixName[PFS.PathFromRope[stemmerName]];

  -- Hand the three file names to the C routines.
  CCreateStoplist[filename: UXStrings.Create[newStoplistName]];
  CCreateStemmer[filename: UXStrings.Create[newStemmerName]];
  CDocFreqCode[filename: UXStrings.Create[docFreqFileName]];

  -- Make the matcher available under the name "sim".
  PatternMatch.Register["sim", Similar];

  END.