DIRECTORY
Ascii USING [Letter, Lower, SP],
BlackCherry USING [Report],
IO USING [EndOf, EndOfStream, Error, GetChar, GetInt, GetLineRope, int, PutChar, RIS, rope, ROS, RopeFromROS, STREAM],
PatternMatch USING [Register, MatchProc],
PFS USING [PathFromRope],
PFSExtras USING [PFSNameToUnixName],
Rope USING [ROPE],
SimMatch,
UnixTypes USING [CHARPtr],
UserProfile USING [Token],
UXStrings USING [Create];
~
BEGIN
-- Module-wide type abbreviation and settings.
ROPE: TYPE = Rope.ROPE;
-- Name of the document-frequency file. GetProfileInfo may replace it with the user's "Tapestry.DocFreqName" profile entry.
docFreqFileName: ROPE ← ".DFs.tapestry";
-- The following path names must be in lower case.
-- Stoplist.tapestry is a file containing all the common words that should be removed from queries and document vectors, since they contribute nothing and do not help distinguish one document from another.
stoplistName: ROPE ← "/pcedar2.0/famousfiles/stoplist.tapestry";
-- EnglishLexicon.tapestry is a file containing the dictionary used by the stemmer when reducing words to their root, or stem, form.
stemmerName: ROPE ← "/pcedar2.0/famousfiles/englishlexicon.tapestry";
-- TRUE until GetProfileInfo has consulted the user profile once.
checkProfile: BOOL ← TRUE;
-- Unix-style forms of stoplistName and stemmerName; assigned by the module start code.
newStoplistName: ROPE;
newStemmerName: ROPE;
-- "similarity" is the name of the procedure in the C code invoked with three arguments (bound by CSimilarityCode).
-- Binding to the C routine named "similarity".
-- Arguments: query and doc are passed as C strings (UnixTypes.CHARPtr); pparsed is an optional REF that defaults to NIL.
-- Returns: an INT, which Similar treats as the similarity score and compares against the query threshold.
CSimilarityCode: PROC [query, doc: UnixTypes.CHARPtr, pparsed: REF ← NIL] RETURNS [INT] =
  TRUSTED MACHINE CODE {
    "similarity"
    };
-- Binding to the C routine named by the string literal below; it takes a file name as a C string and returns nothing.
-- The module start code calls it once with docFreqFileName.
-- NOTE(review): presumably this reads the document-frequency table into the C side -- confirm against the C source.
CDocFreqCode: PROC [filename: UnixTypes.CHARPtr] =
  TRUSTED MACHINE CODE {
    "docfreq←readDFs"
    };
-- Binding to the C routine named by the string literal below; it takes a file name as a C string and returns nothing.
-- The module start code calls it once with the Unix form of stoplistName.
-- NOTE(review): presumably this builds the stoplist inside the C code -- confirm against the C source.
CCreateStoplist: PROC [filename: UnixTypes.CHARPtr] =
  TRUSTED MACHINE CODE {
    "create←stoplist"
    };
-- Binding to the C routine named by the string literal below; it takes a file name as a C string and returns nothing.
-- The module start code calls it once with the Unix form of stemmerName.
-- NOTE(review): presumably this loads the stemmer's lexicon inside the C code -- confirm against the C source.
CCreateStemmer: PROC [filename: UnixTypes.CHARPtr] =
  TRUSTED MACHINE CODE {
    "create←stemmer"
    };
Similar: PUBLIC PatternMatch.MatchProc = {
  -- [value: ROPE, pattern: ROPE, pparsed: REF ← NIL] RETURNS [match: BOOLEAN, nothingGreater: BOOLEAN ← FALSE, pparsedNew: REF ← NIL]
  -- Compares the value and pattern for similarity.
  -- The pattern is expected to be an integer threshold followed by the (already tokenized) query text.
  -- Returns match = TRUE when the score computed by the C routine is >= that threshold; on success pparsedNew is set to pparsed.
  -- A pattern that is empty, lacks a valid integer threshold, or has no query text after it is reported as a problem and yields match = FALSE.
  score: INT ← 0;
  threshold: INT ← 0;
  stream: IO.STREAM;

  -- Tokenize value string (pattern is already tokenized)
  stream ← IO.RIS[value];
  stream ← Tokenize[stream];
  value ← IO.RopeFromROS[stream];

  -- Threshold value was prepended to query text (pattern), so remove it. Later, when the similarity score is calculated, we compare the score with the threshold.
  stream ← IO.RIS[pattern];
  -- IO.Error is caught as well as IO.EndOfStream so that a pattern whose threshold is not a valid integer takes the same "Problem with query!" path instead of escaping as an uncaught error.
  threshold ← IO.GetInt[stream ! IO.EndOfStream, IO.Error => GOTO End];
  pattern ← IO.GetLineRope[stream ! IO.EndOfStream, IO.Error => GOTO End];

  score ← CSimilarityCode[query: UXStrings.Create[pattern], doc: UXStrings.Create[value], pparsed: pparsed]; -- Call C routine
  IF score >= threshold
    THEN {
      BlackCherry.Report["\nMatched! Metric %g >= threshold %g\n", IO.int[score], IO.int[threshold]];
      BlackCherry.Report["\tMessage: %g\n", IO.rope[value]];
      BlackCherry.Report["\tPattern: %g\n", IO.rope[pattern]];
      match ← TRUE;
      }
    ELSE {
      BlackCherry.Report["\nMessage not interesting! Metric %g < threshold %g\n", IO.int[score], IO.int[threshold]];
      match ← FALSE;
      };
  pparsedNew ← pparsed;
  RETURN[match];
  EXITS
    End => {BlackCherry.Report["Problem with query!\n"]; RETURN [FALSE];};
  };
Tokenize: PUBLIC PROC [text: IO.STREAM] RETURNS [tokens: IO.STREAM] ~ {
  -- Effects: Given input stream "text" consisting of characters, returns a new output stream (created with IO.ROS) holding the chunks of alphabetic characters from the input in lower case, each chunk followed by a single space character.
  -- Every letter of the input is emitted, including a letter that is the very last character of the stream (checking for end of stream before handling the character already read would drop it).
  -- If the input begins with a non-alphabetic character, the output begins with one space.
  -- An empty input yields an empty output stream.
  output: IO.STREAM ← IO.ROS[];
  char: CHAR;
  inChunk: BOOLEAN ← FALSE; -- TRUE while the most recently consumed character was a letter
  atStart: BOOLEAN ← TRUE; -- TRUE until the first input character has been consumed
  WHILE NOT IO.EndOf[text] DO
    char ← IO.GetChar[text];
    IF Ascii.Letter[char]
      THEN {
        IO.PutChar[output, Ascii.Lower[char]];
        inChunk ← TRUE;
        }
      ELSE IF inChunk OR atStart THEN {
        -- First non-letter after a chunk (or at the very start of the input): emit one separator; the rest of the non-alphabetic run is skipped.
        IO.PutChar[output, Ascii.SP];
        inChunk ← FALSE;
        };
    atStart ← FALSE;
    ENDLOOP;
  -- A chunk that runs to the end of the input still gets its trailing separator.
  IF inChunk THEN IO.PutChar[output, Ascii.SP];
  tokens ← output;
  };
-- Per-user profile information
GetProfileInfo: PROC [] ~ {
  -- Reads the "Tapestry.DocFreqName" user-profile entry into docFreqFileName, keeping the current value as the default.
  -- The profile is consulted only while checkProfile is TRUE; the flag is cleared afterwards, so later calls do nothing.
  IF NOT checkProfile THEN RETURN;
  docFreqFileName ← UserProfile.Token[key: "Tapestry.DocFreqName", default: docFreqFileName];
  checkProfile ← FALSE;
  };
-- Module start code: pick up the per-user document-frequency file name, hand the data file names to the C routines, then register the matcher.
GetProfileInfo[];
-- Must use unix path name for file because these famousfiles are stored in a vux directory.
newStoplistName ← PFSExtras.PFSNameToUnixName[PFS.PathFromRope[stoplistName]];
newStemmerName ← PFSExtras.PFSNameToUnixName[PFS.PathFromRope[stemmerName]];
-- Each name is converted to a C string with UXStrings.Create before being passed to the C routine.
CCreateStoplist[filename: UXStrings.Create[newStoplistName]];
CCreateStemmer[filename: UXStrings.Create[newStemmerName]];
-- docFreqFileName is passed as is, without the PFS-to-Unix conversion applied to the two names above.
CDocFreqCode[filename: UXStrings.Create[docFreqFileName]];
-- Registration: make Similar available to PatternMatch under the name "sim".
PatternMatch.Register["sim", Similar];