KeyNoteImpl.mesa
Copyright © 1985, 1987 by Xerox Corporation. All rights reserved.
Jack Kent February 23, 1988 10:32:37 am PST
Contents: Implementation of KeyNote
DIRECTORY
Ascii,
Basics,
Convert,
List,
KeyNote,
KeyNoteTokenFreqPerFileTable,
KeyNoteDatabase,
KeyNoteWeightedMatching,
Rope,
SymTab,
FS,
IO;
KeyNoteImpl:
CEDAR
PROGRAM
IMPORTS Basics, Convert, List, KeyNoteTokenFreqPerFileTable, KeyNoteDatabase, KeyNoteWeightedMatching, FS, IO, SymTab
EXPORTS KeyNote =
{
Types
ROPE: TYPE = Rope.ROPE;
after three retries...take your abort like a man
retryLimit: CARDINAL = 3;
Errors
Error: PUBLIC ERROR [ec: KeyNote.ErrorCode, explanation: ROPE ← NIL] = CODE;
BuildStopList: PUBLIC PROCEDURE [fileNamesToMatch: KeyNote.FileNamesToMatch, stopListCutOff: KeyNote.StopListCutOff]
RETURNS [KeyNote.ResultList] = {
-- Enumerates every file matching fileNamesToMatch, counts how many documents each token
-- occurs in (each token counted at most once per document), and returns the tokens of the
-- stopListCutOff most frequent entries -- the candidate stop list.

CheckTokensInDocumentForStopListEligibility: FS.NameProc = {
  -- Called once per matching file: bumps the cross-document frequency of each token that
  -- appears in the file, at most once per file.
  token: ROPE;
  charsSkipped: INT;
  -- get file stream corresponding to fileName
  stream: IO.STREAM ← FS.StreamOpen[fileName: fullFName];
  -- symbol table to remember if word has already occurred within document...
  -- so that we don't double count it!
  wordInDocumentTable: SymTab.Ref ← SymTab.Create[];
  continue ← TRUE;
  DO
    found: BOOLEAN;
    val: REF ANY;
    [token: token, charsSkipped: charsSkipped] ← IO.GetTokenRope[stream: stream ! IO.EndOfStream => EXIT];
    [found: found, val: val] ← SymTab.Fetch[x: wordInDocumentTable, key: NARROW[token]];
    IF ~found
      THEN {
      [] ← SymTab.Store[x: wordInDocumentTable, key: NARROW[token], val: NEW[INT ← 1]];
      KeyNoteTokenFreqPerFileTable.InsertAndBumpFrequncy[table: universeTokenTable, tokenName: token];
      };
    ENDLOOP;
  IO.Close[stream]; -- FIX: stream was never closed in the original (file-handle leak)
  SymTab.Erase[x: wordInDocumentTable];
  };

ConvertHTToList: PROC [table: KeyNoteTokenFreqPerFileTable.Table] RETURNS [List.LORA] = {
  -- Flattens table into a LORA of UserData nodes (decreasing-frequency enumeration,
  -- reversed by the CONSing, so the result is increasing by frequency).
  list2: List.LORA;
  ChttL: KeyNoteTokenFreqPerFileTable.EachNode = {
    -- add token to list
    token: REF ANY ← NARROW[data, KeyNoteTokenFreqPerFileTable.UserData];
    TRUSTED { list2 ← CONS[LOOPHOLE[token, REF ANY], list2] };
    };
  -- FIX: enumerate the table parameter; the original enumerated the captured
  -- universeTokenTable, silently ignoring the argument (harmless for the one caller
  -- below, but a latent bug for any other use).
  KeyNoteTokenFreqPerFileTable.EnumerateDecreasing[table: table, procToApply: ChttL];
  RETURN[list2]
  };

CompareProc: List.CompareProc = {
  -- Orders two UserData nodes by their frequency fields, for List.Sort.
  RETURN [Basics.CompareInt[NARROW[ref1, KeyNoteTokenFreqPerFileTable.UserData].frequency, NARROW[ref2, KeyNoteTokenFreqPerFileTable.UserData].frequency]];
  };

ThrowOutFrequency: PROC [list: List.LORA] RETURNS [List.LORA] = {
  -- Strips the frequencies: maps a list of UserData nodes to a list of their token ropes.
  list2: List.LORA;
  Tof: PROC [item: REF ANY, list: List.LORA] = {
    list2 ← CONS[NARROW[item, KeyNoteTokenFreqPerFileTable.UserData].token, list2];
    };
  List.Map[list: list, proc: Tof];
  RETURN[list2];
  };

-- build universeTokenTable
universeTokenTable: KeyNoteTokenFreqPerFileTable.Table ← KeyNoteTokenFreqPerFileTable.Create[];
FS.EnumerateForNames[pattern: fileNamesToMatch.pattern, proc: CheckTokensInDocumentForStopListEligibility, wDir: fileNamesToMatch.wDir];
-- pick out top "stopListCutOff" words for inclusion in stopList
-- (NthTail with negative n keeps the last stopListCutOff elements, i.e. the most frequent)
RETURN[ThrowOutFrequency[List.NthTail[list: List.Sort[list: ConvertHTToList[table: universeTokenTable], compareProc: CompareProc], n: -stopListCutOff]]];
};
if fileNamesToMatch is not NIL, then builds a database in file databaseName mapping keys to FileNames (and FileNames to keys) using the specified pattern in fileNamesToMatch.
if fileNamesToMatch is NIL, then simply opens the database.
OpenDatabase: PUBLIC PROCEDURE [databaseName: ROPE, fileNamesToMatch: KeyNote.FileNamesToMatch, wordVerifierProc: KeyNote.WordVerifierProc, tokenRelevanceThreshhold: KeyNote.TokenRelevanceThreshhold, clientDataForVerifierProc: REF ANY]
RETURNS [db: KeyNote.Handle] = {
-- Opens (and, when fileNamesToMatch is non-NIL, builds) the keyword database.
-- NIL fileNamesToMatch => open readonly; non-NIL => open read/write and index every
-- matching file via EnumerateFileAndWords.
-- A KeyNoteDatabase.Error closes the database and is re-raised as
-- KeyNote.Error[$DatabaseProblems].
ENABLE {
  UNWIND => NULL;
  KeyNoteDatabase.Error => { GO TO DatabaseProblems};
  };
-- for each document...go thru all words in document and maintain a frequency list...
-- chop off top ones at end
-- start by opening database
-- (idiom fix: direct boolean expression replaces IF ... THEN TRUE ELSE FALSE)
readonly: BOOLEAN ← fileNamesToMatch = NIL;
db ← KeyNoteDatabase.OpenDatabase[databaseName: databaseName, readonly: readonly ! KeyNoteDatabase.Error => GO TO DatabaseProblems];
IF ~readonly THEN EnumerateFileAndWords[db: db, fileNamesToMatch: fileNamesToMatch, wordVerifierProc: wordVerifierProc, clientDataForVerifierProc: clientDataForVerifierProc, tokenRelevanceThreshhold: tokenRelevanceThreshhold];
EXITS
  DatabaseProblems => {
    KeyNoteDatabase.CloseDatabase[db];
    ERROR Error[$DatabaseProblems];
    };
};
CloseDatabase:
PUBLIC
PROCEDURE [db: KeyNote.Handle] = {
-- Closes db; a KeyNoteDatabase.Error raised during the close is reported to the
-- client as KeyNote.Error[$DatabaseProblems]. UNWIND is deliberately ignored.
ENABLE {
UNWIND => NULL;
KeyNoteDatabase.Error => { GO TO DatabaseProblems};
};
KeyNoteDatabase.CloseDatabase[db];
EXITS
DatabaseProblems => {
ERROR Error[$DatabaseProblems];
};
};
document entries are sorted by total weight
FindDocumentsFromWords:
PUBLIC
PROCEDURE [db: KeyNote.Handle, ropeList: KeyNote.ResultList]
RETURNS [resultList: KeyNote.ResultList] = {
-- Delegates to KeyNoteWeightedMatching.WeightedMatch to find the documents matching
-- the given words; per the comment above, entries come back sorted by total weight.
-- A KeyNoteDatabase.Error closes db and is re-raised as KeyNote.Error[$DatabaseProblems].
ENABLE {
UNWIND => NULL;
KeyNoteDatabase.Error => { GO TO DatabaseProblems};
};
resultList ← KeyNoteWeightedMatching.WeightedMatch[db:db, ropeList: ropeList];
EXITS
DatabaseProblems => {
KeyNoteDatabase.CloseDatabase[db];
ERROR Error[$DatabaseProblems];
};
};
callProcForEachValidToken might be adding a token
EnumerateFileAndWords: PUBLIC PROCEDURE [db: KeyNote.Handle, fileNamesToMatch: KeyNote.FileNamesToMatch, wordVerifierProc: KeyNote.WordVerifierProc, clientDataForVerifierProc: REF ANY, tokenRelevanceThreshhold: KeyNote.TokenRelevanceThreshhold] = {
-- For every file matching fileNamesToMatch that is not already recorded in db:
-- tokenizes the file, counts each token accepted by wordVerifierProc, and dumps
-- the resulting frequency table into db (thresholded by tokenRelevanceThreshhold).
-- A KeyNoteDatabase.Error closes db and is re-raised as KeyNote.Error[$DatabaseProblems].
ENABLE {
  UNWIND => NULL;
  KeyNoteDatabase.Error => { GO TO DatabaseProblems};
  };
NameProc: FS.NameProc = {
  IF KeyNoteDatabase.VerifyFileNonExistence[db: db, fileName: fullFName]
    THEN {
    token: ROPE;
    charsSkipped: INT;
    -- get file stream corresponding to fileName
    stream: IO.STREAM ← FS.StreamOpen[fileName: fullFName];
    table: KeyNoteTokenFreqPerFileTable.Table ← KeyNoteTokenFreqPerFileTable.Create[];
    continue ← TRUE;
    DO
      [token: token, charsSkipped: charsSkipped] ← IO.GetTokenRope[stream: stream ! IO.EndOfStream => EXIT];
      IF wordVerifierProc[clientDataForVerifierProc, token] THEN KeyNoteTokenFreqPerFileTable.InsertAndBumpFrequncy[table: table, tokenName: token];
      ENDLOOP;
    IO.Close[stream]; -- FIX: stream was never closed in the original (file-handle leak)
    -- now dump out red black tree to database
    -- NOTE(review): token here is whatever GetTokenRope last delivered before end of
    -- stream; DumpTableToDatabase never reads its token parameter -- confirm it can go.
    DumpTableToDatabase[db: db, table: table, token: token, fileName: fullFName, tokenRelevanceThreshhold: tokenRelevanceThreshhold];
    KeyNoteTokenFreqPerFileTable.DestroyTable[table: table];
    };
  };
FS.EnumerateForNames[pattern: fileNamesToMatch.pattern, proc: NameProc, wDir: fileNamesToMatch.wDir];
EXITS
  DatabaseProblems => {
    KeyNoteDatabase.CloseDatabase[db];
    ERROR Error[$DatabaseProblems];
    };
};
this procedure dumps the contents table out to the database
(which contains entries of the form [tokenName, frequency] )
database that looks like (tokenName, frequency, documentName)
maintain secondary index on tokenName and documentName (since a given token occurs in multiple documents and a given document has multiple tokens)
DumpTableToDatabase:
PROCEDURE [db: KeyNote.Handle, table: KeyNoteTokenFreqPerFileTable.Table, token:
ROPE, fileName:
ROPE, tokenRelevanceThreshhold: KeyNote.TokenRelevanceThreshhold] = {
-- Feeds every entry of table whose frequency reaches tokenRelevanceThreshhold to
-- KeyNoteDatabase.AddTokensToDatabase, walking the table in token order starting just
-- above NUL (lowPoint below).
-- NOTE(review): the token parameter is never read in this body -- confirm it is dead.
GetTokenProc: KeyNoteDatabase.GetTokenProc = {
-- Callback for AddTokensToDatabase: clientData is the last token delivered (a ROPE);
-- returns the next larger token that passes the threshold, or NIL at table end.
-- NOTE(review): the next line looks like a stranded Tioga comment restating the
-- callback's type, not live code -- confirm against the original formatted source.
GetTokenProc: TYPE = PROCEDURE [clientData: REF ANY] RETURNS [tokenProcReturnData: TokenProcReturnData];
DO
data: KeyNoteTokenFreqPerFileTable.UserData ← KeyNoteTokenFreqPerFileTable.LookupNextLarger[self: table, tokenName: NARROW[clientData]];
IF data=
NIL
THEN {
-- table exhausted: tell AddTokensToDatabase there are no more tokens
tokenProcReturnData ← NIL;
EXIT;
};
IF Basics.CompareInt[data.frequency,tokenRelevanceThreshhold]#less
THEN {
-- frequency meets the threshold: hand this token (and its frequency) back;
-- data.token doubles as the resume point for the next callback invocation
TRUSTED { tokenProcReturnData ← NEW[KeyNoteDatabase.TokenProcReturnDataObject ← [newClientData: LOOPHOLE[data.token, REF ANY] , token: data.token, frequency: data.frequency]] };
EXIT;
};
-- below threshold: skip it and keep scanning from this token
TRUSTED { clientData ← LOOPHOLE[data.token, REF ANY] };
};
retryCount: INT ← 0;
-- start the enumeration just above the smallest possible rope (a single NUL char)
lowPoint: REF ANY ← Convert.RopeFromChar[from: Ascii.NUL, quote: FALSE];
KeyNoteDatabase.AddTokensToDatabase[db: db, fileName: fileName, getTokenProc: GetTokenProc, clientData: lowPoint]
};