-- KeyNoteImpl.mesa
-- Copyright © 1985, 1987 by Xerox Corporation. All rights reserved.
-- Jack Kent February 23, 1988 10:32:37 am PST
-- Contents: Implementation of KeyNote
DIRECTORY
Ascii,
Basics,
Convert,
List,
KeyNote,
KeyNoteTokenFreqPerFileTable,
KeyNoteDatabase,
KeyNoteWeightedMatching,
Rope,
SymTab,
FS,
IO;
KeyNoteImpl: CEDAR PROGRAM
IMPORTS Basics, Convert, List, KeyNoteTokenFreqPerFileTable, KeyNoteDatabase, KeyNoteWeightedMatching, FS, IO, SymTab
EXPORTS KeyNote =
{
-- Types
ROPE: TYPE = Rope.ROPE;
-- after three retries...take your abort like a man
retryLimit: CARDINAL = 3;
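-- NB: retryCount (declared in DumpTableToDatabase below) is initialized to 0, but
-- as written no retry loop actually consults retryLimit or retryCount.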

-- Errors
Error: PUBLIC ERROR [ec: KeyNote.ErrorCode, explanation: ROPE ← NIL] = CODE;
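-- A minimal usage sketch, not part of this module: a client might catch the error as
--   db ← KeyNote.OpenDatabase[... ! KeyNote.Error => {...inspect ec and explanation...}];
-- $DatabaseProblems is the only ErrorCode this implementation raises, and it leaves
-- explanation defaulted to NIL.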
BuildStopList: PUBLIC PROCEDURE [fileNamesToMatch: KeyNote.FileNamesToMatch, stopListCutOff: KeyNote.StopListCutOff] RETURNS [KeyNote.ResultList] = {
CheckTokensInDocumentForStopListEligibility: FS.NameProc = {
token: ROPE;
charsSkipped: INT;
-- get file stream corresponding to fileName
stream: IO.STREAM ← FS.StreamOpen[fileName: fullFName];
-- symbol table to remember if a word has already occurred within the document...so that we don't double count it!
wordInDocumentTable: SymTab.Ref ← SymTab.Create[];
continue ← TRUE;
DO
found: BOOLEAN;
val: REF ANY;
[token: token, charsSkipped: charsSkipped] ← IO.GetTokenRope[stream: stream! IO.EndOfStream => EXIT];
[found: found, val: val] ← SymTab.Fetch[x: wordInDocumentTable, key: NARROW[token]];
IF ~found THEN {
[] ← SymTab.Store[x: wordInDocumentTable, key: NARROW[token], val: NEW[INT ← 1]];
KeyNoteTokenFreqPerFileTable.InsertAndBumpFrequncy[table: universeTokenTable, tokenName: token];
};
ENDLOOP;
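-- wordInDocumentTable is local to this call, so each file starts with a fresh
-- table; the explicit Erase just releases its entries promptly.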
SymTab.Erase[x: wordInDocumentTable];
};
ConvertHTToList: PROC [table: KeyNoteTokenFreqPerFileTable.Table] RETURNS [List.LORA] = {
list2: List.LORA;
ChttL: KeyNoteTokenFreqPerFileTable.EachNode = {
-- add token to list
token: REF ANY ← NARROW[data, KeyNoteTokenFreqPerFileTable.UserData];
TRUSTED { list2 ← CONS[LOOPHOLE[token, REF ANY], list2] };
};
KeyNoteTokenFreqPerFileTable.EnumerateDecreasing[table: universeTokenTable, procToApply:ChttL];
RETURN[list2]
};
CompareProc: List.CompareProc = {
RETURN [Basics.CompareInt[NARROW[ref1, KeyNoteTokenFreqPerFileTable.UserData].frequency, NARROW[ref2, KeyNoteTokenFreqPerFileTable.UserData].frequency ] ];
};
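-- Basics.CompareInt yields less/equal/greater, so List.Sort below arranges the
-- UserData entries in increasing frequency order, leaving the most frequent
-- tokens at the tail of the sorted list.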
ThrowOutFrequency: PROC [list: List.LORA] RETURNS [List.LORA] = {
list2: List.LORA;
Tof: PROC [ item: REF ANY, list: List.LORA ] = {
list2 ← CONS[NARROW[item, KeyNoteTokenFreqPerFileTable.UserData].token, list2];
};
List.Map[ list:list, proc: Tof];
RETURN[list2];
};
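-- ThrowOutFrequency keeps only the token ropes, discarding the frequency counts;
-- since Tof conses each token onto the front of list2, the result comes back in
-- reverse of the input order, which should be harmless for a stop list.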
-- build universeTokenTable
universeTokenTable: KeyNoteTokenFreqPerFileTable.Table ← KeyNoteTokenFreqPerFileTable.Create[];
FS.EnumerateForNames[pattern: fileNamesToMatch.pattern, proc: CheckTokensInDocumentForStopListEligibility, wDir: fileNamesToMatch.wDir];
-- pick out the top "stopListCutOff" words for inclusion in the stop list
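-- The sorted list is in increasing frequency order, so its last stopListCutOff
-- elements are the most frequent tokens; List.NthTail with a negative n is relied
-- on here to return exactly that final tail.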
RETURN[ThrowOutFrequency[List.NthTail[ list: List.Sort[list: ConvertHTToList[table: universeTokenTable], compareProc: CompareProc], n: -stopListCutOff]]];
};
-- if fileNamesToMatch is not NIL, builds a database in file databaseName mapping keys
-- to FileNames (and FileNames to keys) using the specified pattern in fileNamesToMatch.
-- if fileNamesToMatch is NIL, simply opens the database.
OpenDatabase: PUBLIC PROCEDURE [databaseName: ROPE, fileNamesToMatch: KeyNote.FileNamesToMatch, wordVerifierProc: KeyNote.WordVerifierProc, tokenRelevanceThreshhold: KeyNote.TokenRelevanceThreshhold, clientDataForVerifierProc: REF ANY] RETURNS [ db: KeyNote.Handle] = {
ENABLE {
UNWIND => NULL;
KeyNoteDatabase.Error => { GO TO DatabaseProblems};
};
-- for each document...go thru all words in document and maintain a frequency list...chop off top ones at end
-- start by opening database
readonly: BOOLEAN ← IF fileNamesToMatch = NIL THEN TRUE ELSE FALSE;
db ← KeyNoteDatabase.OpenDatabase[databaseName: databaseName, readonly: readonly ! KeyNoteDatabase.Error => GO TO DatabaseProblems ];
IF ~ readonly THEN EnumerateFileAndWords[db: db, fileNamesToMatch: fileNamesToMatch, wordVerifierProc: wordVerifierProc, clientDataForVerifierProc: clientDataForVerifierProc, tokenRelevanceThreshhold: tokenRelevanceThreshhold];
EXITS
DatabaseProblems => {
KeyNoteDatabase.CloseDatabase[db];
ERROR Error[$DatabaseProblems];
};
};
CloseDatabase: PUBLIC PROCEDURE [db: KeyNote.Handle] = {
ENABLE {
UNWIND => NULL;
KeyNoteDatabase.Error => { GO TO DatabaseProblems};
};
KeyNoteDatabase.CloseDatabase[db];
EXITS
DatabaseProblems => {
ERROR Error[$DatabaseProblems];
};
};
-- document entries are sorted by total weight
FindDocumentsFromWords: PUBLIC PROCEDURE [db: KeyNote.Handle, ropeList: KeyNote.ResultList] RETURNS [resultList: KeyNote.ResultList] = {
ENABLE {
UNWIND => NULL;
KeyNoteDatabase.Error => { GO TO DatabaseProblems};
};
resultList ← KeyNoteWeightedMatching.WeightedMatch[db:db, ropeList: ropeList];
EXITS
DatabaseProblems => {
KeyNoteDatabase.CloseDatabase[db];
ERROR Error[$DatabaseProblems];
};
};
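-- A minimal end-to-end sketch (the database name and query list are hypothetical,
-- not part of this module):
--   db ← KeyNote.OpenDatabase[databaseName: "KeyNote.db", fileNamesToMatch: NIL, ...];
--   results ← KeyNote.FindDocumentsFromWords[db: db, ropeList: queryWords];
--   KeyNote.CloseDatabase[db: db];
-- Passing fileNamesToMatch = NIL opens the database readonly, per OpenDatabase above.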
-- callProcForEachValidToken might be adding a token
EnumerateFileAndWords: PUBLIC PROCEDURE [db: KeyNote.Handle, fileNamesToMatch: KeyNote.FileNamesToMatch, wordVerifierProc: KeyNote.WordVerifierProc, clientDataForVerifierProc: REF ANY, tokenRelevanceThreshhold: KeyNote.TokenRelevanceThreshhold] = {
ENABLE {
UNWIND => NULL;
KeyNoteDatabase.Error => { GO TO DatabaseProblems};
};
NameProc: FS.NameProc = {
IF KeyNoteDatabase.VerifyFileNonExistence[db: db, fileName: fullFName]
THEN {
token: ROPE;
charsSkipped: INT;
-- get file stream corresponding to fileName
stream: IO.STREAM ← FS.StreamOpen[fileName: fullFName];
table: KeyNoteTokenFreqPerFileTable.Table ← KeyNoteTokenFreqPerFileTable.Create[];
continue ← TRUE;
DO
[token: token, charsSkipped: charsSkipped] ← IO.GetTokenRope[stream: stream! IO.EndOfStream => EXIT];
IF wordVerifierProc[clientDataForVerifierProc, token] THEN KeyNoteTokenFreqPerFileTable.InsertAndBumpFrequncy[table: table, tokenName: token];
ENDLOOP;
-- now dump out the red-black tree to the database
DumpTableToDatabase[db: db, table: table, token: token, fileName: fullFName, tokenRelevanceThreshhold: tokenRelevanceThreshhold];
KeyNoteTokenFreqPerFileTable.DestroyTable[table: table];
};
};
FS.EnumerateForNames[pattern: fileNamesToMatch.pattern, proc: NameProc, wDir: fileNamesToMatch.wDir];
EXITS
DatabaseProblems => {
KeyNoteDatabase.CloseDatabase[db];
ERROR Error[$DatabaseProblems];
};
};
-- this procedure dumps the contents of table (which contains entries of the form
-- [tokenName, frequency]) out to a database that holds entries of the form
-- (tokenName, frequency, documentName);
-- maintain secondary indices on tokenName and documentName (since a given token
-- occurs in multiple documents and a given document has multiple tokens)
DumpTableToDatabase: PROCEDURE [db: KeyNote.Handle, table: KeyNoteTokenFreqPerFileTable.Table, token: ROPE, fileName: ROPE, tokenRelevanceThreshhold: KeyNote.TokenRelevanceThreshhold] = {
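-- NB: the token parameter is not referenced below; the enumeration starts from
-- lowPoint and advances via LookupNextLarger instead.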
GetTokenProc: KeyNoteDatabase.GetTokenProc = {
-- GetTokenProc: TYPE = PROCEDURE [clientData: REF ANY] RETURNS [tokenProcReturnData: TokenProcReturnData];
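-- Protocol: KeyNoteDatabase.AddTokensToDatabase calls back here repeatedly; clientData
-- carries the previously returned token name, LookupNextLarger advances past it in the
-- tree, entries whose frequency falls below tokenRelevanceThreshhold are skipped, and
-- a NIL tokenProcReturnData signals that the enumeration is exhausted.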
DO
data: KeyNoteTokenFreqPerFileTable.UserData ← KeyNoteTokenFreqPerFileTable.LookupNextLarger[self: table, tokenName: NARROW[clientData]];
IF data=NIL THEN {
tokenProcReturnData ← NIL;
EXIT;
};
IF Basics.CompareInt[data.frequency, tokenRelevanceThreshhold] # less
THEN {
TRUSTED { tokenProcReturnData ← NEW[KeyNoteDatabase.TokenProcReturnDataObject ← [newClientData: LOOPHOLE[data.token, REF ANY] , token: data.token, frequency: data.frequency]] };
EXIT;
};
TRUSTED { clientData ← LOOPHOLE[data.token, REF ANY] };
ENDLOOP;
};
retryCount: INT ← 0;
lowPoint: REF ANY ← Convert.RopeFromChar[from: Ascii.NUL, quote: FALSE];
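-- lowPoint is a one-character rope holding NUL, which collates before any real token
-- name, so the first LookupNextLarger call should yield the table's smallest token.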
KeyNoteDatabase.AddTokensToDatabase[db: db, fileName: fileName, getTokenProc: GetTokenProc, clientData: lowPoint];
};
}.