KeyNoteImpl.mesa
Copyright © 1985, 1987 by Xerox Corporation. All rights reserved.
Jack Kent February 23, 1988 10:32:37 am PST
Contents: Implementation of KeyNote
DIRECTORY
Ascii,
Basics,
Convert,
List,
KeyNote,
KeyNoteTokenFreqPerFileTable,
KeyNoteDatabase,
KeyNoteWeightedMatching,
Rope,
SymTab,
FS,
IO;
KeyNoteImpl:
CEDAR
PROGRAM
IMPORTS Basics, Convert, List, KeyNoteTokenFreqPerFileTable, KeyNoteDatabase, KeyNoteWeightedMatching, FS, IO, SymTab
EXPORTS KeyNote =
{
Types
ROPE: TYPE = Rope.ROPE;
after three retries...take your abort like a man
retryLimit: CARDINAL = 3;
Errors
Error: PUBLIC ERROR [ec: KeyNote.ErrorCode, explanation: ROPE ← NIL] = CODE;
BuildStopList: PUBLIC PROCEDURE [fileNamesToMatch: KeyNote.FileNamesToMatch, stopListCutOff: KeyNote.StopListCutOff]
RETURNS [KeyNote.ResultList] = {
-- Enumerates every file matching fileNamesToMatch, counts how many documents each token
-- occurs in (each token counted at most once per document), and returns the tokens of the
-- stopListCutOff most frequent entries -- the candidate stop list.

CheckTokensInDocumentForStopListEligibility: FS.NameProc = {
  -- Called once per matching file: bumps the cross-document frequency of each token that
  -- appears in the file, at most once per file.
  token: ROPE;
  charsSkipped: INT;
  -- get file stream corresponding to fileName
  stream: IO.STREAM ← FS.StreamOpen[fileName: fullFName];
  -- symbol table to remember if word has already occurred within document...
  -- so that we don't double count it!
  wordInDocumentTable: SymTab.Ref ← SymTab.Create[];
  continue ← TRUE;
  DO
    found: BOOLEAN;
    val: REF ANY;
    [token: token, charsSkipped: charsSkipped] ← IO.GetTokenRope[stream: stream ! IO.EndOfStream => EXIT];
    [found: found, val: val] ← SymTab.Fetch[x: wordInDocumentTable, key: NARROW[token]];
    IF ~found
      THEN {
      [] ← SymTab.Store[x: wordInDocumentTable, key: NARROW[token], val: NEW[INT ← 1]];
      KeyNoteTokenFreqPerFileTable.InsertAndBumpFrequncy[table: universeTokenTable, tokenName: token];
      };
    ENDLOOP;
  IO.Close[stream]; -- FIX: stream was never closed in the original (file-handle leak)
  SymTab.Erase[x: wordInDocumentTable];
  };

ConvertHTToList: PROC [table: KeyNoteTokenFreqPerFileTable.Table] RETURNS [List.LORA] = {
  -- Flattens table into a LORA of UserData nodes (decreasing-frequency enumeration,
  -- reversed by the CONSing, so the result is increasing by frequency).
  list2: List.LORA;
  ChttL: KeyNoteTokenFreqPerFileTable.EachNode = {
    -- add token to list
    token: REF ANY ← NARROW[data, KeyNoteTokenFreqPerFileTable.UserData];
    TRUSTED { list2 ← CONS[LOOPHOLE[token, REF ANY], list2] };
    };
  -- FIX: enumerate the table parameter; the original enumerated the captured
  -- universeTokenTable, silently ignoring the argument (harmless for the one caller
  -- below, but a latent bug for any other use).
  KeyNoteTokenFreqPerFileTable.EnumerateDecreasing[table: table, procToApply: ChttL];
  RETURN[list2]
  };

CompareProc: List.CompareProc = {
  -- Orders two UserData nodes by their frequency fields, for List.Sort.
  RETURN [Basics.CompareInt[NARROW[ref1, KeyNoteTokenFreqPerFileTable.UserData].frequency, NARROW[ref2, KeyNoteTokenFreqPerFileTable.UserData].frequency]];
  };

ThrowOutFrequency: PROC [list: List.LORA] RETURNS [List.LORA] = {
  -- Strips the frequencies: maps a list of UserData nodes to a list of their token ropes.
  list2: List.LORA;
  Tof: PROC [item: REF ANY, list: List.LORA] = {
    list2 ← CONS[NARROW[item, KeyNoteTokenFreqPerFileTable.UserData].token, list2];
    };
  List.Map[list: list, proc: Tof];
  RETURN[list2];
  };

-- build universeTokenTable
universeTokenTable: KeyNoteTokenFreqPerFileTable.Table ← KeyNoteTokenFreqPerFileTable.Create[];
FS.EnumerateForNames[pattern: fileNamesToMatch.pattern, proc: CheckTokensInDocumentForStopListEligibility, wDir: fileNamesToMatch.wDir];
-- pick out top "stopListCutOff" words for inclusion in stopList
-- (NthTail with negative n keeps the last stopListCutOff elements, i.e. the most frequent)
RETURN[ThrowOutFrequency[List.NthTail[list: List.Sort[list: ConvertHTToList[table: universeTokenTable], compareProc: CompareProc], n: -stopListCutOff]]];
};
if fileNamesToMatch is not NIL, then builds a database in file databaseName mapping keys to FileNames (and FileNames to keys) using the specified pattern in fileNamesToMatch.
if fileNamesToMatch is NIL, then simply opens the database.
OpenDatabase: PUBLIC PROCEDURE [databaseName: ROPE, fileNamesToMatch: KeyNote.FileNamesToMatch, wordVerifierProc: KeyNote.WordVerifierProc, tokenRelevanceThreshhold: KeyNote.TokenRelevanceThreshhold, clientDataForVerifierProc: REF ANY]
RETURNS [db: KeyNote.Handle] = {
-- Opens (and, when fileNamesToMatch is non-NIL, builds) the keyword database.
-- NIL fileNamesToMatch => open readonly; non-NIL => open read/write and index every
-- matching file via EnumerateFileAndWords.
-- A KeyNoteDatabase.Error closes the database and is re-raised as
-- KeyNote.Error[$DatabaseProblems].
ENABLE {
  UNWIND => NULL;
  KeyNoteDatabase.Error => { GO TO DatabaseProblems};
  };
-- for each document...go thru all words in document and maintain a frequency list...
-- chop off top ones at end
-- start by opening database
-- (idiom fix: direct boolean expression replaces IF ... THEN TRUE ELSE FALSE)
readonly: BOOLEAN ← fileNamesToMatch = NIL;
db ← KeyNoteDatabase.OpenDatabase[databaseName: databaseName, readonly: readonly ! KeyNoteDatabase.Error => GO TO DatabaseProblems];
IF ~readonly THEN EnumerateFileAndWords[db: db, fileNamesToMatch: fileNamesToMatch, wordVerifierProc: wordVerifierProc, clientDataForVerifierProc: clientDataForVerifierProc, tokenRelevanceThreshhold: tokenRelevanceThreshhold];
EXITS
  DatabaseProblems => {
    KeyNoteDatabase.CloseDatabase[db];
    ERROR Error[$DatabaseProblems];
    };
};
CloseDatabase:
PUBLIC
PROCEDURE [db: KeyNote.Handle] = {
-- Closes db; a KeyNoteDatabase.Error raised during the close is reported to the
-- client as KeyNote.Error[$DatabaseProblems]. UNWIND is deliberately ignored.
ENABLE {
UNWIND => NULL;
KeyNoteDatabase.Error => { GO TO DatabaseProblems};
};
KeyNoteDatabase.CloseDatabase[db];
EXITS
DatabaseProblems => {
ERROR Error[$DatabaseProblems];
};
};
document entries are sorted by total weight
FindDocumentsFromWords:
PUBLIC
PROCEDURE [db: KeyNote.Handle, ropeList: KeyNote.ResultList]
RETURNS [resultList: KeyNote.ResultList] = {
-- Delegates to KeyNoteWeightedMatching.WeightedMatch to find the documents matching
-- the given words; per the comment above, entries come back sorted by total weight.
-- A KeyNoteDatabase.Error closes db and is re-raised as KeyNote.Error[$DatabaseProblems].
ENABLE {
UNWIND => NULL;
KeyNoteDatabase.Error => { GO TO DatabaseProblems};
};
resultList ← KeyNoteWeightedMatching.WeightedMatch[db:db, ropeList: ropeList];
EXITS
DatabaseProblems => {
KeyNoteDatabase.CloseDatabase[db];
ERROR Error[$DatabaseProblems];
};
};
callProcForEachValidToken might be adding a token
EnumerateFileAndWords: PUBLIC PROCEDURE [db: KeyNote.Handle, fileNamesToMatch: KeyNote.FileNamesToMatch, wordVerifierProc: KeyNote.WordVerifierProc, clientDataForVerifierProc: REF ANY, tokenRelevanceThreshhold: KeyNote.TokenRelevanceThreshhold] = {
-- For every file matching fileNamesToMatch that is not already recorded in db:
-- tokenizes the file, counts each token accepted by wordVerifierProc, and dumps
-- the resulting frequency table into db (thresholded by tokenRelevanceThreshhold).
-- A KeyNoteDatabase.Error closes db and is re-raised as KeyNote.Error[$DatabaseProblems].
ENABLE {
  UNWIND => NULL;
  KeyNoteDatabase.Error => { GO TO DatabaseProblems};
  };
NameProc: FS.NameProc = {
  IF KeyNoteDatabase.VerifyFileNonExistence[db: db, fileName: fullFName]
    THEN {
    token: ROPE;
    charsSkipped: INT;
    -- get file stream corresponding to fileName
    stream: IO.STREAM ← FS.StreamOpen[fileName: fullFName];
    table: KeyNoteTokenFreqPerFileTable.Table ← KeyNoteTokenFreqPerFileTable.Create[];
    continue ← TRUE;
    DO
      [token: token, charsSkipped: charsSkipped] ← IO.GetTokenRope[stream: stream ! IO.EndOfStream => EXIT];
      IF wordVerifierProc[clientDataForVerifierProc, token] THEN KeyNoteTokenFreqPerFileTable.InsertAndBumpFrequncy[table: table, tokenName: token];
      ENDLOOP;
    IO.Close[stream]; -- FIX: stream was never closed in the original (file-handle leak)
    -- now dump out red black tree to database
    -- NOTE(review): token here is whatever GetTokenRope last delivered before end of
    -- stream; DumpTableToDatabase never reads its token parameter -- confirm it can go.
    DumpTableToDatabase[db: db, table: table, token: token, fileName: fullFName, tokenRelevanceThreshhold: tokenRelevanceThreshhold];
    KeyNoteTokenFreqPerFileTable.DestroyTable[table: table];
    };
  };
FS.EnumerateForNames[pattern: fileNamesToMatch.pattern, proc: NameProc, wDir: fileNamesToMatch.wDir];
EXITS
  DatabaseProblems => {
    KeyNoteDatabase.CloseDatabase[db];
    ERROR Error[$DatabaseProblems];
    };
};
this procedure dumps the contents table out to the database
(which contains entries of the form [tokenName, frequency] )
database that looks like (tokenName, frequency, documentName)
maintain secondary index on tokenName and documentName (since a given token occurs in multiple documents and a given document has multiple tokens)
DumpTableToDatabase:
PROCEDURE [db: KeyNote.Handle, table: KeyNoteTokenFreqPerFileTable.Table, token:
ROPE, fileName:
ROPE, tokenRelevanceThreshhold: KeyNote.TokenRelevanceThreshhold] = {
-- Feeds every entry of table whose frequency reaches tokenRelevanceThreshhold to
-- KeyNoteDatabase.AddTokensToDatabase, walking the table in token order starting just
-- above NUL (lowPoint below).
-- NOTE(review): the token parameter is never read in this body -- confirm it is dead.
GetTokenProc: KeyNoteDatabase.GetTokenProc = {
-- Callback for AddTokensToDatabase: clientData is the last token delivered (a ROPE);
-- returns the next larger token that passes the threshold, or NIL at table end.
-- NOTE(review): the next line looks like a stranded Tioga comment restating the
-- callback's type, not live code -- confirm against the original formatted source.
GetTokenProc: TYPE = PROCEDURE [clientData: REF ANY] RETURNS [tokenProcReturnData: TokenProcReturnData];
DO
data: KeyNoteTokenFreqPerFileTable.UserData ← KeyNoteTokenFreqPerFileTable.LookupNextLarger[self: table, tokenName: NARROW[clientData]];
IF data=
NIL
THEN {
-- table exhausted: tell AddTokensToDatabase there are no more tokens
tokenProcReturnData ← NIL;
EXIT;
};
IF Basics.CompareInt[data.frequency,tokenRelevanceThreshhold]#less
THEN {
-- frequency meets the threshold: hand this token (and its frequency) back;
-- data.token doubles as the resume point for the next callback invocation
TRUSTED { tokenProcReturnData ← NEW[KeyNoteDatabase.TokenProcReturnDataObject ← [newClientData: LOOPHOLE[data.token, REF ANY] , token: data.token, frequency: data.frequency]] };
EXIT;
};
-- below threshold: skip it and keep scanning from this token
TRUSTED { clientData ← LOOPHOLE[data.token, REF ANY] };
};
retryCount: INT ← 0;
-- start the enumeration just above the smallest possible rope (a single NUL char)
lowPoint: REF ANY ← Convert.RopeFromChar[from: Ascii.NUL, quote: FALSE];
KeyNoteDatabase.AddTokensToDatabase[db: db, fileName: fileName, getTokenProc: GetTokenProc, clientData: lowPoint]
};