-- KeyNoteImpl: implementation of the KeyNote interface.
-- Builds stop lists from a set of files, opens and closes a KeyNote database,
-- fills the database with per file token frequencies, and answers weighted
-- keyword queries by delegating to KeyNoteWeightedMatching.

DIRECTORY
  Ascii, Basics, Convert, List, KeyNote, KeyNoteTokenFreqPerFileTable, KeyNoteDatabase, KeyNoteWeightedMatching, Rope, SymTab, FS, IO;

KeyNoteImpl: CEDAR PROGRAM
  IMPORTS Basics, Convert, List, KeyNoteTokenFreqPerFileTable, KeyNoteDatabase, KeyNoteWeightedMatching, FS, IO, SymTab
  EXPORTS KeyNote = {

  ROPE: TYPE = Rope.ROPE;

  -- NOTE(review): retryLimit is not referenced anywhere in this module; confirm before removing.
  retryLimit: CARDINAL = 3;

  -- The single error raised to clients; ec classifies the failure.
  Error: PUBLIC ERROR [ec: KeyNote.ErrorCode, explanation: ROPE _ NIL] = CODE;

  -- BuildStopList
  -- Tokenizes every file matching fileNamesToMatch, counts for each token the
  -- number of files in which it occurs at least once, sorts the tokens by that
  -- count and returns the token names selected by List.NthTail with
  -- n = -stopListCutOff.
  -- NOTE(review): presumably a negative n selects the last stopListCutOff
  -- entries, i.e. the most widespread tokens; confirm against the List interface.
  BuildStopList: PUBLIC PROCEDURE [fileNamesToMatch: KeyNote.FileNamesToMatch, stopListCutOff: KeyNote.StopListCutOff] RETURNS [KeyNote.ResultList] = {

    -- Called once per matching file. Bumps the count in universeTokenTable
    -- for each distinct token of the file, at most once per file.
    CheckTokensInDocumentForStopListEligibility: FS.NameProc = {
      token: ROPE;
      charsSkipped: INT;
      stream: IO.STREAM _ FS.StreamOpen[fileName: fullFName];
      -- Tokens already seen in this file, so that each is counted only once.
      wordInDocumentTable: SymTab.Ref _ SymTab.Create[];
      continue _ TRUE;
      DO
        [token: token, charsSkipped: charsSkipped] _ IO.GetTokenRope[stream: stream ! IO.EndOfStream => EXIT];
        IF ~SymTab.Fetch[x: wordInDocumentTable, key: token].found THEN {
          [] _ SymTab.Store[x: wordInDocumentTable, key: token, val: NEW[INT _ 1]];
          KeyNoteTokenFreqPerFileTable.InsertAndBumpFrequncy[table: universeTokenTable, tokenName: token];
          };
        ENDLOOP;
      -- Release the file; previously the stream was left open for every file enumerated.
      IO.Close[stream];
      SymTab.Erase[x: wordInDocumentTable];
      };

    -- Collects every entry of table into a list. Entries are visited with
    -- EnumerateDecreasing and consed onto the front of the result.
    ConvertHTToList: PROC [table: KeyNoteTokenFreqPerFileTable.Table] RETURNS [List.LORA] = {
      list2: List.LORA;
      ChttL: KeyNoteTokenFreqPerFileTable.EachNode = {
        entry: REF ANY _ NARROW[data, KeyNoteTokenFreqPerFileTable.UserData];
        list2 _ CONS[entry, list2];
        };
      -- Enumerate the table that was passed in rather than the enclosing variable.
      KeyNoteTokenFreqPerFileTable.EnumerateDecreasing[table: table, procToApply: ChttL];
      RETURN[list2]
      };

    -- Orders two table entries by their frequency field.
    CompareProc: List.CompareProc = {
      RETURN [Basics.CompareInt[
        NARROW[ref1, KeyNoteTokenFreqPerFileTable.UserData].frequency,
        NARROW[ref2, KeyNoteTokenFreqPerFileTable.UserData].frequency]];
      };

    -- Maps a list of table entries to a list of their token names.
    -- Each name is consed onto the front, so the result is in reverse order.
    ThrowOutFrequency: PROC [list: List.LORA] RETURNS [List.LORA] = {
      list2: List.LORA;
      Tof: PROC [item: REF ANY, list: List.LORA] = {
        list2 _ CONS[NARROW[item, KeyNoteTokenFreqPerFileTable.UserData].token, list2];
        };
      List.Map[list: list, proc: Tof];
      RETURN[list2];
      };

    -- One entry per distinct token over all files; frequency is the number of files containing it.
    universeTokenTable: KeyNoteTokenFreqPerFileTable.Table _ KeyNoteTokenFreqPerFileTable.Create[];

    FS.EnumerateForNames[pattern: fileNamesToMatch.pattern, proc: CheckTokensInDocumentForStopListEligibility, wDir: fileNamesToMatch.wDir];

    RETURN[ThrowOutFrequency[
      List.NthTail[
        list: List.Sort[list: ConvertHTToList[table: universeTokenTable], compareProc: CompareProc],
        n: -stopListCutOff]]];
    };

  -- OpenDatabase
  -- Opens the named database. When fileNamesToMatch is NIL the database is
  -- opened read only; otherwise it is opened for update and filled from the
  -- matching files via EnumerateFileAndWords.
  -- Any KeyNoteDatabase.Error is turned into Error[$DatabaseProblems].
  OpenDatabase: PUBLIC PROCEDURE [databaseName: ROPE, fileNamesToMatch: KeyNote.FileNamesToMatch, wordVerifierProc: KeyNote.WordVerifierProc, tokenRelevanceThreshhold: KeyNote.TokenRelevanceThreshhold, clientDataForVerifierProc: REF ANY] RETURNS [db: KeyNote.Handle] = {
    ENABLE {
      UNWIND => NULL;
      KeyNoteDatabase.Error => { GO TO DatabaseProblems };
      };
    readonly: BOOLEAN _ (fileNamesToMatch = NIL);
    db _ KeyNoteDatabase.OpenDatabase[databaseName: databaseName, readonly: readonly ! KeyNoteDatabase.Error => GO TO DatabaseProblems];
    IF ~readonly THEN
      EnumerateFileAndWords[db: db, fileNamesToMatch: fileNamesToMatch, wordVerifierProc: wordVerifierProc, clientDataForVerifierProc: clientDataForVerifierProc, tokenRelevanceThreshhold: tokenRelevanceThreshhold];
    EXITS
      DatabaseProblems => {
        -- NOTE(review): db may still be unset here if the open itself failed; confirm that CloseDatabase tolerates that.
        KeyNoteDatabase.CloseDatabase[db];
        ERROR Error[$DatabaseProblems];
        };
    };

  -- CloseDatabase
  -- Closes db; a KeyNoteDatabase.Error is reported as Error[$DatabaseProblems].
  CloseDatabase: PUBLIC PROCEDURE [db: KeyNote.Handle] = {
    ENABLE {
      UNWIND => NULL;
      KeyNoteDatabase.Error => { GO TO DatabaseProblems };
      };
    KeyNoteDatabase.CloseDatabase[db];
    EXITS
      DatabaseProblems => {
        ERROR Error[$DatabaseProblems];
        };
    };

  -- FindDocumentsFromWords
  -- Returns the result of KeyNoteWeightedMatching.WeightedMatch for ropeList.
  -- On a KeyNoteDatabase.Error the database is closed and Error[$DatabaseProblems] is raised.
  FindDocumentsFromWords: PUBLIC PROCEDURE [db: KeyNote.Handle, ropeList: KeyNote.ResultList] RETURNS [resultList: KeyNote.ResultList] = {
    ENABLE {
      UNWIND => NULL;
      KeyNoteDatabase.Error => { GO TO DatabaseProblems };
      };
    resultList _ KeyNoteWeightedMatching.WeightedMatch[db: db, ropeList: ropeList];
    EXITS
      DatabaseProblems => {
        KeyNoteDatabase.CloseDatabase[db];
        ERROR Error[$DatabaseProblems];
        };
    };

  -- EnumerateFileAndWords
  -- For every file matching fileNamesToMatch for which
  -- KeyNoteDatabase.VerifyFileNonExistence returns TRUE, tokenizes the file,
  -- counts the tokens accepted by wordVerifierProc and writes the counts to db.
  -- On a KeyNoteDatabase.Error the database is closed and Error[$DatabaseProblems] is raised.
  EnumerateFileAndWords: PUBLIC PROCEDURE [db: KeyNote.Handle, fileNamesToMatch: KeyNote.FileNamesToMatch, wordVerifierProc: KeyNote.WordVerifierProc, clientDataForVerifierProc: REF ANY, tokenRelevanceThreshhold: KeyNote.TokenRelevanceThreshhold] = {
    ENABLE {
      UNWIND => NULL;
      KeyNoteDatabase.Error => { GO TO DatabaseProblems };
      };

    -- Called once per matching file.
    NameProc: FS.NameProc = {
      IF KeyNoteDatabase.VerifyFileNonExistence[db: db, fileName: fullFName] THEN {
        token: ROPE;
        charsSkipped: INT;
        stream: IO.STREAM _ FS.StreamOpen[fileName: fullFName];
        -- Frequency table for this one file.
        table: KeyNoteTokenFreqPerFileTable.Table _ KeyNoteTokenFreqPerFileTable.Create[];
        continue _ TRUE;
        DO
          [token: token, charsSkipped: charsSkipped] _ IO.GetTokenRope[stream: stream ! IO.EndOfStream => EXIT];
          IF wordVerifierProc[clientDataForVerifierProc, token] THEN
            KeyNoteTokenFreqPerFileTable.InsertAndBumpFrequncy[table: table, tokenName: token];
          ENDLOOP;
        -- Release the file before touching the database; previously the stream was never closed.
        IO.Close[stream];
        DumpTableToDatabase[db: db, table: table, token: token, fileName: fullFName, tokenRelevanceThreshhold: tokenRelevanceThreshhold];
        KeyNoteTokenFreqPerFileTable.DestroyTable[table: table];
        };
      };

    FS.EnumerateForNames[pattern: fileNamesToMatch.pattern, proc: NameProc, wDir: fileNamesToMatch.wDir];
    EXITS
      DatabaseProblems => {
        KeyNoteDatabase.CloseDatabase[db];
        ERROR Error[$DatabaseProblems];
        };
    };

  -- DumpTableToDatabase
  -- Takes the per file table (which contains entries of the form
  -- [tokenName, frequency]) and adds its entries to a database that looks
  -- like (tokenName, frequency, documentName). Entries whose frequency
  -- compares less than tokenRelevanceThreshhold are skipped.
  -- The token parameter is not used by this procedure; it is kept so that
  -- the existing signature and call site stay valid.
  DumpTableToDatabase: PROCEDURE [db: KeyNote.Handle, table: KeyNoteTokenFreqPerFileTable.Table, token: ROPE, fileName: ROPE, tokenRelevanceThreshhold: KeyNote.TokenRelevanceThreshhold] = {

    -- Hands the database the next table entry after clientData whose
    -- frequency is not below the threshold, or NIL when the table is exhausted.
    -- NOTE(review): the TRUSTED LOOPHOLE conversions are kept exactly as written; a plain widening to REF ANY may suffice.
    GetTokenProc: KeyNoteDatabase.GetTokenProc = {
      DO
        data: KeyNoteTokenFreqPerFileTable.UserData _ KeyNoteTokenFreqPerFileTable.LookupNextLarger[self: table, tokenName: NARROW[clientData]];
        IF data = NIL THEN {
          tokenProcReturnData _ NIL;
          EXIT;
          };
        IF Basics.CompareInt[data.frequency, tokenRelevanceThreshhold] # less THEN {
          TRUSTED {
            tokenProcReturnData _ NEW[KeyNoteDatabase.TokenProcReturnDataObject _ [newClientData: LOOPHOLE[data.token, REF ANY], token: data.token, frequency: data.frequency]]
            };
          EXIT;
          };
        -- Below the threshold: step past this entry and keep looking.
        TRUSTED { clientData _ LOOPHOLE[data.token, REF ANY] };
        ENDLOOP;
      };

    -- Starting key for the first LookupNextLarger call: a rope holding the single character NUL.
    lowPoint: REF ANY _ Convert.RopeFromChar[from: Ascii.NUL, quote: FALSE];
    KeyNoteDatabase.AddTokensToDatabase[db: db, fileName: fileName, getTokenProc: GetTokenProc, clientData: lowPoint]
    };

  }.