-- KeyNoteImpl.mesa --
-- Copyright (C) 1985, 1987 by Xerox Corporation.  All rights reserved. --
-- Jack Kent February 23, 1988 10:32:37 am PST --
-- Contents: Implementation of KeyNote --

DIRECTORY
  Ascii,
  Basics,
  Convert,
  List,
  KeyNote,
  KeyNoteTokenFreqPerFileTable,
  KeyNoteDatabase,
  KeyNoteWeightedMatching,
  Rope,
  SymTab,
  FS,
  IO;

KeyNoteImpl: CEDAR PROGRAM
  IMPORTS Basics, Convert, List, KeyNoteTokenFreqPerFileTable, KeyNoteDatabase, KeyNoteWeightedMatching, FS, IO, SymTab
  EXPORTS KeyNote
  = {

  -- Types --
  ROPE: TYPE = Rope.ROPE;

  -- Original note: after three retries, give up and abort. --
  -- NOTE(review): retryLimit is not referenced anywhere in the visible code; confirm before removing. --
  retryLimit: CARDINAL = 3;

  -- Errors --
  Error: PUBLIC ERROR [ec: KeyNote.ErrorCode, explanation: ROPE _ NIL] = CODE;


  -- BuildStopList: scans every file matching fileNamesToMatch, counts for each token the number --
  -- of files it occurs in, and returns the tokens of the stopListCutOff highest counts. --
  BuildStopList: PUBLIC PROCEDURE [fileNamesToMatch: KeyNote.FileNamesToMatch, stopListCutOff: KeyNote.StopListCutOff] RETURNS [KeyNote.ResultList] = {

    -- Called once per matching file.  Bumps the count in universeTokenTable for each distinct --
    -- token of the file; always asks the enumeration to continue. --
    CheckTokensInDocumentForStopListEligibility: FS.NameProc = {
      token: ROPE;
      charsSkipped: INT;
      -- get file stream corresponding to fileName --
      stream: IO.STREAM _ FS.StreamOpen[fileName: fullFName];
      -- symbol table to remember if word has already occurred within document, --
      -- so that we don't double count it --
      wordInDocumentTable: SymTab.Ref _ SymTab.Create[];
      continue _ TRUE;
      {
        -- release the stream if an error unwinds through this frame --
        ENABLE UNWIND => IO.Close[stream];
        DO
          found: BOOLEAN;
          val: REF ANY;
          [token: token, charsSkipped: charsSkipped] _ IO.GetTokenRope[stream: stream ! IO.EndOfStream => EXIT];
          [found: found, val: val] _ SymTab.Fetch[x: wordInDocumentTable, key: NARROW[token]];
          IF ~found THEN {
            -- first occurrence in this file: remember it and count the file once for this token --
            [] _ SymTab.Store[x: wordInDocumentTable, key: NARROW[token], val: NEW[INT _ 1]];
            KeyNoteTokenFreqPerFileTable.InsertAndBumpFrequncy[table: universeTokenTable, tokenName: token];
            };
          ENDLOOP;
        };
      -- the original never closed the stream; close it once the whole file has been read --
      IO.Close[stream];
      SymTab.Erase[x: wordInDocumentTable];
      };

    -- Collects every entry of table into a list of REF ANY. --
    ConvertHTToList: PROC [table: KeyNoteTokenFreqPerFileTable.Table] RETURNS [List.LORA] = {
      list2: List.LORA;
      ChttL: KeyNoteTokenFreqPerFileTable.EachNode = {
        token: REF ANY _ NARROW[data, KeyNoteTokenFreqPerFileTable.UserData];
        -- add token to list --
        list2 _ CONS[token, list2];
        };
      -- enumerate the table passed in (the original ignored the parameter and read the enclosing variable) --
      KeyNoteTokenFreqPerFileTable.EnumerateDecreasing[table: table, procToApply: ChttL];
      RETURN[list2]
      };

    -- Orders two table entries by their frequency field. --
    CompareProc: List.CompareProc = {
      RETURN [Basics.CompareInt[
        NARROW[ref1, KeyNoteTokenFreqPerFileTable.UserData].frequency,
        NARROW[ref2, KeyNoteTokenFreqPerFileTable.UserData].frequency]];
      };

    -- Maps a list of table entries to a list of their token fields.  Each token is pushed on --
    -- the front of the result, so the result is in the reverse order of the input. --
    ThrowOutFrequency: PROC [list: List.LORA] RETURNS [List.LORA] = {
      list2: List.LORA;
      Tof: PROC [item: REF ANY, list: List.LORA] = {
        list2 _ CONS[NARROW[item, KeyNoteTokenFreqPerFileTable.UserData].token, list2];
        };
      List.Map[list: list, proc: Tof];
      RETURN[list2];
      };

    universeTokenTable: KeyNoteTokenFreqPerFileTable.Table _ KeyNoteTokenFreqPerFileTable.Create[];

    -- build universeTokenTable --
    FS.EnumerateForNames[pattern: fileNamesToMatch.pattern, proc: CheckTokensInDocumentForStopListEligibility, wDir: fileNamesToMatch.wDir];

    -- pick out top "stopListCutOff" words for inclusion in stopList --
    -- NOTE(review): relies on List.NthTail treating a negative n as counting from the end of the --
    -- sorted list; confirm against the List interface, including lists shorter than the cutoff. --
    RETURN[ThrowOutFrequency[List.NthTail[
      list: List.Sort[list: ConvertHTToList[table: universeTokenTable], compareProc: CompareProc],
      n: -stopListCutOff]]];
    };


  -- OpenDatabase: if fileNamesToMatch is not NIL, builds a database in file databaseName on keys --
  -- to FileNames (and FileNames to keys) using the specified pattern in fileNamesToMatch. --
  -- If fileNamesToMatch is NIL, simply opens the database read only. --
  -- Raises Error[$DatabaseProblems] when KeyNoteDatabase.Error is raised underneath. --
  OpenDatabase: PUBLIC PROCEDURE [databaseName: ROPE, fileNamesToMatch: KeyNote.FileNamesToMatch, wordVerifierProc: KeyNote.WordVerifierProc, tokenRelevanceThreshhold: KeyNote.TokenRelevanceThreshhold, clientDataForVerifierProc: REF ANY] RETURNS [db: KeyNote.Handle] = {
    ENABLE {
      UNWIND => NULL;
      KeyNoteDatabase.Error => { GO TO DatabaseProblems };
      };
    readonly: BOOLEAN _ fileNamesToMatch = NIL;
    -- start by opening database --
    db _ KeyNoteDatabase.OpenDatabase[databaseName: databaseName, readonly: readonly ! KeyNoteDatabase.Error => GO TO DatabaseProblems];
    -- for each document, go thru all words in document and maintain a frequency list --
    IF ~readonly THEN
      EnumerateFileAndWords[db: db, fileNamesToMatch: fileNamesToMatch, wordVerifierProc: wordVerifierProc, clientDataForVerifierProc: clientDataForVerifierProc, tokenRelevanceThreshhold: tokenRelevanceThreshhold];
    EXITS
      DatabaseProblems => {
        -- NOTE(review): when the open itself failed, db has not been assigned; confirm that --
        -- KeyNoteDatabase.CloseDatabase tolerates that handle. --
        KeyNoteDatabase.CloseDatabase[db];
        ERROR Error[$DatabaseProblems];
        };
    };


  -- CloseDatabase: closes db; KeyNoteDatabase.Error is reported as Error[$DatabaseProblems]. --
  CloseDatabase: PUBLIC PROCEDURE [db: KeyNote.Handle] = {
    ENABLE {
      UNWIND => NULL;
      KeyNoteDatabase.Error => { GO TO DatabaseProblems };
      };
    KeyNoteDatabase.CloseDatabase[db];
    EXITS
      DatabaseProblems => {
        ERROR Error[$DatabaseProblems];
        };
    };


  -- FindDocumentsFromWords: returns the result of KeyNoteWeightedMatching.WeightedMatch for --
  -- ropeList.  Original note: document entries are sorted by total weight. --
  -- On KeyNoteDatabase.Error the database is closed and Error[$DatabaseProblems] is raised. --
  FindDocumentsFromWords: PUBLIC PROCEDURE [db: KeyNote.Handle, ropeList: KeyNote.ResultList] RETURNS [resultList: KeyNote.ResultList] = {
    ENABLE {
      UNWIND => NULL;
      KeyNoteDatabase.Error => { GO TO DatabaseProblems };
      };
    resultList _ KeyNoteWeightedMatching.WeightedMatch[db: db, ropeList: ropeList];
    EXITS
      DatabaseProblems => {
        KeyNoteDatabase.CloseDatabase[db];
        ERROR Error[$DatabaseProblems];
        };
    };


  -- EnumerateFileAndWords: for every file matching fileNamesToMatch for which --
  -- KeyNoteDatabase.VerifyFileNonExistence returns TRUE, counts the tokens accepted by --
  -- wordVerifierProc and dumps the counts to the database. --
  -- On KeyNoteDatabase.Error the database is closed and Error[$DatabaseProblems] is raised. --
  EnumerateFileAndWords: PUBLIC PROCEDURE [db: KeyNote.Handle, fileNamesToMatch: KeyNote.FileNamesToMatch, wordVerifierProc: KeyNote.WordVerifierProc, clientDataForVerifierProc: REF ANY, tokenRelevanceThreshhold: KeyNote.TokenRelevanceThreshhold] = {
    ENABLE {
      UNWIND => NULL;
      KeyNoteDatabase.Error => { GO TO DatabaseProblems };
      };

    NameProc: FS.NameProc = {
      -- Always keep enumerating.  The original assigned the result only inside the IF below, --
      -- leaving it unassigned for files that are skipped. --
      continue _ TRUE;
      IF KeyNoteDatabase.VerifyFileNonExistence[db: db, fileName: fullFName] THEN {
        token: ROPE;
        charsSkipped: INT;
        -- get file stream corresponding to fileName --
        stream: IO.STREAM _ FS.StreamOpen[fileName: fullFName];
        table: KeyNoteTokenFreqPerFileTable.Table _ KeyNoteTokenFreqPerFileTable.Create[];
        {
          -- release the stream if an error unwinds through this frame --
          ENABLE UNWIND => IO.Close[stream];
          DO
            [token: token, charsSkipped: charsSkipped] _ IO.GetTokenRope[stream: stream ! IO.EndOfStream => EXIT];
            IF wordVerifierProc[clientDataForVerifierProc, token] THEN
              KeyNoteTokenFreqPerFileTable.InsertAndBumpFrequncy[table: table, tokenName: token];
            ENDLOOP;
          };
        -- the original never closed the stream; it is not needed past this point --
        IO.Close[stream];
        -- now dump out the table (a red black tree, per the original note) to database --
        DumpTableToDatabase[db: db, table: table, token: token, fileName: fullFName, tokenRelevanceThreshhold: tokenRelevanceThreshhold];
        KeyNoteTokenFreqPerFileTable.DestroyTable[table: table];
        };
      };

    FS.EnumerateForNames[pattern: fileNamesToMatch.pattern, proc: NameProc, wDir: fileNamesToMatch.wDir];
    EXITS
      DatabaseProblems => {
        KeyNoteDatabase.CloseDatabase[db];
        ERROR Error[$DatabaseProblems];
        };
    };


  -- DumpTableToDatabase: dumps the contents of table (entries of the form [tokenName, frequency]) --
  -- out to the database for fileName, passing on only entries whose frequency is not less than --
  -- tokenRelevanceThreshhold. --
  -- Original note: the database looks like (tokenName, frequency, documentName), with a secondary --
  -- index on tokenName and documentName, since a given token occurs in multiple documents and a --
  -- given document has multiple tokens. --
  -- The token parameter is not used by this procedure. --
  DumpTableToDatabase: PROCEDURE [db: KeyNote.Handle, table: KeyNoteTokenFreqPerFileTable.Table, token: ROPE, fileName: ROPE, tokenRelevanceThreshhold: KeyNote.TokenRelevanceThreshhold] = {

    -- Original note: GetTokenProc: TYPE = PROCEDURE [clientData: REF ANY] RETURNS [tokenProcReturnData: TokenProcReturnData] --
    -- Looks up the entry after the token held in clientData and keeps advancing past entries --
    -- below the threshold.  Returns NIL when LookupNextLarger finds nothing more. --
    GetTokenProc: KeyNoteDatabase.GetTokenProc = {
      DO
        data: KeyNoteTokenFreqPerFileTable.UserData _ KeyNoteTokenFreqPerFileTable.LookupNextLarger[self: table, tokenName: NARROW[clientData]];
        IF data = NIL THEN {
          tokenProcReturnData _ NIL;
          EXIT;
          };
        IF Basics.CompareInt[data.frequency, tokenRelevanceThreshhold] # less THEN {
          -- frequent enough: hand it back, with its own token as the next starting point --
          TRUSTED {
            tokenProcReturnData _ NEW[KeyNoteDatabase.TokenProcReturnDataObject _ [newClientData: LOOPHOLE[data.token, REF ANY], token: data.token, frequency: data.frequency]]
            };
          EXIT;
          };
        -- below the threshold: skip this entry and look up the one after it --
        TRUSTED { clientData _ LOOPHOLE[data.token, REF ANY] };
        ENDLOOP;
      };

    -- starting key for the first lookup: a rope holding the single character Ascii.NUL --
    lowPoint: REF ANY _ Convert.RopeFromChar[from: Ascii.NUL, quote: FALSE];
    KeyNoteDatabase.AddTokensToDatabase[db: db, fileName: fileName, getTokenProc: GetTokenProc, clientData: lowPoint]
    };

  }.
΄KeyNoteImpl.mesa Copyright Σ 1985, 1987 by Xerox Corporation. All rights reserved. Jack Kent February 23, 1988 10:32:37 am PST Contents: Implementation of KeyNote Types after three retries...take your abort like a man Errors get file stream corresponding to fileName symbol table to remember if word has already occurred within document...so that we don't double count it! 
add token to list build universeTokenTable pick out top "stopListCutOff" words for inclusion in stopList if fileNamesToMatch is not nil, then builds a database in file databaseName on keys to FileNames (and FileNames to keys) using specicied pattern in fileNamesToMatch. if fileNamesToMatch is NIL then simply opens database. for each document...go thru all words in document and maintain a frequency list...chop off top ones at end start by opening database document entries are sorted by total weight callProcForEachValidToken might be adding a token get file stream corresponding to fileName now dump out red black tree to database this procedure dumps the contents table out to the database (which contains entries of the form [tokenName, frequency] ) database that looks like (tokenName, frequency, documentName) maintain secondary index on tokenName and documentName (since a given token occurs in multiple documents and a given document has nultiple tokens) GetTokenProc: TYPE = PROCEDURE [clientData: REF ANY] RETURNS [tokenProcReturnData: TokenProcReturnData]; Κ „˜šœ™JšœB™BIcodešœ+™+—J™JšΟnœ™#J˜šΟk ˜ Jšœ˜J˜J˜J˜J˜Jšœ˜J˜Jšœ˜J˜J˜Jšžœ˜Jšžœ˜J˜—š œžœž˜Jšžœ`žœžœ˜uJšžœ ˜J˜—Jšœ˜J˜šœ™Jšžœžœžœ˜J˜J™0Jšœ žœ˜J˜—J˜J˜JšΟb™J™Icode0š œžœžœ&žœžœžœ˜LJ™L˜š œžœž œVžœ˜•š+œžœ ˜™>J–:[list: LIST OF REF ANY, compareProc: List.CompareProc]šžœ”˜šJ˜J˜—Jšœ₯™₯Jšœžœ™6š œžœž œžœ²žœžœžœ˜Žšžœ˜Kšžœžœ˜Kšœžœžœ˜3Kšœ˜—J™jJ™Jšœ žœžœžœžœžœžœžœ˜BJšœlžœžœ˜…Jšžœ žœΡ˜γšž˜šœ˜Jšœ"˜"Kšžœ˜K˜——J˜J˜J˜—š œžœž œ˜9šžœ˜Kšžœžœ˜Kšœžœžœ˜3Kšœ˜—Jšœ"˜"šž˜šœ˜Kšžœ˜K˜——J˜J˜—J˜J˜J™+šœžœž œ5žœ&˜Ššžœ˜Kšžœžœ˜Kšœžœžœ˜3Kšœ˜—JšœN˜Nšž˜šœ˜Jšœ"˜"Kšžœ˜K˜——J˜J˜—Jšœ3™3š œžœž œ‰žœžœB˜ωšžœ˜Kšžœžœ˜Kšœžœžœ˜3Kšœ˜—šœžœ ˜JšžœE˜Gšž˜Jšœžœ˜ Jšœžœ˜Jšœ*™*J–Κ[fileName: ROPE, accessOptions: FS.AccessOptions _ read, streamOptions: FS.StreamOptions _ (3)[TRUE, TRUE, TRUE], keep: CARDINAL _ 1B (1), createByteCount: FS.ByteCount _ 2560, streamBufferParms: FS.StreamBufferParms _ [vmPagesPerBuffer: 8, nBuffers: 2], extendFileProc: FS.ExtendFileProc, 
wantedCreatedTime: GMT _ nullGMT, remoteCheck: BOOL _ TRUE, wDir: ROPE _ NIL, checkFileType: BOOL _ FALSE, fileType: FS.FileType _ [0B (0)]]šœžœžœžœ!˜7JšœR˜RJšœ žœ˜Jšž˜J–-[stream: STREAM, breakProc: IO.BreakProc]šœ-žœžœžœ˜eJšžœ5žœU˜Jšžœ˜J™'Jšœ‚˜‚Jšœ8˜8J˜—J˜—J–:[pattern: ROPE, proc: FS.NameProc, wDir: ROPE _ NIL]šžœc˜ešž˜šœ˜Kšœ"˜"Kšžœ˜K˜——J˜J˜—Jšœ;™;J™J™“šœž œHžœ žœA˜»š œ"˜.Jš œžœž œžœžœžœ,™hšž˜Jšœtžœ˜ˆšžœžœžœ˜Jšœžœ˜Jšžœ˜Jšœ˜—šžœ@˜BJšžœ˜Jš žœžœ=žœ žœžœ7˜³Jšžœ˜Jšœ˜—Jšžœžœ žœžœ˜8—šžœ˜J˜—J˜—Jšœ žœ˜J–$[from: CHAR, quote: BOOL _ TRUE]š œ žœžœ%žœ žœ˜IJšœq˜qJ˜J˜J˜J˜J˜J˜J˜—˜™J˜—J˜——…—Δ*ό