-- adapted from the Mesa formatter's scanner DIRECTORY Ascii: TYPE USING [NUL, DEL], Format USING [Char, Decimal], ParseDefs USING [Token], PrintingDefs: TYPE USING [outProc, OutCode], ParseInterface: TYPE USING [TableRef, HashIndex, TSymbol, VocabHashEntry, EndMarker, tICON, tNAME, tSTRING, tDIV, tDIVASG], MStream USING [Handle], String USING [AppendLongDecimal, AppendLongNumber], Stream: TYPE USING [EndOfStream, GetChar], Storage: TYPE USING [CopyString, FreeString, AppendChar, AppendString]; Scanner: PROGRAM IMPORTS Format, PrintingDefs, String, Stream, Storage EXPORTS ParseDefs = { OPEN ParseInterface; -- this will allow us to put characters back into the input stream putBackQueueLimit: CARDINAL = 30; putBackStack: ARRAY [0..putBackQueueLimit) OF CHARACTER; putBackTop: CARDINAL; -- pointers to PGS generated hash table for token lookup hashTab: LONG POINTER TO ARRAY HashIndex OF VocabHashEntry; scanTab: LONG POINTER TO ARRAY CHARACTER [40C..177C] OF TSymbol; vocab: LONG STRING; vocabIndex: LONG POINTER TO ARRAY TSymbol OF CARDINAL; -- keeps a history of recent character to print on error -- the history array is a circular buffer historyLimit: CARDINAL = 20; -- number of character of history to keep iHistory: CARDINAL ← 0; -- position in circular history buffer history: ARRAY [0..historyLimit) OF CHARACTER; commentString: PUBLIC LONG STRING; stream: MStream.Handle ← NIL; -- the input stream char: CHARACTER; -- current (most recently scanned) character tEnded: BOOLEAN; -- TRUE if the end of input file has been reached tPosition: CARDINAL; -- character position in the input file buffer: LONG STRING; -- token assembly area -- This should be a PUBLIC interface, scanner parameters -- variables for identifying tokens CharType: TYPE = {idStartAndContinue, idContinueOnly, digit, nonIdChar, stringQuote, charQuote, firstCommentChar, secondCommentChar, underscoreChar, otherChar}; charType: ARRAY CHARACTER OF CharType; PublicScanInit: PROCEDURE [] = { i: CHARACTER; FOR i IN CHARACTER DO charType[i] ← nonIdChar; ENDLOOP; FOR i IN [Ascii.NUL..' ] DO charType[i] ← otherChar; ENDLOOP; charType[Ascii.DEL] ← otherChar; FOR i IN ['a..'z] DO charType[i] ← idStartAndContinue; ENDLOOP; FOR i IN ['A..'Z] DO charType[i] ← idStartAndContinue; ENDLOOP; FOR i IN ['0..'9] DO charType[i] ← digit; ENDLOOP; charType['←] ←underscoreChar; charType[''] ← charQuote; charType['"] ← stringQuote; charType['/] ← firstCommentChar; charType['*] ← secondCommentChar; }; NextChar: PROCEDURE = { IF putBackTop > 0 THEN { putBackTop ← putBackTop - 1; char ← putBackStack[putBackTop]; } ELSE IF tEnded THEN char ← Ascii.NUL ELSE { tPosition ← tPosition + 1; char ← Stream.GetChar[stream ! Stream.EndOfStream => { char ← Ascii.NUL; tPosition ← tPosition - 1; tEnded ← TRUE; CONTINUE}]; history[iHistory] ← char; Storage.AppendChar[@commentString, char]; iHistory ← (iHistory + 1) MOD historyLimit;}}; Atom: PUBLIC PROC RETURNS [token: ParseDefs.Token] = { DO WHILE char IN [Ascii.NUL..' ] DO NextChar[]; IF tEnded THEN GO TO EndFile; ENDLOOP; token.location ← tPosition; token.value ← NIL; IF buffer # NIL THEN Storage.FreeString[buffer]; buffer ← Storage.CopyString[s: ""L, longer: 16]; SELECT charType[char] FROM digit => { n: CARDINAL; radix, digit: CARDINAL; IF char # '0 THEN radix ← 10 ELSE { NextChar[]; -- skip '0 on octal and hex constants IF char = 'x OR char = 'X THEN { radix ← 16; NextChar[]; -- skip 'x on hex constants } ELSE radix ← 8; }; n ← 0; DO IF char NOT IN ['0..'9] AND char NOT IN ['a..'f] AND char NOT IN ['A..'F] THEN EXIT; digit ← SELECT char FROM IN ['0..'9] => char-'0, IN ['a..'f] => char-'a, IN ['A..'F] => char-'A ENDCASE => 0; n ← n * radix + digit; NextChar[]; ENDLOOP; -- skip past the "long" constant symbol (if there) IF char = 'l OR char = 'L THEN NextChar[]; token.class ← tICON; token.value ← Storage.CopyString[s: ""L, longer: 16]; IF n = 0 THEN radix ← 10; SELECT radix FROM 10 => String.AppendLongDecimal[s: token.value, n: n]; 8 => { String.AppendLongNumber[s: token.value, n: n, radix: 8]; Storage.AppendChar[@token.value, 'B]; }; 16 => { -- insure that the number begins with a digit [0..9] -- later do this better (check first hex 'digit') -- hex conversion does not seem to work -- try this again later --Storage.AppendChar[@token.value, '0]; --String.AppendLongNumber[s: token.value, n: n, -- radix: 16]; --Storage.AppendChar[@token.value, 'H]; }; String.AppendLongNumber[s: token.value, n: n, radix: 8]; Storage.AppendChar[@token.value, 'B]; }; ENDCASE; GO TO GotNext}; idStartAndContinue => { j: CARDINAL; DO Storage.AppendChar[@buffer, char]; NextChar[]; SELECT charType[char] FROM idStartAndContinue, idContinueOnly, digit => NULL; -- convert underscores to capital Xs underscoreChar => char ← 'X; ENDCASE => EXIT; ENDLOOP; j ← HashLookUp[buffer]; IF j # 0 THEN {token.class ← j; GO TO GotNext;}; token.class ← tNAME; token.value ← Storage.CopyString[s: buffer, longer: 32]; GO TO GotNext}; charQuote => { ch: CHARACTER; NextChar[]; token.class ← tICON; token.value ← Storage.CopyString[s: "ORD['"L, longer: 8]; IF char = '\\ THEN { -- First copy the \ then the char following it Storage.AppendChar[@token.value, char]; NextChar[]; -- Then copy the character type letter ch ← char; NextChar[]; -- In case of octal specified values copy the -- next two characters. -- This checks that octal specified characters -- always use exactly three octal digits. This is -- required in Mesa but not in C. IF ch IN ['0..'7] THEN { ch2, ch3: CHARACTER; NextChar[]; ch2 ← char; IF ch2 IN ['0..'7] THEN { NextChar[]; ch3 ← char; IF ch3 IN ['0..'7] THEN { NextChar[]; Storage.AppendChar[@token.value, ch]; Storage.AppendChar[@token.value, ch2]; ch ← ch3; } ELSE { Storage.AppendChar[@token.value, '0]; Storage.AppendChar[@token.value, ch]; ch ← ch2; } } ELSE { Storage.AppendChar[@token.value, '0]; Storage.AppendChar[@token.value, '0]; }; }; Storage.AppendChar[@token.value, ch]; } ELSE { Storage.AppendChar[@token.value, char]; NextChar[]; }; Storage.AppendString[@token.value, "]"L]; IF char # '' THEN ScanError[char, tPosition]; GO TO GetNext; }; stringQuote => { i: CARDINAL ← 0; Storage.AppendString[@buffer, "StringToArray["""L]; NextChar[]; -- This should check that octal specified characters -- always use exactly three octal digits. This is -- required in Mesa but not in C. DO IF tEnded THEN GO TO EOFEnd; IF charType[char] = stringQuote THEN GO TO QuoteEnd; Storage.AppendChar[@buffer, char]; -- ADD CODE TO HANDLE \d and \dd correctly (add 0 or 00) NextChar[]; REPEAT QuoteEnd => NULL; EOFEnd => {ScanError[string, token.location]; char ← Ascii.NUL}; ENDLOOP; Storage.AppendString[@buffer, " ""]"L]; buffer.text[buffer.length - 3] ← 0C; -- NUL terminate the string token.class ← tSTRING; token.value ← Storage.CopyString[s: buffer, longer: 32]; GO TO GetNext; }; firstCommentChar => { pChar: CHARACTER; pChar ← char; NextChar[]; IF charType[char] # secondCommentChar THEN { IF char = '= THEN { token.class ← tDIVASG; GO TO GetNext; } ELSE { token.class ← tDIV; GO TO GotNext; } }; char ← Ascii.NUL; DO pChar ← char; NextChar[]; IF tEnded THEN GO TO EndFile; IF charType[char] = firstCommentChar AND charType[pChar] = secondCommentChar THEN EXIT; ENDLOOP; NextChar[]; }; ENDCASE => { j: CARDINAL; DO Storage.AppendChar[@buffer, char]; NextChar[]; SELECT charType[char] FROM -- ******** later recognize comments here ******** nonIdChar, firstCommentChar, secondCommentChar => NULL; ENDCASE => EXIT; ENDLOOP; PutBackChar[char]; DO IF buffer.length = 1 THEN EXIT; j ← HashLookUp[buffer]; -- special hack because of PGS glitch IF buffer.length = 2 AND buffer.text[0] = '- AND buffer.text[1] = '- THEN j ← HashLookUp["!!"L]; IF j # 0 THEN {token.class ← j; GO TO GetNext;}; PutBackChar[buffer.text[buffer.length-1]]; buffer.length ← buffer.length - 1; ENDLOOP; token.class ← scanTab[buffer.text[0]]; token.value ← NIL; GO TO GetNext}; REPEAT GetNext => NextChar[]; GotNext => NULL; EndFile => { char ← Ascii.NUL; token.class ← EndMarker; token.location ← tPosition; token.value ← NIL}; ENDLOOP; RETURN}; HashLookUp: PROCEDURE [string: LONG STRING] RETURNS [CARDINAL] = { j: CARDINAL; s1, s2: CARDINAL; h: HashIndex; first: CARDINAL ← ORD[string.text[0]]; last: CARDINAL ← ORD[string.text[string.length - 1]]; h ← ((first*128 - first) + last) MOD LAST[HashIndex] + 1; WHILE (j ← hashTab[h].symbol) # 0 DO IF vocabIndex[j] - (s2 ← vocabIndex[j - 1]) = string.length THEN FOR s1 IN [0..string.length) DO IF string[s1] # vocab[s2] THEN EXIT; s2 ← s2 + 1; REPEAT FINISHED => RETURN [j]; ENDLOOP; IF (h ← hashTab[h].link) = 0 THEN EXIT; ENDLOOP; RETURN[0]; }; PutBackChar: PROCEDURE [c: CHARACTER] = { IF putBackTop >= putBackQueueLimit THEN ScanError[queueLimit, tPosition] ELSE { putBackStack[putBackTop] ← c; putBackTop ← putBackTop + 1; }; }; -- initialization/finalization ScanInit: PUBLIC PROC [inputStream: MStream.Handle, table: ParseInterface.TableRef] = { stream ← inputStream; hashTab ← @table.scanTable.hashTab; scanTab ← @table.scanTable.scanTab; vocab ← LOOPHOLE[@table.scanTable.vocabBody]; vocabIndex ← @table.scanTable.vocabIndex; tEnded ← FALSE; commentString ← Storage.CopyString[s: ""L, longer: 64]; NextChar[]; tPosition ← 0; iHistory ← 0; putBackTop ← 0; PublicScanInit[]; buffer ← NIL; }; ScanFinal: PUBLIC PROC [] = { IF buffer # NIL THEN Storage.FreeString[buffer]; }; -- error handling ErrorContext: PUBLIC PROCEDURE [] = { i: CARDINAL; FOR i IN [iHistory..historyLimit) DO Format.Char[PrintingDefs.outProc↑, history[i]]; ENDLOOP; FOR i IN [0..iHistory) DO Format.Char[PrintingDefs.outProc↑, history[i]]; ENDLOOP; }; ScanError: PROCEDURE [code: {string, char, debug, queueLimit}, tokenIndex: CARDINAL] = { PrintingDefs.OutCode[ SELECT code FROM string => "string unterminated or too long at ["L, char => "invalid character at ["L, queueLimit => "put back queue overflow at ["L, debug => "debug trace type = ["L, ENDCASE => "scan error at ["L , 0]; Format.Decimal[PrintingDefs.outProc↑, tokenIndex]; PrintingDefs.OutCode["] around: "L, 0]; ErrorContext[]; PrintingDefs.OutCode["\n"L, 0]; }; }.