-- adapted from the Mesa formatter's scanner DIRECTORY Ascii: TYPE USING [NUL, DEL], Format USING [Char, Decimal], ParseDefs USING [Token], PrintingDefs: TYPE USING [outProc, OutCode], ParseInterface: TYPE USING [TableRef, HashIndex, TSymbol, VocabHashEntry, EndMarker, tICON, tNAME, tSTRING, tDIV, tDIVASG], MStream USING [Handle], String USING [AppendLongDecimal, AppendLongNumber], Stream: TYPE USING [EndOfStream, GetChar], Storage: TYPE USING [CopyString, FreeString, AppendChar, AppendString]; Scanner: PROGRAM IMPORTS Format, PrintingDefs, String, Stream, Storage EXPORTS ParseDefs = { OPEN ParseInterface; -- this will allow us to put characters back into the input stream putBackQueueLimit: CARDINAL = 30; putBackStack: ARRAY [0..putBackQueueLimit) OF CHARACTER; putBackTop: CARDINAL; -- pointers to PGS generated hash table for token lookup hashTab: LONG POINTER TO ARRAY HashIndex OF VocabHashEntry; scanTab: LONG POINTER TO ARRAY CHARACTER [40C..177C] OF TSymbol; vocab: LONG STRING; vocabIndex: LONG POINTER TO ARRAY TSymbol OF CARDINAL; -- keeps a history of recent character to print on error -- the history array is a circular buffer historyLimit: CARDINAL = 20; -- number of character of history to keep iHistory: CARDINAL _ 0; -- position in circular history buffer history: ARRAY [0..historyLimit) OF CHARACTER; commentString: PUBLIC LONG STRING; stream: MStream.Handle _ NIL; -- the input stream char: CHARACTER; -- current (most recently scanned) character tEnded: BOOLEAN; -- TRUE if the end of input file has been reached tPosition: CARDINAL; -- character position in the input file buffer: LONG STRING; -- token assembly area -- This should be a PUBLIC interface, scanner parameters -- variables for identifying tokens CharType: TYPE = {idStartAndContinue, idContinueOnly, digit, nonIdChar, stringQuote, charQuote, firstCommentChar, secondCommentChar, underscoreChar, otherChar}; charType: ARRAY CHARACTER OF CharType; PublicScanInit: PROCEDURE [] = { i: CHARACTER; FOR i IN CHARACTER DO charType[i] _ nonIdChar; ENDLOOP; FOR i IN [Ascii.NUL..' ] DO charType[i] _ otherChar; ENDLOOP; charType[Ascii.DEL] _ otherChar; FOR i IN ['a..'z] DO charType[i] _ idStartAndContinue; ENDLOOP; FOR i IN ['A..'Z] DO charType[i] _ idStartAndContinue; ENDLOOP; FOR i IN ['0..'9] DO charType[i] _ digit; ENDLOOP; charType['_] _underscoreChar; charType[''] _ charQuote; charType['"] _ stringQuote; charType['/] _ firstCommentChar; charType['*] _ secondCommentChar; }; NextChar: PROCEDURE = { IF putBackTop > 0 THEN { putBackTop _ putBackTop - 1; char _ putBackStack[putBackTop]; } ELSE IF tEnded THEN char _ Ascii.NUL ELSE { tPosition _ tPosition + 1; char _ Stream.GetChar[stream ! Stream.EndOfStream => { char _ Ascii.NUL; tPosition _ tPosition - 1; tEnded _ TRUE; CONTINUE}]; history[iHistory] _ char; Storage.AppendChar[@commentString, char]; iHistory _ (iHistory + 1) MOD historyLimit;}}; Atom: PUBLIC PROC RETURNS [token: ParseDefs.Token] = { DO WHILE char IN [Ascii.NUL..' ] DO NextChar[]; IF tEnded THEN GO TO EndFile; ENDLOOP; token.location _ tPosition; token.value _ NIL; IF buffer # NIL THEN Storage.FreeString[buffer]; buffer _ Storage.CopyString[s: ""L, longer: 16]; SELECT charType[char] FROM digit => { n: CARDINAL; radix, digit: CARDINAL; IF char # '0 THEN radix _ 10 ELSE { NextChar[]; -- skip '0 on octal and hex constants IF char = 'x OR char = 'X THEN { radix _ 16; NextChar[]; -- skip 'x on hex constants } ELSE radix _ 8; }; n _ 0; DO IF char NOT IN ['0..'9] AND char NOT IN ['a..'f] AND char NOT IN ['A..'F] THEN EXIT; digit _ SELECT char FROM IN ['0..'9] => char-'0, IN ['a..'f] => char-'a, IN ['A..'F] => char-'A ENDCASE => 0; n _ n * radix + digit; NextChar[]; ENDLOOP; -- skip past the "long" constant symbol (if there) IF char = 'l OR char = 'L THEN NextChar[]; token.class _ tICON; token.value _ Storage.CopyString[s: ""L, longer: 16]; IF n = 0 THEN radix _ 10; SELECT radix FROM 10 => String.AppendLongDecimal[s: token.value, n: n]; 8 => { String.AppendLongNumber[s: token.value, n: n, radix: 8]; Storage.AppendChar[@token.value, 'B]; }; 16 => { -- insure that the number begins with a digit [0..9] -- later do this better (check first hex 'digit') -- hex conversion does not seem to work -- try this again later --Storage.AppendChar[@token.value, '0]; --String.AppendLongNumber[s: token.value, n: n, -- radix: 16]; --Storage.AppendChar[@token.value, 'H]; }; String.AppendLongNumber[s: token.value, n: n, radix: 8]; Storage.AppendChar[@token.value, 'B]; }; ENDCASE; GO TO GotNext}; idStartAndContinue => { j: CARDINAL; DO Storage.AppendChar[@buffer, char]; NextChar[]; SELECT charType[char] FROM idStartAndContinue, idContinueOnly, digit => NULL; -- convert underscores to capital Xs underscoreChar => char _ 'X; ENDCASE => EXIT; ENDLOOP; j _ HashLookUp[buffer]; IF j # 0 THEN {token.class _ j; GO TO GotNext;}; token.class _ tNAME; token.value _ Storage.CopyString[s: buffer, longer: 32]; GO TO GotNext}; charQuote => { ch: CHARACTER; NextChar[]; token.class _ tICON; token.value _ Storage.CopyString[s: "ORD['"L, longer: 8]; IF char = '\\ THEN { -- First copy the \ then the char following it Storage.AppendChar[@token.value, char]; NextChar[]; -- Then copy the character type letter ch _ char; NextChar[]; -- In case of octal specified values copy the -- next two characters. -- This checks that octal specified characters -- always use exactly three octal digits. This is -- required in Mesa but not in C. IF ch IN ['0..'7] THEN { ch2, ch3: CHARACTER; NextChar[]; ch2 _ char; IF ch2 IN ['0..'7] THEN { NextChar[]; ch3 _ char; IF ch3 IN ['0..'7] THEN { NextChar[]; Storage.AppendChar[@token.value, ch]; Storage.AppendChar[@token.value, ch2]; ch _ ch3; } ELSE { Storage.AppendChar[@token.value, '0]; Storage.AppendChar[@token.value, ch]; ch _ ch2; } } ELSE { Storage.AppendChar[@token.value, '0]; Storage.AppendChar[@token.value, '0]; }; }; Storage.AppendChar[@token.value, ch]; } ELSE { Storage.AppendChar[@token.value, char]; NextChar[]; }; Storage.AppendString[@token.value, "]"L]; IF char # '' THEN ScanError[char, tPosition]; GO TO GetNext; }; stringQuote => { i: CARDINAL _ 0; Storage.AppendString[@buffer, "StringToArray["""L]; NextChar[]; -- This should check that octal specified characters -- always use exactly three octal digits. This is -- required in Mesa but not in C. DO IF tEnded THEN GO TO EOFEnd; IF charType[char] = stringQuote THEN GO TO QuoteEnd; Storage.AppendChar[@buffer, char]; -- ADD CODE TO HANDLE \d and \dd correctly (add 0 or 00) NextChar[]; REPEAT QuoteEnd => NULL; EOFEnd => {ScanError[string, token.location]; char _ Ascii.NUL}; ENDLOOP; Storage.AppendString[@buffer, " ""]"L]; buffer.text[buffer.length - 3] _ 0C; -- NUL terminate the string token.class _ tSTRING; token.value _ Storage.CopyString[s: buffer, longer: 32]; GO TO GetNext; }; firstCommentChar => { pChar: CHARACTER; pChar _ char; NextChar[]; IF charType[char] # secondCommentChar THEN { IF char = '= THEN { token.class _ tDIVASG; GO TO GetNext; } ELSE { token.class _ tDIV; GO TO GotNext; } }; char _ Ascii.NUL; DO pChar _ char; NextChar[]; IF tEnded THEN GO TO EndFile; IF charType[char] = firstCommentChar AND charType[pChar] = secondCommentChar THEN EXIT; ENDLOOP; NextChar[]; }; ENDCASE => { j: CARDINAL; DO Storage.AppendChar[@buffer, char]; NextChar[]; SELECT charType[char] FROM -- ******** later recognize comments here ******** nonIdChar, firstCommentChar, secondCommentChar => NULL; ENDCASE => EXIT; ENDLOOP; PutBackChar[char]; DO IF buffer.length = 1 THEN EXIT; j _ HashLookUp[buffer]; -- special hack because of PGS glitch IF buffer.length = 2 AND buffer.text[0] = '- AND buffer.text[1] = '- THEN j _ HashLookUp["!!"L]; IF j # 0 THEN {token.class _ j; GO TO GetNext;}; PutBackChar[buffer.text[buffer.length-1]]; buffer.length _ buffer.length - 1; ENDLOOP; token.class _ scanTab[buffer.text[0]]; token.value _ NIL; GO TO GetNext}; REPEAT GetNext => NextChar[]; GotNext => NULL; EndFile => { char _ Ascii.NUL; token.class _ EndMarker; token.location _ tPosition; token.value _ NIL}; ENDLOOP; RETURN}; HashLookUp: PROCEDURE [string: LONG STRING] RETURNS [CARDINAL] = { j: CARDINAL; s1, s2: CARDINAL; h: HashIndex; first: CARDINAL _ ORD[string.text[0]]; last: CARDINAL _ ORD[string.text[string.length - 1]]; h _ ((first*128 - first) + last) MOD LAST[HashIndex] + 1; WHILE (j _ hashTab[h].symbol) # 0 DO IF vocabIndex[j] - (s2 _ vocabIndex[j - 1]) = string.length THEN FOR s1 IN [0..string.length) DO IF string[s1] # vocab[s2] THEN EXIT; s2 _ s2 + 1; REPEAT FINISHED => RETURN [j]; ENDLOOP; IF (h _ hashTab[h].link) = 0 THEN EXIT; ENDLOOP; RETURN[0]; }; PutBackChar: PROCEDURE [c: CHARACTER] = { IF putBackTop >= putBackQueueLimit THEN ScanError[queueLimit, tPosition] ELSE { putBackStack[putBackTop] _ c; putBackTop _ putBackTop + 1; }; }; -- initialization/finalization ScanInit: PUBLIC PROC [inputStream: MStream.Handle, table: ParseInterface.TableRef] = { stream _ inputStream; hashTab _ @table.scanTable.hashTab; scanTab _ @table.scanTable.scanTab; vocab _ LOOPHOLE[@table.scanTable.vocabBody]; vocabIndex _ @table.scanTable.vocabIndex; tEnded _ FALSE; commentString _ Storage.CopyString[s: ""L, longer: 64]; NextChar[]; tPosition _ 0; iHistory _ 0; putBackTop _ 0; PublicScanInit[]; buffer _ NIL; }; ScanFinal: PUBLIC PROC [] = { IF buffer # NIL THEN Storage.FreeString[buffer]; }; -- error handling ErrorContext: PUBLIC PROCEDURE [] = { i: CARDINAL; FOR i IN [iHistory..historyLimit) DO Format.Char[PrintingDefs.outProc^, history[i]]; ENDLOOP; FOR i IN [0..iHistory) DO Format.Char[PrintingDefs.outProc^, history[i]]; ENDLOOP; }; ScanError: PROCEDURE [code: {string, char, debug, queueLimit}, tokenIndex: CARDINAL] = { PrintingDefs.OutCode[ SELECT code FROM string => "string unterminated or too long at ["L, char => "invalid character at ["L, queueLimit => "put back queue overflow at ["L, debug => "debug trace type = ["L, ENDCASE => "scan error at ["L , 0]; Format.Decimal[PrintingDefs.outProc^, tokenIndex]; PrintingDefs.OutCode["] around: "L, 0]; ErrorContext[]; PrintingDefs.OutCode["\n"L, 0]; }; }.