-- adapted from the Mesa formatter's scanner
DIRECTORY
Ascii: TYPE USING [NUL, DEL],
Format USING [Char, Decimal],
ParseDefs USING [Token],
PrintingDefs: TYPE USING [outProc, OutCode],
ParseInterface: TYPE USING [TableRef, HashIndex, TSymbol, VocabHashEntry,
EndMarker, tICON, tNAME, tSTRING, tDIV, tDIVASG],
MStream USING [Handle],
String USING [AppendLongDecimal, AppendLongNumber],
Stream: TYPE USING [EndOfStream, GetChar],
Storage: TYPE USING [CopyString, FreeString, AppendChar, AppendString];
Scanner: PROGRAM IMPORTS Format, PrintingDefs, String, Stream, Storage EXPORTS ParseDefs = {
OPEN ParseInterface;
-- this will allow us to put characters back into the input stream
putBackQueueLimit: CARDINAL = 30;
putBackStack: ARRAY [0..putBackQueueLimit) OF CHARACTER;
putBackTop: CARDINAL;
-- pointers to PGS generated hash table for token lookup
hashTab: LONG POINTER TO ARRAY HashIndex OF VocabHashEntry;
scanTab: LONG POINTER TO ARRAY CHARACTER [40C..177C] OF TSymbol;
vocab: LONG STRING;
vocabIndex: LONG POINTER TO ARRAY TSymbol OF CARDINAL;
-- keeps a history of recent character to print on error
-- the history array is a circular buffer
historyLimit: CARDINAL = 20; -- number of character of history to keep
iHistory: CARDINAL ← 0; -- position in circular history buffer
history: ARRAY [0..historyLimit) OF CHARACTER;
commentString: PUBLIC LONG STRING;
stream: MStream.Handle ← NIL; -- the input stream
char: CHARACTER; -- current (most recently scanned) character
tEnded: BOOLEAN; -- TRUE if the end of input file has been reached
tPosition: CARDINAL; -- character position in the input file
buffer: LONG STRING; -- token assembly area
-- This should be a PUBLIC interface, scanner parameters
-- variables for identifying tokens
CharType: TYPE = {idStartAndContinue, idContinueOnly, digit,
nonIdChar, stringQuote, charQuote, firstCommentChar,
secondCommentChar, underscoreChar, otherChar};
charType: ARRAY CHARACTER OF CharType;
PublicScanInit: PROCEDURE [] = {
i: CHARACTER;
FOR i IN CHARACTER DO charType[i] ← nonIdChar; ENDLOOP;
FOR i IN [Ascii.NUL..' ] DO charType[i] ← otherChar; ENDLOOP;
charType[Ascii.DEL] ← otherChar;
FOR i IN ['a..'z] DO charType[i] ← idStartAndContinue; ENDLOOP;
FOR i IN ['A..'Z] DO charType[i] ← idStartAndContinue; ENDLOOP;
FOR i IN ['0..'9] DO charType[i] ← digit; ENDLOOP;
charType['←] ←underscoreChar;
charType[''] ← charQuote;
charType['"] ← stringQuote;
charType['/] ← firstCommentChar;
charType['*] ← secondCommentChar;
};
NextChar: PROCEDURE = {
IF putBackTop > 0 THEN {
putBackTop ← putBackTop - 1;
char ← putBackStack[putBackTop]; }
ELSE IF tEnded THEN char ← Ascii.NUL
ELSE {
tPosition ← tPosition + 1;
char ← Stream.GetChar[stream !
Stream.EndOfStream => {
char ← Ascii.NUL;
tPosition ← tPosition - 1;
tEnded ← TRUE;
CONTINUE}];
history[iHistory] ← char;
Storage.AppendChar[@commentString, char];
iHistory ← (iHistory + 1) MOD historyLimit;}};
Atom: PUBLIC PROC RETURNS [token: ParseDefs.Token] = {
DO
WHILE char IN [Ascii.NUL..' ] DO
NextChar[];
IF tEnded THEN GO TO EndFile;
ENDLOOP;
token.location ← tPosition;
token.value ← NIL;
IF buffer # NIL THEN Storage.FreeString[buffer];
buffer ← Storage.CopyString[s: ""L, longer: 16];
SELECT charType[char] FROM
digit => {
n: CARDINAL;
radix, digit: CARDINAL;
IF char # '0 THEN
radix ← 10
ELSE
{
NextChar[]; -- skip '0 on octal and hex constants
IF char = 'x OR char = 'X THEN
{
radix ← 16;
NextChar[]; -- skip 'x on hex constants
}
ELSE
radix ← 8;
};
n ← 0;
DO
IF char NOT IN ['0..'9]
AND char NOT IN ['a..'f]
AND char NOT IN ['A..'F]
THEN EXIT;
digit ← SELECT char FROM
IN ['0..'9] => char-'0,
IN ['a..'f] => char-'a,
IN ['A..'F] => char-'A
ENDCASE => 0;
n ← n * radix + digit;
NextChar[];
ENDLOOP;
-- skip past the "long" constant symbol (if there)
IF char = 'l OR char = 'L THEN NextChar[];
token.class ← tICON;
token.value ← Storage.CopyString[s: ""L, longer: 16];
IF n = 0 THEN radix ← 10;
SELECT radix FROM
10 => String.AppendLongDecimal[s: token.value, n: n];
8 => {
String.AppendLongNumber[s: token.value, n: n,
radix: 8];
Storage.AppendChar[@token.value, 'B]; };
16 => {
-- insure that the number begins with a digit [0..9]
-- later do this better (check first hex 'digit')
-- hex conversion does not seem to work
-- try this again later
--Storage.AppendChar[@token.value, '0];
--String.AppendLongNumber[s: token.value, n: n,
-- radix: 16];
--Storage.AppendChar[@token.value, 'H]; };
String.AppendLongNumber[s: token.value, n: n,
radix: 8];
Storage.AppendChar[@token.value, 'B]; };
ENDCASE;
GO TO GotNext};
idStartAndContinue => {
j: CARDINAL;
DO
Storage.AppendChar[@buffer, char];
NextChar[];
SELECT charType[char] FROM
idStartAndContinue, idContinueOnly, digit => NULL;
-- convert underscores to capital Xs
underscoreChar => char ← 'X;
ENDCASE => EXIT;
ENDLOOP;
j ← HashLookUp[buffer];
IF j # 0 THEN {token.class ← j; GO TO GotNext;};
token.class ← tNAME;
token.value ← Storage.CopyString[s: buffer, longer: 32];
GO TO GotNext};
charQuote => {
ch: CHARACTER;
NextChar[];
token.class ← tICON;
token.value ← Storage.CopyString[s: "ORD['"L, longer: 8];
IF char = '\\ THEN {
-- First copy the \ then the char following it
Storage.AppendChar[@token.value, char];
NextChar[];
-- Then copy the character type letter
ch ← char;
NextChar[];
-- In case of octal specified values copy the
-- next two characters.
-- This checks that octal specified characters
-- always use exactly three octal digits. This is
-- required in Mesa but not in C.
IF ch IN ['0..'7] THEN {
ch2, ch3: CHARACTER;
NextChar[]; ch2 ← char;
IF ch2 IN ['0..'7] THEN {
NextChar[]; ch3 ← char;
IF ch3 IN ['0..'7] THEN {
NextChar[];
Storage.AppendChar[@token.value, ch];
Storage.AppendChar[@token.value, ch2];
ch ← ch3; }
ELSE {
Storage.AppendChar[@token.value, '0];
Storage.AppendChar[@token.value, ch];
ch ← ch2; } }
ELSE {
Storage.AppendChar[@token.value, '0];
Storage.AppendChar[@token.value, '0]; };
};
Storage.AppendChar[@token.value, ch]; }
ELSE {
Storage.AppendChar[@token.value, char];
NextChar[]; };
Storage.AppendString[@token.value, "]"L];
IF char # '' THEN ScanError[char, tPosition];
GO TO GetNext;
};
stringQuote => {
i: CARDINAL ← 0;
Storage.AppendString[@buffer, "StringToArray["""L];
NextChar[];
-- This should check that octal specified characters
-- always use exactly three octal digits. This is
-- required in Mesa but not in C.
DO
IF tEnded THEN GO TO EOFEnd;
IF charType[char] = stringQuote THEN GO TO QuoteEnd;
Storage.AppendChar[@buffer, char];
-- ADD CODE TO HANDLE \d and \dd correctly (add 0 or 00)
NextChar[];
REPEAT
QuoteEnd => NULL;
EOFEnd => {ScanError[string, token.location]; char ← Ascii.NUL};
ENDLOOP;
Storage.AppendString[@buffer, " ""]"L];
buffer.text[buffer.length - 3] ← 0C; -- NUL terminate the string
token.class ← tSTRING;
token.value ← Storage.CopyString[s: buffer, longer: 32];
GO TO GetNext;
};
firstCommentChar => {
pChar: CHARACTER;
pChar ← char;
NextChar[];
IF charType[char] # secondCommentChar THEN {
IF char = '= THEN {
token.class ← tDIVASG;
GO TO GetNext; }
ELSE {
token.class ← tDIV;
GO TO GotNext; } };
char ← Ascii.NUL;
DO
pChar ← char;
NextChar[];
IF tEnded THEN GO TO EndFile;
IF charType[char] = firstCommentChar
AND charType[pChar] = secondCommentChar
THEN EXIT;
ENDLOOP;
NextChar[];
};
ENDCASE => {
j: CARDINAL;
DO
Storage.AppendChar[@buffer, char];
NextChar[];
SELECT charType[char] FROM
-- ******** later recognize comments here ********
nonIdChar, firstCommentChar, secondCommentChar => NULL;
ENDCASE => EXIT;
ENDLOOP;
PutBackChar[char];
DO
IF buffer.length = 1 THEN EXIT;
j ← HashLookUp[buffer];
-- special hack because of PGS glitch
IF buffer.length = 2 AND buffer.text[0] = '-
AND buffer.text[1] = '- THEN j ← HashLookUp["!!"L];
IF j # 0 THEN {token.class ← j; GO TO GetNext;};
PutBackChar[buffer.text[buffer.length-1]];
buffer.length ← buffer.length - 1;
ENDLOOP;
token.class ← scanTab[buffer.text[0]];
token.value ← NIL;
GO TO GetNext};
REPEAT
GetNext => NextChar[];
GotNext => NULL;
EndFile => {
char ← Ascii.NUL;
token.class ← EndMarker;
token.location ← tPosition;
token.value ← NIL};
ENDLOOP;
RETURN};
HashLookUp: PROCEDURE [string: LONG STRING] RETURNS [CARDINAL] = {
j: CARDINAL;
s1, s2: CARDINAL;
h: HashIndex;
first: CARDINAL ← ORD[string.text[0]];
last: CARDINAL ← ORD[string.text[string.length - 1]];
h ← ((first*128 - first) + last) MOD LAST[HashIndex] + 1;
WHILE (j ← hashTab[h].symbol) # 0 DO
IF vocabIndex[j] - (s2 ← vocabIndex[j - 1]) = string.length THEN
FOR s1 IN [0..string.length) DO
IF string[s1] # vocab[s2] THEN EXIT;
s2 ← s2 + 1;
REPEAT
FINISHED => RETURN [j];
ENDLOOP;
IF (h ← hashTab[h].link) = 0 THEN EXIT;
ENDLOOP;
RETURN[0];
};
PutBackChar: PROCEDURE [c: CHARACTER] = {
IF putBackTop >= putBackQueueLimit THEN
ScanError[queueLimit, tPosition]
ELSE {
putBackStack[putBackTop] ← c;
putBackTop ← putBackTop + 1; };
};
-- initialization/finalization
ScanInit: PUBLIC PROC [inputStream: MStream.Handle, table: ParseInterface.TableRef] = {
stream ← inputStream;
hashTab ← @table.scanTable.hashTab;
scanTab ← @table.scanTable.scanTab;
vocab ← LOOPHOLE[@table.scanTable.vocabBody];
vocabIndex ← @table.scanTable.vocabIndex;
tEnded ← FALSE;
commentString ← Storage.CopyString[s: ""L, longer: 64];
NextChar[];
tPosition ← 0;
iHistory ← 0;
putBackTop ← 0;
PublicScanInit[];
buffer ← NIL;
};
ScanFinal: PUBLIC PROC [] = {
IF buffer # NIL THEN Storage.FreeString[buffer];
};
-- error handling
ErrorContext: PUBLIC PROCEDURE [] = {
i: CARDINAL;
FOR i IN [iHistory..historyLimit) DO
Format.Char[PrintingDefs.outProc↑, history[i]]; ENDLOOP;
FOR i IN [0..iHistory) DO
Format.Char[PrintingDefs.outProc↑, history[i]]; ENDLOOP;
};
ScanError: PROCEDURE [code: {string, char, debug, queueLimit}, tokenIndex: CARDINAL] = {
PrintingDefs.OutCode[
SELECT code FROM
string => "string unterminated or too long at ["L,
char => "invalid character at ["L,
queueLimit => "put back queue overflow at ["L,
debug => "debug trace type = ["L,
ENDCASE => "scan error at ["L
, 0];
Format.Decimal[PrintingDefs.outProc↑, tokenIndex];
PrintingDefs.OutCode["] around: "L, 0];
ErrorContext[];
PrintingDefs.OutCode["\n"L, 0];
};
}.