ProtoScanner.mesa - derived from Compiler>Scanner.mesa
Copyright © 1985 by Xerox Corporation. All rights reserved.
Satterthwaite, February 4, 1986 2:23:17 pm PST
Maxwell, August 11, 1983 2:22 pm
Paul Rovner, September 22, 1983 9:49 pm
Russ Atkinson (RRA) March 7, 1985 0:57:59 am PST
DIRECTORY
Ascii: TYPE USING [BS, ControlZ, CR, FF, LF, NUL, TAB],
Basics: TYPE USING [charsPerWord, RawBytes],
ConvertUnsafe: TYPE USING [SubString],
IO: TYPE USING [card, EndOf, GetIndex, GetChar, PutChar, PutF, rope, SetIndex, STREAM, UnsafeGetBlock],
HashOps: TYPE USING [EnterString],
P1: TYPE USING [Token, Value, nullValue],
ParseTable: TYPE USING [HashIndex, HashTableRef, IndexTableRef, ScanTableRef, TableRef, VocabularyRef, endMarker, tokenID, tokenSTR],
RefText: TYPE USING [Append],
Rope: TYPE USING [ROPE],
VM: TYPE USING [wordsPerPage];
Scanner:
PROGRAM
IMPORTS HashOps, IO, RefText
EXPORTS P1 = { OPEN ParseTable;
table installation
tablePtr: ParseTable.TableRef;
hashTab: HashTableRef;
scanTab: ScanTableRef;
vocab: VocabularyRef;
vocabIndex: IndexTableRef;
InstallScanTable:
PUBLIC
PROC[base: ParseTable.TableRef] = {
tablePtr ← base;
hashTab ← @tablePtr[tablePtr.scanTable.hashTab];
scanTab ← @tablePtr[tablePtr.scanTable.scanTab];
vocab ← LOOPHOLE[@tablePtr[tablePtr.scanTable.vocabBody]];
vocabIndex ← @tablePtr[tablePtr.scanTable.vocabIndex]};
scanner state
stream: IO.STREAM ← NIL; -- the input stream
streamOrigin: StreamIndex;
Logger: PROC[PROC [log: IO.STREAM]] ← NIL;
textPages: NAT ~ 6;
textWords: NAT ~ textPages*VM.wordsPerPage;
textChars: NAT ~ textWords*Basics.charsPerWord;
TextBuffer: TYPE ~ PACKED ARRAY [0..textChars) OF CHAR;
tB: REF TextBuffer;
tI, tMax: [0..textChars];
tOrigin, tLimit: CARDINAL;
tEnded: BOOL;
FillBuffer:
PROC ~ {
tOrigin ← tLimit;
IF tEnded THEN tMax ← 0
ELSE {
tMax ← stream.UnsafeGetBlock
[[
LOOPHOLE[tB,
LONG
POINTER
TO Basics.RawBytes],
0,
textChars
]].nBytesRead;
IF tMax < textChars THEN tEnded ← TRUE;
tLimit ← tOrigin + tMax};
IF tMax = 0 THEN {tB[0] ← Ascii.NUL; tMax ← 1};
tI ← 0};
buffer: REF TEXT ← NIL; -- token assembly area
iMax: CARDINAL; -- iMax = buffer.maxLength
desc: ConvertUnsafe.SubString; -- initial buffer segment
nTokens: NAT; -- token count
nErrors: NAT; -- lexical errors
BufferOverflow: ERROR ~ CODE;
ExpandBuffer:
PROC ~ {
oldBuffer: REF TEXT ← buffer;
IF oldBuffer.length > 2000 THEN ERROR BufferOverflow;
buffer ← NEW[TEXT[2*oldBuffer.length]];
desc.base ← LOOPHOLE[buffer, LONG STRING];
buffer ← RefText.Append[buffer, oldBuffer];
iMax ← buffer.length ← buffer.maxLength;
oldBuffer ← NIL};
char: CHAR; -- current (most recently scanned) character
qDot: BOOL; -- used to resolved decimal point vs. interval
NextChar:
PROC ~ {
-- also expanded inline within Atom
IF (tI←tI+1) = tMax THEN FillBuffer[]; char ← tB[tI]};
NextToken:
PUBLIC
PROC
RETURNS[token: P1.Token] ~ {
OPEN token;
DO
WHILE char
IN [Ascii.
NUL..' ]
DO
SELECT char
FROM
Ascii.
NUL => {
-- ^@^@ is Tioga escape seq
IF (tI←tI+1) = tMax THEN {IF tEnded THEN GO TO EndFile; FillBuffer[]};
char ← tB[tI];
IF char = Ascii.NUL THEN GO TO EndFile};
Ascii.ControlZ =>
-- ^Z is Bravo escape char
UNTIL char = Ascii.
CR
DO
IF (tI←tI+1) = tMax THEN {IF tEnded THEN GO TO EndFile; FillBuffer[]};
char ← tB[tI];
ENDLOOP;
ENDCASE => {
IF (tI←tI+1) = tMax THEN {IF tEnded THEN GO TO EndFile; FillBuffer[]};
char ← tB[tI]};
ENDLOOP;
index ← tOrigin + tI; value ← P1.nullValue;
SELECT char FROM
'a, 'b, 'c, 'd, 'e, 'f, 'g, 'h, 'i, 'j, 'k, 'l, 'm,
'n, 'o, 'p, 'q, 'r, 's, 't, 'u, 'v, 'w, 'x, 'y, 'z => {
i: CARDINAL ← 0;
DO
buffer[i] ← char;
IF (tI←tI+1) = tMax THEN FillBuffer[];
char ← tB[tI];
SELECT char
FROM
IN ['a..'z],
IN ['A..'Z],
IN ['0..'9] =>
IF (i ← i+1) >= iMax THEN ExpandBuffer[];
ENDCASE => EXIT;
ENDLOOP;
desc.length ← i+1;
class ← tokenID; value.r ← HashOps.EnterString[desc];
GO TO GotNext};
'A, 'B, 'C, 'D, 'E, 'F, 'G, 'H, 'I, 'J, 'K, 'L, 'M,
'N, 'O, 'P, 'Q, 'R, 'S, 'T, 'U, 'V, 'W, 'X, 'Y, 'Z => {
i: CARDINAL ← 0;
uId: BOOL ← TRUE;
first, last: NAT ← char.ORD;
DO
buffer[i] ← char;
IF (tI←tI+1) = tMax THEN FillBuffer[];
char ← tB[tI];
SELECT char
FROM
IN ['A..'Z] => {
last ← char.ORD; IF (i ← i+1) >= iMax THEN ExpandBuffer[]};
IN ['a..'z],
IN ['0..'9] => {
uId ← FALSE; IF (i ← i+1) >= iMax THEN ExpandBuffer[]};
ENDCASE => EXIT;
ENDLOOP;
i ← i+1;
IF uId
THEN {
h: HashIndex ← ((first*128-first) + last) MOD HashIndex.LAST + 1;
j, s1, s2: CARDINAL;
WHILE (j ← hashTab[h].symbol) # 0
DO
IF vocabIndex[j]-(s2←vocabIndex[j-1]) = i
THEN
FOR s1
IN [0 .. i)
DO
IF buffer[s1] # vocab.text[s2] THEN EXIT;
s2 ← s2+1;
REPEAT
FINISHED => {class ← j; GO TO GotNext};
ENDLOOP;
IF (h ← hashTab[h].link) = 0 THEN EXIT;
ENDLOOP};
desc.length ← i;
class ← tokenID; value.r ← HashOps.EnterString[desc];
GO TO GotNext};
',, ';, ':, '←, '#, '~, '+, '*, '/, '^, '@, '!,
'=, '.,
'(, '), '[, '], '{, '} => {
class ← scanTab[char]; GO TO GetNext};
'" => {
i: CARDINAL ← 0;
valid: BOOL;
advance: BOOL ← TRUE;
DO
IF advance
THEN {
IF (tI←tI+1) = tMax THEN {IF tEnded THEN GO TO EOFEnd; FillBuffer[]};
char ← tB[tI]};
SELECT char
FROM
'" => {
IF (tI←tI+1) = tMax THEN FillBuffer[];
char ← tB[tI];
IF char # '" THEN GO TO QuoteEnd};
ENDCASE;
IF i >= iMax
THEN ExpandBuffer[
! BufferOverflow => {ScanError[string, index]; i ← 0; CONTINUE}];
[buffer[i], valid, advance] ← Escape[]; i ← i+1;
IF ~valid THEN ScanError[$escape, tOrigin + tI];
REPEAT
QuoteEnd => NULL;
EOFEnd => {ScanError[$string, index]; FillBuffer[]; char ← tB[tI]};
ENDLOOP;
desc.length ← i;
value.r ← HashOps.EnterString[desc];
class ← tokenSTR; GO TO GotNext};
'- => {
NextChar[];
IF char # '-
THEN {
class ← scanTab['-];
IF class = 0 THEN ScanError[char, index-1];
GO TO GotNext};
char ← Ascii.NUL;
DO
pChar: CHAR ~ char;
IF (tI←tI+1) = tMax THEN {IF tEnded THEN GO TO EndFile; FillBuffer[]};
char ← tB[tI];
SELECT char
FROM
'- => IF pChar = '- THEN EXIT;
Ascii.CR => EXIT;
ENDCASE;
ENDLOOP;
NextChar[]};
'< => {
NextChar[];
SELECT char
FROM
'< => {
state: {plain, leftBrocket, rightBrocket} ← $plain;
nest: CARDINAL ← 1;
DO
IF (tI←tI+1) = tMax
THEN {
IF tEnded THEN GO TO EndFile; FillBuffer[]};
char ← tB[tI];
SELECT char
FROM
'> =>
SELECT state
FROM
$plain, $leftBrocket => state ← $rightBrocket;
$rightBrocket => {
state ← $plain; nest ← nest - 1; IF nest = 0 THEN EXIT};
ENDCASE;
'< =>
SELECT state
FROM
$plain, $rightBrocket => state ← $leftBrocket;
$leftBrocket => {state ← $plain; nest ← nest + 1};
ENDCASE;
ENDCASE => state ← $plain;
ENDLOOP;
NextChar[]};
ENDCASE => ScanError[$char, index]};
ENDCASE => {
class ← scanTab[char];
IF class # 0 THEN GO TO GetNext;
NextChar[];
ScanError[$char, index]};
REPEAT
GetNext => {IF (tI←tI+1) = tMax THEN FillBuffer[]; char ← tB[tI]};
GotNext => NULL;
EndFile => {
class ← endMarker; index ← tOrigin + (tI-1); value ← P1.nullValue;
UNTIL tEnded DO FillBuffer[] ENDLOOP; -- flush stream
FillBuffer[]; char ← tB[tI]};
ENDLOOP;
nTokens ← nTokens + 1;
RETURN};
numerical conversion
Digit: ARRAY CHAR ['0..'9] OF [0..9] ~ [0,1,2,3,4,5,6,7,8,9];
character and string constants
escapeMark: CHAR ~ '\\;
Escape:
PROC
RETURNS[c:
CHAR, valid, advance:
BOOL←
TRUE] ~ {
c ← char;
IF c = escapeMark
THEN {
NextChar[];
SELECT char
FROM
'n, 'N => c ← Ascii.CR;
'r, 'R => c ← Ascii.CR;
'l, 'L => c ← Ascii.LF;
't, 'T => c ← Ascii.TAB;
'b, 'B => c ← Ascii.BS;
'f, 'F => c ← Ascii.FF;
'', '", escapeMark => c ← char;
IN ['0 .. '7] => {
nc, v: CARDINAL ← 0;
DO
IF ~(char IN ['0..'7]) THEN {valid ← advance ← FALSE; EXIT};
v ← 8*v + Digit[char];
IF (nc ← nc+1) = 3 THEN EXIT;
NextChar[];
ENDLOOP;
IF v > 377b THEN {valid ← FALSE; v ← 0};
c ← VAL[v]};
ENDCASE => valid ← advance ← FALSE};
RETURN};
initialization/finalization
ScanInit:
PUBLIC
PROC[
source: IO.STREAM,
logger: PROC [PROC [log: IO.STREAM]]] ~ {
stream ← source; Logger ← logger;
IF buffer = NIL THEN buffer ← NEW[TEXT[256]];
desc.base ← LOOPHOLE[buffer, LONG STRING]; desc.offset ← 0;
iMax ← buffer.length ← buffer.maxLength;
streamOrigin ← IO.GetIndex[stream];
tB ← NEW[TextBuffer];
tOrigin ← tLimit ← 0; tMax ← 0; tEnded ← FALSE;
FillBuffer[]; char ← tB[tI]; qDot ← FALSE;
nTokens ← nErrors ← 0};
ScanStats:
PUBLIC
PROC
RETURNS[
NAT,
NAT] ~ {
RETURN[nTokens, nErrors]};
ScanReset:
PUBLIC
PROC ~ {
IF buffer # NIL THEN FREE[@buffer];
IF tB # NIL THEN FREE[@tB];
desc.base ← NIL;
stream ← NIL; Logger ← NIL};
error handling
StreamIndex: TYPE ~ INT; -- FileStream.FileByteIndex
charsPerPage: CARDINAL = Basics.charsPerWord*VM.wordsPerPage;
ResetScanIndex:
PUBLIC
PROC[index:
CARDINAL]
RETURNS[success:
BOOL] ~ {
IF ~(index
IN [tOrigin .. tLimit))
THEN {
page: CARDINAL ~ index/charsPerPage;
tOrigin ← tLimit ← page*charsPerPage;
tMax ← 0; tEnded ← FALSE;
IO.SetIndex[stream, streamOrigin + tOrigin];
FillBuffer[]};
tI ← index - tOrigin;
IF tI >= tMax THEN FillBuffer[]; char ← tB[tI]; RETURN[TRUE]};
ErrorCode: TYPE ~ {number, string, char, atom, escape};
ScanError: PROC[code: ErrorCode, tokenIndex: CARDINAL] ~ {
Inner:
PROC [log:
IO.
STREAM] ~ {
ErrorContext[log,
SELECT code
FROM
$number => "invalid number",
$string => "string unterminated or too long",
$char => "invalid character",
$atom => "invalid atom",
$escape => "invalid escape sequence",
ENDCASE => NIL,
tokenIndex];
IO.PutChar[log, '\n]};
nErrors ← nErrors + 1;
Logger[Inner]};
ErrorContext:
PUBLIC
PROC[
to: IO.STREAM, message: Rope.ROPE, tokenIndex: CARDINAL] ~ {
saveIndex: StreamIndex ~ IO.GetIndex[stream];
origin: StreamIndex ~ streamOrigin + tokenIndex;
start, lineIndex: StreamIndex ← origin;
char: CHAR;
n: [1..100];
FOR n
IN [1..100]
UNTIL lineIndex = 0
DO
lineIndex ← lineIndex - 1;
IO.SetIndex[stream, lineIndex];
IF stream.GetChar[] = Ascii.CR THEN EXIT;
start ← lineIndex;
ENDLOOP;
IO.SetIndex[stream, start];
FOR n
IN [1..100]
UNTIL IO.EndOf[stream]
DO
char ← stream.GetChar[];
SELECT char
FROM
Ascii.CR, Ascii.ControlZ => EXIT;
ENDCASE => IO.PutChar[to, char];
ENDLOOP;
IO.PutChar[to, Ascii.CR];
IO.SetIndex[stream, start];
UNTIL
IO.GetIndex[stream] = origin
OR
IO.EndOf[stream]
DO
char ← stream.GetChar[];
IO.PutChar[to, IF char = Ascii.TAB THEN '\t ELSE ' ];
ENDLOOP;
IO.PutF[to, "^ %g[%d]\n", IO.rope[message], IO.card[tokenIndex]];
IO.SetIndex[stream, saveIndex]};
}.