MobScanner.mesa - derived from Compiler>Scanner.mesa
Copyright Ó 1985, 1989, 1991, 1992 by Xerox Corporation. All rights reserved.
Satterthwaite, February 4, 1986 2:23:17 pm PST
Maxwell, August 11, 1983 2:22 pm
Paul Rovner, September 22, 1983 9:49 pm
Russ Atkinson (RRA) March 7, 1985 0:57:59 am PST
Andy Litman May 30, 1988 8:15:33 pm PDT
JKF July 22, 1989 3:39:26 pm PDT
DIRECTORY
Ascii USING [BS, ControlZ, CR, FF, LF, NUL, TAB],
ConvertUnsafe USING [SubString],
IO USING [card, EndOfStream, GetChar, GetIndex, int, Put1, PutChar, PutF, rope, SetIndex, STREAM, UnsafeGetBlock],
MobHashOps USING [EnterString],
MobP1 USING [Token, Value, nullValue],
MobParseTable USING [endMarker, HashIndex, HashTable, HashTableRef, IndexTable, IndexTableRef, InitHashTable, InitIndexTable, InitScanTable, InitVocabulary, ScanTableRef, tokenID, tokenSTR, TSymbol, VocabularyRef],
RefText USING [Append],
Rope USING [ROPE],
VM USING [bytesPerPage, logBytesPerPage, PageCount, PagesForBytes];
MobScanner: PROGRAM
IMPORTS MobHashOps, IO, RefText, MobParseTable, VM
EXPORTS MobP1 = {
OPEN MobParseTable;
table installation
hashTab: HashTableRef;
scanTab: ScanTableRef;
vocab: VocabularyRef;
vocabIndex: IndexTableRef;
InstallScanTable: PUBLIC PROC[] = {
hashTab ¬ MobParseTable.InitHashTable[];
scanTab ¬ MobParseTable.InitScanTable[];
vocab ¬ MobParseTable.InitVocabulary[];
vocabIndex ¬ MobParseTable.InitIndexTable[]};
TypeSym: PUBLIC PROC[log: IO.STREAM, sym: TSymbol] = {
log.PutChar[' ];
IF sym IN [1..endMarker) THEN
FOR i: NAT IN [vocabIndex[sym-1]..vocabIndex[sym]) DO
log.PutChar[LOOPHOLE[vocab, LONG STRING][i]] ENDLOOP
ELSE log.Put1[IO.int[sym]]};
scanner state
stream: IO.STREAM ¬ NIL;  -- the input stream
streamOrigin: StreamIndex;
Logger: PROC[PROC [log: IO.STREAM]] ¬ NIL;
textChars: NAT ~ 4096;
TextBuffer: TYPE ~ PACKED ARRAY [0..textChars) OF CHAR;
tB: REF TextBuffer;
tI, tMax: [0..textChars];
tOrigin, tLimit: CARDINAL ¬ 0;
tEnded: BOOL ¬ FALSE;
FillBuffer: PROC ~ {
tOrigin ¬ tLimit;
IF tEnded THEN tMax ¬ 0
ELSE {
tMax ¬ stream.UnsafeGetBlock[[LOOPHOLE[tB], 0, textChars]].nBytesRead;
IF tMax < textChars THEN tEnded ¬ TRUE;
tLimit ¬ tOrigin + tMax};
IF tMax = 0 THEN {tB[0] ¬ Ascii.NUL; tMax ¬ 1};
tI ¬ 0};
buffer: REF TEXT ¬ NIL;  -- token assembly area
iMax: CARDINAL;   -- iMax = buffer.maxLength
desc: ConvertUnsafe.SubString; -- initial buffer segment
nTokens: NAT ¬ 0;    -- token count
nErrors: NAT;    -- lexical errors
BufferOverflow: ERROR ~ CODE;
ExpandBuffer: PROC ~ {
oldBuffer: REF TEXT ¬ buffer;
IF oldBuffer.length > 2000 THEN ERROR BufferOverflow;
buffer ¬ NEW[TEXT[2*oldBuffer.length]];
desc.base ¬ LOOPHOLE[buffer, LONG STRING];
buffer ¬ RefText.Append[buffer, oldBuffer];
iMax ¬ buffer.length ¬ buffer.maxLength;
oldBuffer ¬ NIL};
char: CHAR;  -- current (most recently scanned) character
qDot: BOOL;  -- used to resolved decimal point vs. interval
NextChar: PROC ~ { -- also expanded inline within Atom
IF (tI¬tI+1) = tMax THEN FillBuffer[]; char ¬ tB[tI]};
NextToken: PUBLIC PROC RETURNS[token: MobP1.Token] ~ {
--OPEN token;
DO
WHILE char IN [Ascii.NUL..' ] DO
SELECT char FROM
Ascii.NUL => {  -- ­@­@ is Tioga escape seq
IF (tI¬tI+1) = tMax THEN {IF tEnded THEN GO TO EndFile; FillBuffer[]};
char ¬ tB[tI];
IF char = Ascii.NUL THEN GO TO EndFile};
Ascii.ControlZ =>  -- ­Z is Bravo escape char
UNTIL char = Ascii.CR OR char = Ascii.LF DO
IF (tI¬tI+1) = tMax THEN {IF tEnded THEN GO TO EndFile; FillBuffer[]};
char ¬ tB[tI];
ENDLOOP;
ENDCASE => {
IF (tI¬tI+1) = tMax THEN {IF tEnded THEN GO TO EndFile; FillBuffer[]};
char ¬ tB[tI]};
ENDLOOP;
token.index ¬ tOrigin + tI; token.value ¬ MobP1.nullValue;
SELECT char FROM
'a, 'b, 'c, 'd, 'e, 'f, 'g, 'h, 'i, 'j, 'k, 'l, 'm,
'n, 'o, 'p, 'q, 'r, 's, 't, 'u, 'v, 'w, 'x, 'y, 'z => {
i: CARDINAL ¬ 0;
DO
buffer[i] ¬ char;
IF (tI¬tI+1) = tMax THEN FillBuffer[];
char ¬ tB[tI];
SELECT char FROM
IN ['a..'z], IN ['A..'Z], IN ['0..'9] =>
IF (i ¬ i+1) >= iMax THEN ExpandBuffer[];
ENDCASE => EXIT;
ENDLOOP;
desc.length ¬ i+1;
token.class ¬ tokenID; token.value.r ¬ LOOPHOLE[MobHashOps.EnterString[desc]];
GO TO GotNext};
'A, 'B, 'C, 'D, 'E, 'F, 'G, 'H, 'I, 'J, 'K, 'L, 'M,
'N, 'O, 'P, 'Q, 'R, 'S, 'T, 'U, 'V, 'W, 'X, 'Y, 'Z => {
i: CARDINAL ¬ 0;
uId: BOOL ¬ TRUE;
first, last: NAT ¬ char.ORD;
DO
buffer[i] ¬ char;
IF (tI¬tI+1) = tMax THEN FillBuffer[];
char ¬ tB[tI];
SELECT char FROM
IN ['A..'Z] => {
last ¬ char.ORD; IF (i ¬ i+1) >= iMax THEN ExpandBuffer[]};
IN ['a..'z], IN ['0..'9] => {
uId ¬ FALSE; IF (i ¬ i+1) >= iMax THEN ExpandBuffer[]};
ENDCASE => EXIT;
ENDLOOP;
i ¬ i+1;
IF uId THEN {
h: HashIndex ¬ ((first*128-first) + last) MOD HashIndex.LAST + 1;
j, s1, s2: CARDINAL;
WHILE (j ¬ hashTab[h].symbol) # 0 DO
IF vocabIndex[j]-(s2¬vocabIndex[j-1]) = i THEN
FOR s1 IN [0 .. i) DO
IF buffer[s1] # vocab.text[s2] THEN EXIT;
s2 ¬ s2+1;
REPEAT
FINISHED => {token.class ¬ j; GO TO GotNext};
ENDLOOP;
IF (h ¬ hashTab[h].link) = 0 THEN EXIT;
ENDLOOP};
desc.length ¬ i;
token.class ¬ tokenID; token.value.r ¬ LOOPHOLE[MobHashOps.EnterString[desc]];
GO TO GotNext};
',, ';, ':, '←, '#, '~, '+, '*, '/, '^, '@, '!,
'=, '.,
'(, '), '[, '], '{, '} => {
token.class ¬ scanTab[char]; GO TO GetNext};
'" => {
i: CARDINAL ¬ 0;
valid: BOOL;
advance: BOOL ¬ TRUE;
DO
IF advance THEN {
IF (tI¬tI+1) = tMax THEN {IF tEnded THEN GO TO EOFEnd; FillBuffer[]};
char ¬ tB[tI]};
SELECT char FROM
'" => {
IF (tI¬tI+1) = tMax THEN FillBuffer[];
char ¬ tB[tI];
IF char # '" THEN GO TO QuoteEnd};
ENDCASE;
IF i >= iMax THEN ExpandBuffer[
! BufferOverflow => {ScanError[string, token.index]; i ¬ 0; CONTINUE}];
[buffer[i], valid, advance] ¬ Escape[]; i ¬ i+1;
IF ~valid THEN ScanError[$escape, tOrigin + tI];
REPEAT
QuoteEnd => NULL;
EOFEnd => {ScanError[$string, token.index]; FillBuffer[]; char ¬ tB[tI]};
ENDLOOP;
desc.length ¬ i;
token.value.r ¬ LOOPHOLE[MobHashOps.EnterString[desc]];
token.class ¬ tokenSTR; GO TO GotNext};
'- => {
NextChar[];
IF char # '- THEN {
token.class ¬ scanTab['-];
IF token.class = 0 THEN ScanError[char, token.index-1];
GO TO GotNext};
char ¬ Ascii.NUL;
DO
pChar: CHAR ~ char;
IF (tI¬tI+1) = tMax THEN {IF tEnded THEN GO TO EndFile; FillBuffer[]};
char ¬ tB[tI];
SELECT char FROM
'- => IF pChar = '- THEN EXIT;
Ascii.CR, Ascii.LF => EXIT;
ENDCASE;
ENDLOOP;
NextChar[]};
'< => {
NextChar[];
SELECT char FROM
'< => {
state: {plain, leftBrocket, rightBrocket} ¬ $plain;
nest: CARDINAL ¬ 1;
DO
IF (tI¬tI+1) = tMax THEN {
IF tEnded THEN GO TO EndFile; FillBuffer[]};
char ¬ tB[tI];
SELECT char FROM
'> => SELECT state FROM
$plain, $leftBrocket => state ¬ $rightBrocket;
$rightBrocket => {
state ¬ $plain; nest ¬ nest - 1; IF nest = 0 THEN EXIT};
ENDCASE;
'< => SELECT state FROM
$plain, $rightBrocket => state ¬ $leftBrocket;
$leftBrocket => {state ¬ $plain; nest ¬ nest + 1};
ENDCASE;
ENDCASE => state ¬ $plain;
ENDLOOP;
NextChar[]};
ENDCASE => ScanError[$char, token.index]};
ENDCASE => {
token.class ¬ scanTab[char];
IF token.class # 0 THEN GO TO GetNext;
NextChar[];
ScanError[$char, token.index]};
REPEAT
GetNext => {IF (tI¬tI+1) = tMax THEN FillBuffer[]; char ¬ tB[tI]};
GotNext => NULL;
EndFile => {
token.class ¬ endMarker; token.index ¬ tOrigin + (tI-1); token.value ¬ MobP1.nullValue;
UNTIL tEnded DO FillBuffer[] ENDLOOP;  -- flush stream
FillBuffer[]; char ¬ tB[tI]};
ENDLOOP;
nTokens ¬ nTokens + 1;
RETURN};
numerical conversion
Digit: ARRAY CHAR ['0..'9] OF [0..9] ~ [0,1,2,3,4,5,6,7,8,9];
character and string constants
escapeMark: CHAR ~ '\\;
Escape: PROC RETURNS[c: CHAR, valid, advance: BOOL¬TRUE] ~ {
c ¬ char;
IF c = escapeMark THEN {
NextChar[];
SELECT char FROM
'n, 'N => c ¬ Ascii.CR;
'r, 'R => c ¬ Ascii.CR;
'l, 'L => c ¬ Ascii.LF;
't, 'T => c ¬ Ascii.TAB;
'b, 'B => c ¬ Ascii.BS;
'f, 'F => c ¬ Ascii.FF;
'', '", escapeMark => c ¬ char;
IN ['0 .. '7] => {
nc, v: CARDINAL ¬ 0;
DO
IF ~(char IN ['0..'7]) THEN {valid ¬ advance ¬ FALSE; EXIT};
v ¬ 8*v + Digit[char];
IF (nc ¬ nc+1) = 3 THEN EXIT;
NextChar[];
ENDLOOP;
IF v > 377b THEN {valid ¬ FALSE; v ¬ 0};
c ¬ VAL[v]};
ENDCASE => valid ¬ advance ¬ FALSE};
RETURN};
initialization/finalization
ScanInit: PUBLIC PROC[
source: IO.STREAM,
logger: PROC [PROC [log: IO.STREAM]]] ~ {
stream ¬ source; Logger ¬ logger;
IF buffer = NIL THEN buffer ¬ NEW[TEXT[256]];
desc.base ¬ LOOPHOLE[buffer, LONG STRING]; desc.offset ¬ 0;
iMax ¬ buffer.length ¬ buffer.maxLength;
streamOrigin ¬ IO.GetIndex[stream];
tB ¬ NEW[TextBuffer];
tOrigin ¬ tLimit ¬ 0; tMax ¬ 0; tEnded ¬ FALSE;
FillBuffer[]; char ¬ tB[tI]; qDot ¬ FALSE;
nTokens ¬ nErrors ¬ 0};
ScanStats: PUBLIC PROC RETURNS[NAT, NAT] ~ {
RETURN[nTokens, nErrors]};
ScanReset: PUBLIC PROC ~ {
IF buffer # NIL THEN FREE[@buffer];
IF tB # NIL THEN FREE[@tB];
desc.base ¬ NIL;
stream ¬ NIL; Logger ¬ NIL};
error handling
StreamIndex: TYPE ~ INT; -- FileStream.FileByteIndex
charsPerPage: CARDINALVM.bytesPerPage/BYTES[CHAR];
so the sources can be the same in both worlds:
charsPerPage: CARDINAL ¬ VM.PagesForBytes[1]/BYTES[CHAR];
ResetScanIndex: PUBLIC PROC[index: CARDINAL] RETURNS[success: BOOL] ~ {
IF ~(index IN [tOrigin .. tLimit)) THEN {
page: CARDINAL ~ index/charsPerPage;
tOrigin ¬ tLimit ¬ page*charsPerPage;
tMax ¬ 0; tEnded ¬ FALSE;
IO.SetIndex[stream, streamOrigin + tOrigin];
FillBuffer[]};
tI ¬ index - tOrigin;
IF tI >= tMax THEN FillBuffer[]; char ¬ tB[tI]; RETURN[TRUE]};
ErrorCode: TYPE ~ {number, string, char, atom, escape};
ScanError: PROC[code: ErrorCode, tokenIndex: CARDINAL] ~ {
Inner: PROC [log: IO.STREAM] ~ {
ErrorContext[log,
SELECT code FROM
$number => "invalid number",
$string => "string unterminated or too long",
$char => "invalid character",
$atom => "invalid atom",
$escape => "invalid escape sequence",
ENDCASE => NIL,
tokenIndex];
WriteCR[log]};
nErrors ¬ nErrors + 1;
Logger[Inner]};
WriteCR: PROC[stream: IO.STREAM] = {
stream.PutChar['\012]};
ErrorContext: PUBLIC PROC[
to: IO.STREAM, message: Rope.ROPE, tokenIndex: CARDINAL] ~ {
saveIndex: StreamIndex ~ IO.GetIndex[stream];
origin: StreamIndex ~ streamOrigin + tokenIndex;
start, lineIndex: StreamIndex ¬ origin;
char: CHAR;
n: [1..100];
FOR n IN [1..100] UNTIL lineIndex = 0 DO
lineIndex ¬ lineIndex - 1;
IO.SetIndex[stream, lineIndex];
char ¬ stream.GetChar[];
IF char = Ascii.CR OR char = Ascii.LF THEN EXIT;
start ¬ lineIndex;
ENDLOOP;
IO.SetIndex[stream, start];
FOR n IN [1..100] DO
char ¬ stream.GetChar[ ! IO.EndOfStream => EXIT];
SELECT char FROM
Ascii.CR, Ascii.LF, Ascii.ControlZ => EXIT;
ENDCASE => IO.PutChar[to, char];
ENDLOOP;
WriteCR[to];
IO.SetIndex[stream, start];
UNTIL IO.GetIndex[stream] = origin DO
char ¬ stream.GetChar[ ! IO.EndOfStream => EXIT];
IO.PutChar[to, IF char = Ascii.TAB THEN '\t ELSE ' ];
ENDLOOP;
IO.PutF[to, "^ %g[%d]", IO.rope[message], IO.card[tokenIndex]];
WriteCR[to];
IO.SetIndex[stream, saveIndex]};
}.