MobScanner.mesa - derived from Compiler>Scanner.mesa
Copyright Ó 1985, 1989, 1991, 1992 by Xerox Corporation. All rights reserved.
Satterthwaite, February 4, 1986 2:23:17 pm PST
Maxwell, August 11, 1983 2:22 pm
Paul Rovner, September 22, 1983 9:49 pm
Russ Atkinson (RRA) March 7, 1985 0:57:59 am PST
Andy Litman May 30, 1988 8:15:33 pm PDT
JKF July 22, 1989 3:39:26 pm PDT
DIRECTORY
Ascii USING [BS, ControlZ, CR, FF, LF, NUL, TAB],
ConvertUnsafe USING [SubString],
IO USING [card, EndOfStream, GetChar, GetIndex, int, Put1, PutChar, PutF, rope, SetIndex, STREAM, UnsafeGetBlock],
MobHashOps USING [EnterString],
MobP1 USING [Token, Value, nullValue],
MobParseTable USING [endMarker, HashIndex, HashTable, HashTableRef, IndexTable, IndexTableRef, InitHashTable, InitIndexTable, InitScanTable, InitVocabulary, ScanTableRef, tokenID, tokenSTR, TSymbol, VocabularyRef],
RefText USING [Append],
Rope USING [ROPE],
VM USING [bytesPerPage, logBytesPerPage, PageCount, PagesForBytes];
MobScanner:
PROGRAM
IMPORTS MobHashOps, IO, RefText, MobParseTable, VM
EXPORTS MobP1 = {
OPEN MobParseTable;
table installation
hashTab: HashTableRef;
scanTab: ScanTableRef;
vocab: VocabularyRef;
vocabIndex: IndexTableRef;
InstallScanTable:
PUBLIC
PROC[] = {
hashTab ¬ MobParseTable.InitHashTable[];
scanTab ¬ MobParseTable.InitScanTable[];
vocab ¬ MobParseTable.InitVocabulary[];
vocabIndex ¬ MobParseTable.InitIndexTable[]};
TypeSym:
PUBLIC
PROC[log:
IO.
STREAM, sym: TSymbol] = {
log.PutChar[' ];
IF sym
IN [1..endMarker)
THEN
FOR i:
NAT
IN [vocabIndex[sym-1]..vocabIndex[sym])
DO
log.PutChar[LOOPHOLE[vocab, LONG STRING][i]] ENDLOOP
ELSE log.Put1[IO.int[sym]]};
scanner state
stream: IO.STREAM ¬ NIL; -- the input stream
streamOrigin: StreamIndex;
Logger: PROC[PROC [log: IO.STREAM]] ¬ NIL;
textChars: NAT ~ 4096;
TextBuffer: TYPE ~ PACKED ARRAY [0..textChars) OF CHAR;
tB: REF TextBuffer;
tI, tMax: [0..textChars];
tOrigin, tLimit: CARDINAL ¬ 0;
tEnded: BOOL ¬ FALSE;
FillBuffer:
PROC ~ {
tOrigin ¬ tLimit;
IF tEnded THEN tMax ¬ 0
ELSE {
tMax ¬ stream.UnsafeGetBlock[[LOOPHOLE[tB], 0, textChars]].nBytesRead;
IF tMax < textChars THEN tEnded ¬ TRUE;
tLimit ¬ tOrigin + tMax};
IF tMax = 0 THEN {tB[0] ¬ Ascii.NUL; tMax ¬ 1};
tI ¬ 0};
buffer: REF TEXT ¬ NIL; -- token assembly area
iMax: CARDINAL; -- iMax = buffer.maxLength
desc: ConvertUnsafe.SubString; -- initial buffer segment
nTokens: NAT ¬ 0; -- token count
nErrors: NAT; -- lexical errors
BufferOverflow: ERROR ~ CODE;
ExpandBuffer:
PROC ~ {
oldBuffer: REF TEXT ¬ buffer;
IF oldBuffer.length > 2000 THEN ERROR BufferOverflow;
buffer ¬ NEW[TEXT[2*oldBuffer.length]];
desc.base ¬ LOOPHOLE[buffer, LONG STRING];
buffer ¬ RefText.Append[buffer, oldBuffer];
iMax ¬ buffer.length ¬ buffer.maxLength;
oldBuffer ¬ NIL};
char: CHAR; -- current (most recently scanned) character
qDot: BOOL; -- used to resolved decimal point vs. interval
NextChar:
PROC ~ {
-- also expanded inline within Atom
IF (tI¬tI+1) = tMax THEN FillBuffer[]; char ¬ tB[tI]};
NextToken:
PUBLIC
PROC
RETURNS[token: MobP1.Token] ~ {
--OPEN token;
DO
WHILE char
IN [Ascii.
NUL..' ]
DO
SELECT char
FROM
Ascii.
NUL => {
-- @@ is Tioga escape seq
IF (tI¬tI+1) = tMax THEN {IF tEnded THEN GO TO EndFile; FillBuffer[]};
char ¬ tB[tI];
IF char = Ascii.NUL THEN GO TO EndFile};
Ascii.ControlZ =>
-- Z is Bravo escape char
UNTIL char = Ascii.
CR
OR char = Ascii.
LF
DO
IF (tI¬tI+1) = tMax THEN {IF tEnded THEN GO TO EndFile; FillBuffer[]};
char ¬ tB[tI];
ENDLOOP;
ENDCASE => {
IF (tI¬tI+1) = tMax THEN {IF tEnded THEN GO TO EndFile; FillBuffer[]};
char ¬ tB[tI]};
ENDLOOP;
token.index ¬ tOrigin + tI; token.value ¬ MobP1.nullValue;
SELECT char FROM
'a, 'b, 'c, 'd, 'e, 'f, 'g, 'h, 'i, 'j, 'k, 'l, 'm,
'n, 'o, 'p, 'q, 'r, 's, 't, 'u, 'v, 'w, 'x, 'y, 'z => {
i: CARDINAL ¬ 0;
DO
buffer[i] ¬ char;
IF (tI¬tI+1) = tMax THEN FillBuffer[];
char ¬ tB[tI];
SELECT char
FROM
IN ['a..'z],
IN ['A..'Z],
IN ['0..'9] =>
IF (i ¬ i+1) >= iMax THEN ExpandBuffer[];
ENDCASE => EXIT;
ENDLOOP;
desc.length ¬ i+1;
token.class ¬ tokenID; token.value.r ¬ LOOPHOLE[MobHashOps.EnterString[desc]];
GO TO GotNext};
'A, 'B, 'C, 'D, 'E, 'F, 'G, 'H, 'I, 'J, 'K, 'L, 'M,
'N, 'O, 'P, 'Q, 'R, 'S, 'T, 'U, 'V, 'W, 'X, 'Y, 'Z => {
i: CARDINAL ¬ 0;
uId: BOOL ¬ TRUE;
first, last: NAT ¬ char.ORD;
DO
buffer[i] ¬ char;
IF (tI¬tI+1) = tMax THEN FillBuffer[];
char ¬ tB[tI];
SELECT char
FROM
IN ['A..'Z] => {
last ¬ char.ORD; IF (i ¬ i+1) >= iMax THEN ExpandBuffer[]};
IN ['a..'z],
IN ['0..'9] => {
uId ¬ FALSE; IF (i ¬ i+1) >= iMax THEN ExpandBuffer[]};
ENDCASE => EXIT;
ENDLOOP;
i ¬ i+1;
IF uId
THEN {
h: HashIndex ¬ ((first*128-first) + last) MOD HashIndex.LAST + 1;
j, s1, s2: CARDINAL;
WHILE (j ¬ hashTab[h].symbol) # 0
DO
IF vocabIndex[j]-(s2¬vocabIndex[j-1]) = i
THEN
FOR s1
IN [0 .. i)
DO
IF buffer[s1] # vocab.text[s2] THEN EXIT;
s2 ¬ s2+1;
REPEAT
FINISHED => {token.class ¬ j; GO TO GotNext};
ENDLOOP;
IF (h ¬ hashTab[h].link) = 0 THEN EXIT;
ENDLOOP};
desc.length ¬ i;
token.class ¬ tokenID; token.value.r ¬ LOOPHOLE[MobHashOps.EnterString[desc]];
GO TO GotNext};
',, ';, ':, '←, '#, '~, '+, '*, '/, '^, '@, '!,
'=, '.,
'(, '), '[, '], '{, '} => {
token.class ¬ scanTab[char]; GO TO GetNext};
'" => {
i: CARDINAL ¬ 0;
valid: BOOL;
advance: BOOL ¬ TRUE;
DO
IF advance
THEN {
IF (tI¬tI+1) = tMax THEN {IF tEnded THEN GO TO EOFEnd; FillBuffer[]};
char ¬ tB[tI]};
SELECT char
FROM
'" => {
IF (tI¬tI+1) = tMax THEN FillBuffer[];
char ¬ tB[tI];
IF char # '" THEN GO TO QuoteEnd};
ENDCASE;
IF i >= iMax
THEN ExpandBuffer[
! BufferOverflow => {ScanError[string, token.index]; i ¬ 0; CONTINUE}];
[buffer[i], valid, advance] ¬ Escape[]; i ¬ i+1;
IF ~valid THEN ScanError[$escape, tOrigin + tI];
REPEAT
QuoteEnd => NULL;
EOFEnd => {ScanError[$string, token.index]; FillBuffer[]; char ¬ tB[tI]};
ENDLOOP;
desc.length ¬ i;
token.value.r ¬ LOOPHOLE[MobHashOps.EnterString[desc]];
token.class ¬ tokenSTR; GO TO GotNext};
'- => {
NextChar[];
IF char # '-
THEN {
token.class ¬ scanTab['-];
IF token.class = 0 THEN ScanError[char, token.index-1];
GO TO GotNext};
char ¬ Ascii.NUL;
DO
pChar: CHAR ~ char;
IF (tI¬tI+1) = tMax THEN {IF tEnded THEN GO TO EndFile; FillBuffer[]};
char ¬ tB[tI];
SELECT char
FROM
'- => IF pChar = '- THEN EXIT;
Ascii.CR, Ascii.LF => EXIT;
ENDCASE;
ENDLOOP;
NextChar[]};
'< => {
NextChar[];
SELECT char
FROM
'< => {
state: {plain, leftBrocket, rightBrocket} ¬ $plain;
nest: CARDINAL ¬ 1;
DO
IF (tI¬tI+1) = tMax
THEN {
IF tEnded THEN GO TO EndFile; FillBuffer[]};
char ¬ tB[tI];
SELECT char
FROM
'> =>
SELECT state
FROM
$plain, $leftBrocket => state ¬ $rightBrocket;
$rightBrocket => {
state ¬ $plain; nest ¬ nest - 1; IF nest = 0 THEN EXIT};
ENDCASE;
'< =>
SELECT state
FROM
$plain, $rightBrocket => state ¬ $leftBrocket;
$leftBrocket => {state ¬ $plain; nest ¬ nest + 1};
ENDCASE;
ENDCASE => state ¬ $plain;
ENDLOOP;
NextChar[]};
ENDCASE => ScanError[$char, token.index]};
ENDCASE => {
token.class ¬ scanTab[char];
IF token.class # 0 THEN GO TO GetNext;
NextChar[];
ScanError[$char, token.index]};
REPEAT
GetNext => {IF (tI¬tI+1) = tMax THEN FillBuffer[]; char ¬ tB[tI]};
GotNext => NULL;
EndFile => {
token.class ¬ endMarker; token.index ¬ tOrigin + (tI-1); token.value ¬ MobP1.nullValue;
UNTIL tEnded DO FillBuffer[] ENDLOOP; -- flush stream
FillBuffer[]; char ¬ tB[tI]};
ENDLOOP;
nTokens ¬ nTokens + 1;
RETURN};
numerical conversion
Digit: ARRAY CHAR ['0..'9] OF [0..9] ~ [0,1,2,3,4,5,6,7,8,9];
character and string constants
escapeMark: CHAR ~ '\\;
Escape:
PROC
RETURNS[c:
CHAR, valid, advance:
BOOL¬TRUE] ~ {
c ¬ char;
IF c = escapeMark
THEN {
NextChar[];
SELECT char
FROM
'n, 'N => c ¬ Ascii.CR;
'r, 'R => c ¬ Ascii.CR;
'l, 'L => c ¬ Ascii.LF;
't, 'T => c ¬ Ascii.TAB;
'b, 'B => c ¬ Ascii.BS;
'f, 'F => c ¬ Ascii.FF;
'', '", escapeMark => c ¬ char;
IN ['0 .. '7] => {
nc, v: CARDINAL ¬ 0;
DO
IF ~(char IN ['0..'7]) THEN {valid ¬ advance ¬ FALSE; EXIT};
v ¬ 8*v + Digit[char];
IF (nc ¬ nc+1) = 3 THEN EXIT;
NextChar[];
ENDLOOP;
IF v > 377b THEN {valid ¬ FALSE; v ¬ 0};
c ¬ VAL[v]};
ENDCASE => valid ¬ advance ¬ FALSE};
RETURN};
initialization/finalization
ScanInit:
PUBLIC
PROC[
source: IO.STREAM,
logger: PROC [PROC [log: IO.STREAM]]] ~ {
stream ¬ source; Logger ¬ logger;
IF buffer = NIL THEN buffer ¬ NEW[TEXT[256]];
desc.base ¬ LOOPHOLE[buffer, LONG STRING]; desc.offset ¬ 0;
iMax ¬ buffer.length ¬ buffer.maxLength;
streamOrigin ¬ IO.GetIndex[stream];
tB ¬ NEW[TextBuffer];
tOrigin ¬ tLimit ¬ 0; tMax ¬ 0; tEnded ¬ FALSE;
FillBuffer[]; char ¬ tB[tI]; qDot ¬ FALSE;
nTokens ¬ nErrors ¬ 0};
ScanStats:
PUBLIC
PROC
RETURNS[
NAT,
NAT] ~ {
RETURN[nTokens, nErrors]};
ScanReset:
PUBLIC
PROC ~ {
IF buffer # NIL THEN FREE[@buffer];
IF tB # NIL THEN FREE[@tB];
desc.base ¬ NIL;
stream ¬ NIL; Logger ¬ NIL};
error handling
StreamIndex: TYPE ~ INT; -- FileStream.FileByteIndex
charsPerPage: CARDINAL ← VM.bytesPerPage/BYTES[CHAR];
so the sources can be the same in both worlds:
charsPerPage: CARDINAL ¬ VM.PagesForBytes[1]/BYTES[CHAR];
ResetScanIndex:
PUBLIC
PROC[index:
CARDINAL]
RETURNS[success:
BOOL] ~ {
IF ~(index
IN [tOrigin .. tLimit))
THEN {
page: CARDINAL ~ index/charsPerPage;
tOrigin ¬ tLimit ¬ page*charsPerPage;
tMax ¬ 0; tEnded ¬ FALSE;
IO.SetIndex[stream, streamOrigin + tOrigin];
FillBuffer[]};
tI ¬ index - tOrigin;
IF tI >= tMax THEN FillBuffer[]; char ¬ tB[tI]; RETURN[TRUE]};
ErrorCode: TYPE ~ {number, string, char, atom, escape};
ScanError: PROC[code: ErrorCode, tokenIndex: CARDINAL] ~ {
Inner:
PROC [log:
IO.
STREAM] ~ {
ErrorContext[log,
SELECT code
FROM
$number => "invalid number",
$string => "string unterminated or too long",
$char => "invalid character",
$atom => "invalid atom",
$escape => "invalid escape sequence",
ENDCASE => NIL,
tokenIndex];
WriteCR[log]};
nErrors ¬ nErrors + 1;
Logger[Inner]};
WriteCR:
PROC[stream:
IO.
STREAM] = {
stream.PutChar['\012]};
ErrorContext:
PUBLIC
PROC[
to: IO.STREAM, message: Rope.ROPE, tokenIndex: CARDINAL] ~ {
saveIndex: StreamIndex ~ IO.GetIndex[stream];
origin: StreamIndex ~ streamOrigin + tokenIndex;
start, lineIndex: StreamIndex ¬ origin;
char: CHAR;
n: [1..100];
FOR n
IN [1..100]
UNTIL lineIndex = 0
DO
lineIndex ¬ lineIndex - 1;
IO.SetIndex[stream, lineIndex];
char ¬ stream.GetChar[];
IF char = Ascii.CR OR char = Ascii.LF THEN EXIT;
start ¬ lineIndex;
ENDLOOP;
IO.SetIndex[stream, start];
FOR n
IN [1..100]
DO
char ¬ stream.GetChar[ ! IO.EndOfStream => EXIT];
SELECT char
FROM
Ascii.CR, Ascii.LF, Ascii.ControlZ => EXIT;
ENDCASE => IO.PutChar[to, char];
ENDLOOP;
WriteCR[to];
IO.SetIndex[stream, start];
UNTIL
IO.GetIndex[stream] = origin
DO
char ¬ stream.GetChar[ ! IO.EndOfStream => EXIT];
IO.PutChar[to, IF char = Ascii.TAB THEN '\t ELSE ' ];
ENDLOOP;
IO.PutF[to, "^ %g[%d]", IO.rope[message], IO.card[tokenIndex]];
WriteCR[to];
IO.SetIndex[stream, saveIndex]};
}.