<<>> <Scanner.mesa>> <> <> <> <> <> <> <> <<>> DIRECTORY Ascii USING [BS, ControlZ, CR, FF, LF, NUL, TAB], ConvertUnsafe USING [SubString], IO USING [card, EndOfStream, GetChar, GetIndex, int, Put1, PutChar, PutF, rope, SetIndex, STREAM, UnsafeGetBlock], MobHashOps USING [EnterString], MobP1 USING [Token, Value, nullValue], MobParseTable USING [endMarker, HashIndex, HashTable, HashTableRef, IndexTable, IndexTableRef, InitHashTable, InitIndexTable, InitScanTable, InitVocabulary, ScanTableRef, tokenID, tokenSTR, TSymbol, VocabularyRef], RefText USING [Append], Rope USING [ROPE], VM USING [bytesPerPage, logBytesPerPage, PageCount, PagesForBytes]; MobScanner: PROGRAM IMPORTS MobHashOps, IO, RefText, MobParseTable, VM EXPORTS MobP1 = { OPEN MobParseTable; <> hashTab: HashTableRef; scanTab: ScanTableRef; vocab: VocabularyRef; vocabIndex: IndexTableRef; InstallScanTable: PUBLIC PROC[] = { hashTab ¬ MobParseTable.InitHashTable[]; scanTab ¬ MobParseTable.InitScanTable[]; vocab ¬ MobParseTable.InitVocabulary[]; vocabIndex ¬ MobParseTable.InitIndexTable[]}; TypeSym: PUBLIC PROC[log: IO.STREAM, sym: TSymbol] = { log.PutChar[' ]; IF sym IN [1..endMarker) THEN FOR i: NAT IN [vocabIndex[sym-1]..vocabIndex[sym]) DO log.PutChar[LOOPHOLE[vocab, LONG STRING][i]] ENDLOOP ELSE log.Put1[IO.int[sym]]}; <> stream: IO.STREAM ¬ NIL; -- the input stream streamOrigin: StreamIndex; Logger: PROC[PROC [log: IO.STREAM]] ¬ NIL; textChars: NAT ~ 4096; TextBuffer: TYPE ~ PACKED ARRAY [0..textChars) OF CHAR; tB: REF TextBuffer; tI, tMax: [0..textChars]; tOrigin, tLimit: CARDINAL ¬ 0; tEnded: BOOL ¬ FALSE; FillBuffer: PROC ~ { tOrigin ¬ tLimit; IF tEnded THEN tMax ¬ 0 ELSE { tMax ¬ stream.UnsafeGetBlock[[LOOPHOLE[tB], 0, textChars]].nBytesRead; IF tMax < textChars THEN tEnded ¬ TRUE; tLimit ¬ tOrigin + tMax}; IF tMax = 0 THEN {tB[0] ¬ Ascii.NUL; tMax ¬ 1}; tI ¬ 0}; buffer: REF TEXT ¬ NIL; -- token assembly area iMax: CARDINAL; -- iMax = buffer.maxLength desc: ConvertUnsafe.SubString; -- initial buffer segment nTokens: NAT ¬ 0; -- token count nErrors: NAT; -- lexical errors BufferOverflow: ERROR ~ CODE; ExpandBuffer: PROC ~ { oldBuffer: REF TEXT ¬ buffer; IF oldBuffer.length > 2000 THEN ERROR BufferOverflow; buffer ¬ NEW[TEXT[2*oldBuffer.length]]; desc.base ¬ LOOPHOLE[buffer, LONG STRING]; buffer ¬ RefText.Append[buffer, oldBuffer]; iMax ¬ buffer.length ¬ buffer.maxLength; oldBuffer ¬ NIL}; char: CHAR; -- current (most recently scanned) character qDot: BOOL; -- used to resolved decimal point vs. interval NextChar: PROC ~ { -- also expanded inline within Atom IF (tI¬tI+1) = tMax THEN FillBuffer[]; char ¬ tB[tI]}; NextToken: PUBLIC PROC RETURNS[token: MobP1.Token] ~ { --OPEN token; DO WHILE char IN [Ascii.NUL..' ] DO SELECT char FROM Ascii.NUL => { -- ­@­@ is Tioga escape seq IF (tI¬tI+1) = tMax THEN {IF tEnded THEN GO TO EndFile; FillBuffer[]}; char ¬ tB[tI]; IF char = Ascii.NUL THEN GO TO EndFile}; Ascii.ControlZ => -- ­Z is Bravo escape char UNTIL char = Ascii.CR OR char = Ascii.LF DO IF (tI¬tI+1) = tMax THEN {IF tEnded THEN GO TO EndFile; FillBuffer[]}; char ¬ tB[tI]; ENDLOOP; ENDCASE => { IF (tI¬tI+1) = tMax THEN {IF tEnded THEN GO TO EndFile; FillBuffer[]}; char ¬ tB[tI]}; ENDLOOP; token.index ¬ tOrigin + tI; token.value ¬ MobP1.nullValue; SELECT char FROM 'a, 'b, 'c, 'd, 'e, 'f, 'g, 'h, 'i, 'j, 'k, 'l, 'm, 'n, 'o, 'p, 'q, 'r, 's, 't, 'u, 'v, 'w, 'x, 'y, 'z => { i: CARDINAL ¬ 0; DO buffer[i] ¬ char; IF (tI¬tI+1) = tMax THEN FillBuffer[]; char ¬ tB[tI]; SELECT char FROM IN ['a..'z], IN ['A..'Z], IN ['0..'9] => IF (i ¬ i+1) >= iMax THEN ExpandBuffer[]; ENDCASE => EXIT; ENDLOOP; desc.length ¬ i+1; token.class ¬ tokenID; token.value.r ¬ LOOPHOLE[MobHashOps.EnterString[desc]]; GO TO GotNext}; 'A, 'B, 'C, 'D, 'E, 'F, 'G, 'H, 'I, 'J, 'K, 'L, 'M, 'N, 'O, 'P, 'Q, 'R, 'S, 'T, 'U, 'V, 'W, 'X, 'Y, 'Z => { i: CARDINAL ¬ 0; uId: BOOL ¬ TRUE; first, last: NAT ¬ char.ORD; DO buffer[i] ¬ char; IF (tI¬tI+1) = tMax THEN FillBuffer[]; char ¬ tB[tI]; SELECT char FROM IN ['A..'Z] => { last ¬ char.ORD; IF (i ¬ i+1) >= iMax THEN ExpandBuffer[]}; IN ['a..'z], IN ['0..'9] => { uId ¬ FALSE; IF (i ¬ i+1) >= iMax THEN ExpandBuffer[]}; ENDCASE => EXIT; ENDLOOP; i ¬ i+1; IF uId THEN { h: HashIndex ¬ ((first*128-first) + last) MOD HashIndex.LAST + 1; j, s1, s2: CARDINAL; WHILE (j ¬ hashTab[h].symbol) # 0 DO IF vocabIndex[j]-(s2¬vocabIndex[j-1]) = i THEN FOR s1 IN [0 .. i) DO IF buffer[s1] # vocab.text[s2] THEN EXIT; s2 ¬ s2+1; REPEAT FINISHED => {token.class ¬ j; GO TO GotNext}; ENDLOOP; IF (h ¬ hashTab[h].link) = 0 THEN EXIT; ENDLOOP}; desc.length ¬ i; token.class ¬ tokenID; token.value.r ¬ LOOPHOLE[MobHashOps.EnterString[desc]]; GO TO GotNext}; ',, ';, ':, '_, '#, '~, '+, '*, '/, '^, '@, '!, '=, '., '(, '), '[, '], '{, '} => { token.class ¬ scanTab[char]; GO TO GetNext}; '" => { i: CARDINAL ¬ 0; valid: BOOL; advance: BOOL ¬ TRUE; DO IF advance THEN { IF (tI¬tI+1) = tMax THEN {IF tEnded THEN GO TO EOFEnd; FillBuffer[]}; char ¬ tB[tI]}; SELECT char FROM '" => { IF (tI¬tI+1) = tMax THEN FillBuffer[]; char ¬ tB[tI]; IF char # '" THEN GO TO QuoteEnd}; ENDCASE; IF i >= iMax THEN ExpandBuffer[ ! BufferOverflow => {ScanError[string, token.index]; i ¬ 0; CONTINUE}]; [buffer[i], valid, advance] ¬ Escape[]; i ¬ i+1; IF ~valid THEN ScanError[$escape, tOrigin + tI]; REPEAT QuoteEnd => NULL; EOFEnd => {ScanError[$string, token.index]; FillBuffer[]; char ¬ tB[tI]}; ENDLOOP; desc.length ¬ i; token.value.r ¬ LOOPHOLE[MobHashOps.EnterString[desc]]; token.class ¬ tokenSTR; GO TO GotNext}; '- => { NextChar[]; IF char # '- THEN { token.class ¬ scanTab['-]; IF token.class = 0 THEN ScanError[char, token.index-1]; GO TO GotNext}; char ¬ Ascii.NUL; DO pChar: CHAR ~ char; IF (tI¬tI+1) = tMax THEN {IF tEnded THEN GO TO EndFile; FillBuffer[]}; char ¬ tB[tI]; SELECT char FROM '- => IF pChar = '- THEN EXIT; Ascii.CR, Ascii.LF => EXIT; ENDCASE; ENDLOOP; NextChar[]}; '< => { NextChar[]; SELECT char FROM '< => { state: {plain, leftBrocket, rightBrocket} ¬ $plain; nest: CARDINAL ¬ 1; DO IF (tI¬tI+1) = tMax THEN { IF tEnded THEN GO TO EndFile; FillBuffer[]}; char ¬ tB[tI]; SELECT char FROM '> => SELECT state FROM $plain, $leftBrocket => state ¬ $rightBrocket; $rightBrocket => { state ¬ $plain; nest ¬ nest - 1; IF nest = 0 THEN EXIT}; ENDCASE; '< => SELECT state FROM $plain, $rightBrocket => state ¬ $leftBrocket; $leftBrocket => {state ¬ $plain; nest ¬ nest + 1}; ENDCASE; ENDCASE => state ¬ $plain; ENDLOOP; NextChar[]}; ENDCASE => ScanError[$char, token.index]}; ENDCASE => { token.class ¬ scanTab[char]; IF token.class # 0 THEN GO TO GetNext; NextChar[]; ScanError[$char, token.index]}; REPEAT GetNext => {IF (tI¬tI+1) = tMax THEN FillBuffer[]; char ¬ tB[tI]}; GotNext => NULL; EndFile => { token.class ¬ endMarker; token.index ¬ tOrigin + (tI-1); token.value ¬ MobP1.nullValue; UNTIL tEnded DO FillBuffer[] ENDLOOP; -- flush stream FillBuffer[]; char ¬ tB[tI]}; ENDLOOP; nTokens ¬ nTokens + 1; RETURN}; <> Digit: ARRAY CHAR ['0..'9] OF [0..9] ~ [0,1,2,3,4,5,6,7,8,9]; <> escapeMark: CHAR ~ '\\; Escape: PROC RETURNS[c: CHAR, valid, advance: BOOL¬TRUE] ~ { c ¬ char; IF c = escapeMark THEN { NextChar[]; SELECT char FROM 'n, 'N => c ¬ Ascii.CR; 'r, 'R => c ¬ Ascii.CR; 'l, 'L => c ¬ Ascii.LF; 't, 'T => c ¬ Ascii.TAB; 'b, 'B => c ¬ Ascii.BS; 'f, 'F => c ¬ Ascii.FF; '', '", escapeMark => c ¬ char; IN ['0 .. '7] => { nc, v: CARDINAL ¬ 0; DO IF ~(char IN ['0..'7]) THEN {valid ¬ advance ¬ FALSE; EXIT}; v ¬ 8*v + Digit[char]; IF (nc ¬ nc+1) = 3 THEN EXIT; NextChar[]; ENDLOOP; IF v > 377b THEN {valid ¬ FALSE; v ¬ 0}; c ¬ VAL[v]}; ENDCASE => valid ¬ advance ¬ FALSE}; RETURN}; <> ScanInit: PUBLIC PROC[ source: IO.STREAM, logger: PROC [PROC [log: IO.STREAM]]] ~ { stream ¬ source; Logger ¬ logger; IF buffer = NIL THEN buffer ¬ NEW[TEXT[256]]; desc.base ¬ LOOPHOLE[buffer, LONG STRING]; desc.offset ¬ 0; iMax ¬ buffer.length ¬ buffer.maxLength; streamOrigin ¬ IO.GetIndex[stream]; tB ¬ NEW[TextBuffer]; tOrigin ¬ tLimit ¬ 0; tMax ¬ 0; tEnded ¬ FALSE; FillBuffer[]; char ¬ tB[tI]; qDot ¬ FALSE; nTokens ¬ nErrors ¬ 0}; ScanStats: PUBLIC PROC RETURNS[NAT, NAT] ~ { RETURN[nTokens, nErrors]}; ScanReset: PUBLIC PROC ~ { IF buffer # NIL THEN FREE[@buffer]; IF tB # NIL THEN FREE[@tB]; desc.base ¬ NIL; stream ¬ NIL; Logger ¬ NIL}; <> StreamIndex: TYPE ~ INT; -- FileStream.FileByteIndex <> <> charsPerPage: CARDINAL ¬ VM.PagesForBytes[1]/BYTES[CHAR]; ResetScanIndex: PUBLIC PROC[index: CARDINAL] RETURNS[success: BOOL] ~ { IF ~(index IN [tOrigin .. tLimit)) THEN { page: CARDINAL ~ index/charsPerPage; tOrigin ¬ tLimit ¬ page*charsPerPage; tMax ¬ 0; tEnded ¬ FALSE; IO.SetIndex[stream, streamOrigin + tOrigin]; FillBuffer[]}; tI ¬ index - tOrigin; IF tI >= tMax THEN FillBuffer[]; char ¬ tB[tI]; RETURN[TRUE]}; ErrorCode: TYPE ~ {number, string, char, atom, escape}; ScanError: PROC[code: ErrorCode, tokenIndex: CARDINAL] ~ { Inner: PROC [log: IO.STREAM] ~ { ErrorContext[log, SELECT code FROM $number => "invalid number", $string => "string unterminated or too long", $char => "invalid character", $atom => "invalid atom", $escape => "invalid escape sequence", ENDCASE => NIL, tokenIndex]; WriteCR[log]}; nErrors ¬ nErrors + 1; Logger[Inner]}; WriteCR: PROC[stream: IO.STREAM] = { stream.PutChar['\012]}; ErrorContext: PUBLIC PROC[ to: IO.STREAM, message: Rope.ROPE, tokenIndex: CARDINAL] ~ { saveIndex: StreamIndex ~ IO.GetIndex[stream]; origin: StreamIndex ~ streamOrigin + tokenIndex; start, lineIndex: StreamIndex ¬ origin; char: CHAR; n: [1..100]; FOR n IN [1..100] UNTIL lineIndex = 0 DO lineIndex ¬ lineIndex - 1; IO.SetIndex[stream, lineIndex]; char ¬ stream.GetChar[]; IF char = Ascii.CR OR char = Ascii.LF THEN EXIT; start ¬ lineIndex; ENDLOOP; IO.SetIndex[stream, start]; FOR n IN [1..100] DO char ¬ stream.GetChar[ ! IO.EndOfStream => EXIT]; SELECT char FROM Ascii.CR, Ascii.LF, Ascii.ControlZ => EXIT; ENDCASE => IO.PutChar[to, char]; ENDLOOP; WriteCR[to]; IO.SetIndex[stream, start]; UNTIL IO.GetIndex[stream] = origin DO char ¬ stream.GetChar[ ! IO.EndOfStream => EXIT]; IO.PutChar[to, IF char = Ascii.TAB THEN '\t ELSE ' ]; ENDLOOP; IO.PutF[to, "^ %g[%d]", IO.rope[message], IO.card[tokenIndex]]; WriteCR[to]; IO.SetIndex[stream, saveIndex]}; }.