-- file ProtoScanner.mesa
-- last modified by Satterthwaite, January 10, 1983 2:20 pm
-- derived from Compiler>Scanner.mesa

DIRECTORY
  Ascii: TYPE USING [BS, ControlZ, CR, FF, LF, NUL, TAB],
  CharIO: TYPE USING [PutChar, PutNumber, PutString],
  Environment: TYPE USING [charsPerWord, charsPerPage, wordsPerPage],
  FileStream: TYPE USING [FileByteIndex, EndOf, GetIndex, SetIndex],
  HashOps: TYPE USING [EnterString],
  P1: TYPE USING [Token, Value, nullValue],
  ParseTable: TYPE USING [
    HashIndex, HashTableRef, IndexTableRef, ScanTableRef, TableRef, VocabularyRef,
    endMarker, tokenID, tokenSTR],
  Stream: TYPE USING [Handle, GetBlock, GetChar],
  Strings: TYPE USING [String, SubStringDescriptor, AppendString];

Scanner: PROGRAM
    IMPORTS CharIO, FileStream, HashOps, Stream, Strings 
    EXPORTS P1 = {
  OPEN ParseTable;

-- table installation
 
  tablePtr: ParseTable.TableRef;
  hashTab: HashTableRef;
  scanTab: ScanTableRef;
  vocab: VocabularyRef;
  vocabIndex: IndexTableRef;

  InstallScanTable: PUBLIC PROC [base: ParseTable.TableRef] = {
    tablePtr ← base;
    hashTab ← @tablePtr[tablePtr.scanTable.hashTab];
    scanTab ← @tablePtr[tablePtr.scanTable.scanTab];
    vocab ← LOOPHOLE[@tablePtr[tablePtr.scanTable.vocabBody]];
    vocabIndex ← @tablePtr[tablePtr.scanTable.vocabIndex]};


-- scanner state
 
  stream: Stream.Handle ← NIL;		-- the input stream
  streamOrigin: FileStream.FileByteIndex;

  Logger: PROC [PROC [log: Stream.Handle]] ← NIL;
  
  zone: UNCOUNTED ZONE ← NIL;
  
  textPages: NAT ~ 6;
  textWords: NAT ~ textPages*Environment.wordsPerPage;
  textChars: NAT ~ textWords*Environment.charsPerWord;
  TextBuffer: TYPE ~ PACKED ARRAY [0..textChars) OF CHAR;
  
  tB: LONG POINTER TO TextBuffer;
  tI, tMax: [0..textChars];
  tOrigin, tLimit: CARDINAL;
  tEnded: BOOL;


  FillBuffer: PROC ~ {
    tOrigin ← tLimit;
    IF tEnded THEN tMax ← 0
    ELSE {
      tMax ← stream.GetBlock[[LOOPHOLE[tB], 0, textChars]].bytesTransferred;
      IF tMax < textChars THEN tEnded ← TRUE;
      tLimit ← tOrigin + tMax};
    IF tMax = 0 THEN {tB[0] ← Ascii.NUL; tMax ← 1};
    tI ← 0};


  buffer: Strings.String ← NIL;		-- token assembly area
  iMax: CARDINAL;			-- iMax = buffer.maxlength
  desc: Strings.SubStringDescriptor;	-- initial buffer segment

  nTokens: NAT;				-- token count
  nErrors: NAT;				-- lexical errors

  BufferOverflow: ERROR ~ CODE;

  ExpandBuffer: PROC ~ {
    oldBuffer: Strings.String ← buffer;
    IF oldBuffer.length > 2000 THEN ERROR BufferOverflow;
    buffer ← zone.NEW[StringBody[2*oldBuffer.length]];
    Strings.AppendString[buffer, oldBuffer];
    iMax ← buffer.length ← buffer.maxlength;
    zone.FREE[@oldBuffer];
    desc.base ← buffer};


  char: CHAR;		-- current (most recently scanned) character
  qDot: BOOL;		-- used to resolved decimal point vs. interval

  NextChar: PROC ~ {	-- also expanded inline within Atom
    IF (tI←tI+1) = tMax THEN FillBuffer[]; char ← tB[tI]};


  Atom: PUBLIC PROC RETURNS [token: P1.Token] ~ {
    OPEN token;
    DO
      WHILE char IN [Ascii.NUL..' ] DO
	SELECT char FROM
	  Ascii.NUL => {		-- ↑@↑@ is Tioga escape seq
	    IF (tI←tI+1) = tMax THEN {IF tEnded THEN GO TO EndFile; FillBuffer[]};
	    char ← tB[tI];
	    IF char = Ascii.NUL THEN GO TO EndFile};
	  Ascii.ControlZ =>		-- ↑Z is Bravo escape char
	    UNTIL char = Ascii.CR DO
	      IF (tI←tI+1) = tMax THEN {IF tEnded THEN GO TO EndFile; FillBuffer[]};
	      char ← tB[tI];
	      ENDLOOP;
	  ENDCASE => {
	    IF (tI←tI+1) = tMax THEN {IF tEnded THEN GO TO EndFile; FillBuffer[]};
	    char ← tB[tI]};
	ENDLOOP;
      index ← tOrigin + tI;  value ← P1.nullValue;
      SELECT char FROM

	'a, 'b, 'c, 'd, 'e, 'f, 'g, 'h, 'i, 'j, 'k, 'l, 'm,
	'n, 'o, 'p, 'q, 'r, 's, 't, 'u, 'v, 'w, 'x, 'y, 'z  => {
	  i: CARDINAL ← 0;
	  DO
	    buffer[i] ← char;
	    IF (tI←tI+1) = tMax THEN FillBuffer[];
	    char ← tB[tI];
	    SELECT char FROM
	      IN ['a..'z], IN ['A..'Z], IN ['0..'9] =>
		IF (i ← i+1) >= iMax THEN ExpandBuffer[];
	      ENDCASE => EXIT;
	    ENDLOOP;
	  desc.length ← i+1;
	  class ← tokenID;  value.r ← HashOps.EnterString[@desc];
	  GO TO GotNext};

	'A, 'B, 'C, 'D, 'E, 'F, 'G, 'H, 'I, 'J, 'K, 'L, 'M,
	'N, 'O, 'P, 'Q, 'R, 'S, 'T, 'U, 'V, 'W, 'X, 'Y, 'Z  => {
	  i: CARDINAL ← 0;
	  uId: BOOL ← TRUE;
	  first, last: NAT ← char.ORD;
	  DO
	    buffer[i] ← char;
	    IF (tI←tI+1) = tMax THEN FillBuffer[];
	    char ← tB[tI];
	    SELECT char FROM
	      IN ['A..'Z] => {
		last ← char.ORD; IF (i ← i+1) >= iMax THEN ExpandBuffer[]};
	      IN ['a..'z], IN ['0..'9] => {
		uId ← FALSE; IF (i ← i+1) >= iMax THEN ExpandBuffer[]};
	      ENDCASE => EXIT;
	    ENDLOOP;
	  i ← i+1;
	  IF uId THEN {
	    h: HashIndex ← ((first*128-first) + last) MOD HashIndex.LAST + 1;
	    j, s1, s2: CARDINAL;
	    WHILE (j ← hashTab[h].symbol) # 0 DO
	      IF vocabIndex[j]-(s2←vocabIndex[j-1]) = i THEN
		FOR s1 IN [0 .. i) DO
		  IF buffer[s1] # vocab.text[s2] THEN EXIT;
		  s2 ← s2+1;
		  REPEAT
		    FINISHED => {class ← j; GO TO GotNext};
		  ENDLOOP;
	      IF (h ← hashTab[h].link) = 0 THEN EXIT;
	      ENDLOOP};
	  desc.length ← i;
	  class ← tokenID;  value.r ← HashOps.EnterString[@desc];
	  GO TO GotNext};

	',, ';, ':, '←, '#, '~, '+, '*, '/, '↑, '@, '!,
	'=, '.,
	'(, '), '[, '], '{, '}  => {
	  class ← scanTab[char]; GO TO GetNext};

	'" => {
	  i: CARDINAL ← 0;
	  valid: BOOL;
	  advance: BOOL ← TRUE;
	  DO 
	    IF advance THEN {
	      IF (tI←tI+1) = tMax THEN {IF tEnded THEN GO TO EOFEnd; FillBuffer[]};
	      char ← tB[tI]};
	    SELECT char FROM
	      '" => {
		IF (tI←tI+1) = tMax THEN FillBuffer[];
		char ← tB[tI];
		IF char # '" THEN GO TO QuoteEnd};
	      ENDCASE;
	    IF i >= iMax THEN ExpandBuffer[
	        ! BufferOverflow => {ScanError[string, index]; i ← 0; CONTINUE}];
	    [buffer[i], valid, advance] ← Escape[];  i ← i+1;
	    IF ~valid THEN ScanError[$escape, tOrigin + tI];
	    REPEAT
	      QuoteEnd => NULL;
	      EOFEnd => {ScanError[$string, index]; FillBuffer[]; char ← tB[tI]};
	    ENDLOOP;
	  desc.length ← i;
	  value.r ← HashOps.EnterString[@desc];
	  class ← tokenSTR; GO TO GotNext};

	'- => {
	  NextChar[];
	  IF char # '- THEN {
	    class ← scanTab['-];
	    IF class = 0 THEN ScanError[char, index-1];
	    GO TO GotNext};
	  char ← Ascii.NUL;
	  DO
	    pChar: CHAR ~ char;
	    IF (tI←tI+1) = tMax THEN {IF tEnded THEN GO TO EndFile;  FillBuffer[]};
	    char ← tB[tI];
	    SELECT char FROM
	      '- => IF pChar = '- THEN EXIT;
	      Ascii.CR => EXIT;
	      ENDCASE;
	    ENDLOOP;
	  NextChar[]};

	'< => {
	  NextChar[];
	  SELECT char FROM
	    '< => {
	      state: {plain, leftBrocket, rightBrocket} ← $plain;
	      nest: CARDINAL ← 1;
	      DO
	        IF (tI←tI+1) = tMax THEN {
		  IF tEnded THEN GO TO EndFile; FillBuffer[]};
	        char ← tB[tI];
	        SELECT char FROM
	          '> => SELECT state FROM
		    $plain, $leftBrocket => state ← $rightBrocket;
		    $rightBrocket => {
		      state ← $plain; nest ← nest - 1; IF nest = 0 THEN EXIT};
		    ENDCASE;
	          '< => SELECT state FROM
		    $plain, $rightBrocket => state ← $leftBrocket;
		    $leftBrocket => {state ← $plain; nest ← nest + 1};
		    ENDCASE;
	          ENDCASE => state ← $plain;
	        ENDLOOP;
	      NextChar[]};
	    ENDCASE => ScanError[$char, index]};
	
	ENDCASE => {
	  class ← scanTab[char];
	  IF class # 0 THEN GO TO GetNext;
	  NextChar[];
	  ScanError[$char, index]};

      REPEAT
	GetNext => {IF (tI←tI+1) = tMax THEN FillBuffer[]; char ← tB[tI]};
	GotNext => NULL;
	EndFile => {
	  class ← endMarker;  index ← tOrigin + (tI-1);  value ← P1.nullValue;
	  UNTIL tEnded DO FillBuffer[] ENDLOOP;		-- flush stream
	  FillBuffer[];  char ← tB[tI]};
      ENDLOOP;
    nTokens ← nTokens + 1;
    RETURN};


 -- numerical conversion

  Digit: ARRAY CHAR ['0..'9] OF [0..9] ~ [0,1,2,3,4,5,6,7,8,9];


 -- character and string constants
 
  escapeMark: CHAR ~ '\\;
  
  Escape: PROC RETURNS [c: CHAR, valid, advance: BOOL←TRUE] ~ {
    c ← char;
    IF c = escapeMark THEN {
      NextChar[];
      SELECT char FROM
	'n, 'N => c ← Ascii.CR;
	'r, 'R => c ← Ascii.CR;
	'l, 'L => c ← Ascii.LF;
	't, 'T => c ← Ascii.TAB;
	'b, 'B => c ← Ascii.BS;
	'f, 'F => c ← Ascii.FF;
	'', '", escapeMark => c ← char;
	IN ['0 .. '7] => {
          nc, v: CARDINAL ← 0;
	  DO
	    IF ~(char IN ['0..'7]) THEN {valid ← advance ← FALSE; EXIT};
	    v ← 8*v + Digit[char];
	    IF (nc ← nc+1) = 3 THEN EXIT;
	    NextChar[];
	    ENDLOOP;
	  IF v > 377b THEN {valid ← FALSE; v ← 0};
	  c ← VAL[v]};
        ENDCASE => valid ← advance ← FALSE};
    RETURN};
    
     
 -- initialization/finalization

  ScanInit: PUBLIC PROC [
      source: Stream.Handle,
      scratchZone: UNCOUNTED ZONE,
      logger: PROC [PROC [log: Stream.Handle]]] ~ {
    zone ← scratchZone;
    stream ← source;  Logger ← logger;
    IF buffer = NIL THEN buffer ← zone.NEW[StringBody[256]];
    iMax ← buffer.length ← buffer.maxlength;
    desc.base ← buffer;  desc.offset ← 0;
    streamOrigin ← FileStream.GetIndex[stream];
    tB ← zone.NEW[TextBuffer];
    tOrigin ← tLimit ← 0;  tMax ← 0;  tEnded ← FALSE;
    FillBuffer[];  char ← tB[tI];  qDot ← FALSE;
    nTokens ← nErrors ← 0};

  ScanReset: PUBLIC PROC RETURNS [NAT, NAT] ~ {
    stream ← NIL;  Logger ← NIL;
    zone.FREE[@tB];
    IF buffer # NIL THEN zone.FREE[@buffer];
    zone ← NIL;
    RETURN [nTokens, nErrors]};


 -- error handling

  StreamIndex: TYPE ~ FileStream.FileByteIndex;

  ResetScanIndex: PUBLIC PROC [index: CARDINAL] RETURNS [success: BOOL] ~ {
    IF ~(index IN [tOrigin .. tLimit)) THEN {
      page: CARDINAL ~ index/Environment.charsPerPage;
      tOrigin ← tLimit ← page*Environment.charsPerPage;
      tMax ← 0;  tEnded ← FALSE;
      FileStream.SetIndex[stream, streamOrigin + tOrigin];
      FillBuffer[]};
    tI ← index - tOrigin;
    IF tI >= tMax THEN FillBuffer[];  char ← tB[tI]; RETURN [TRUE]};


  ErrorCode: TYPE ~ {number, string, char, atom, escape};
  
  ScanError: PROC [code: ErrorCode, tokenIndex: CARDINAL] ~ {

    Inner: PROC [log: Stream.Handle] ~ {
      ErrorContext[log,
	SELECT code FROM
	  $number => "invalid number"L,
	  $string => "string unterminated or too long"L,
	  $char => "invalid character"L,
	  $atom => "invalid atom"L,
	  $escape => "invalid escape sequence"L,
	  ENDCASE => NIL,
	tokenIndex];
      CharIO.PutChar[log, '\n]};
    
    nErrors ← nErrors + 1;
    Logger[Inner]};


  ErrorContext: PUBLIC PROC [
      to: Stream.Handle, message: Strings.String, tokenIndex: CARDINAL] ~ {
    OPEN CharIO;
    saveIndex: StreamIndex ~ FileStream.GetIndex[stream];
    origin: StreamIndex ~ streamOrigin + tokenIndex;
    start, lineIndex: StreamIndex ← origin;
    char: CHAR;
    n: [1..100];
    FOR n IN [1..100] UNTIL lineIndex = 0 DO
      lineIndex ← lineIndex - 1;
      FileStream.SetIndex[stream, lineIndex];
      IF stream.GetChar[] = Ascii.CR THEN EXIT;
      start ← lineIndex;
      ENDLOOP;
    FileStream.SetIndex[stream, start];
    FOR n IN [1..100] UNTIL FileStream.EndOf[stream] DO
      char ← stream.GetChar[];
      SELECT char FROM
	Ascii.CR, Ascii.ControlZ => EXIT;
	ENDCASE => PutChar[to, char];
      ENDLOOP;
    CharIO.PutChar[to, Ascii.CR];
    FileStream.SetIndex[stream, start];
    UNTIL FileStream.GetIndex[stream] = origin OR FileStream.EndOf[stream] DO
      char ← stream.GetChar[];
      PutChar[to, IF char = Ascii.TAB THEN '\t ELSE ' ];
      ENDLOOP;
    PutString[to, "↑ "L];  PutString[to, message];  PutString[to, " ["L];
    PutNumber[to, tokenIndex, [base~10, zerofill~FALSE, unsigned~TRUE, columns~0]];
    PutChar[to, ']];   CharIO.PutChar[to, '\n];
    FileStream.SetIndex[stream, saveIndex]};

  }.