-- September 9, 1982 4:13 pm
-- Lexer.mesa

-- Simple lexer for alphanumeric identifiers, unsigned ints, unsigned reals without a
-- trailing exponent (E) part, single- and double-character operators, and quoted strings
-- with no funny business embedded in them.  No two-line strings, either.  Returns each
-- lexeme as a REF INT, REF REAL, ATOM, or ROPE.

-- end-of-file in mid-lexeme is an error: the last char in the stream should be of type blank.
-- You can guarantee this with IO.AppendStreams if you can't guarantee it any other way.

DIRECTORY Atom, Rope, IO;

Lexer: DEFINITIONS
= BEGIN

-- A Handle is a reference to the lexer state record (HandleRec).  Every procedure in
-- this interface either takes a Handle or returns one.
Handle: TYPE = REF HandleRec;

-- State of one lexer: the input stream, the outcome of the most recent call to Lex,
-- and the tables that govern character classification and two-character operators.
HandleRec: TYPE = RECORD
 [error: Rope.ROPE,	-- initially NIL, set to an error message on lexical error
  eof: BOOL,         -- initially FALSE, set to TRUE on end of stream
  a: REF ANY,       -- last lexeme produced by Lex; NIL after end of stream or an error
  buf: REF TEXT,   -- contains the string that lexed to a
  in: IO.STREAM,    -- source of characters to be broken into lexemes; not set by NewHandle
  type: ARRAY CHAR OF CharType,
  						-- lexical class of each character; see CharType and DefaultCharTypes
  opList: LIST OF OpRec] ; -- list of two-character lexemes; see AddOpPair

NewHandle: PROC RETURNS [h: Handle];
  -- Returns a handle h with all of its fields initialized except h.in.
  -- h.type is initialized by DefaultCharTypes below.
  -- To read lexemes: do h ← NewHandle[]; then h.in ← <some input source>; then
  -- call Lex repeatedly.

-- Lexical class of a character.  HandleRec.type records one CharType per CHAR;
-- DefaultCharTypes documents the default assignment of characters to classes.
CharType: TYPE = {letter, digit, blank, quote, op};

AddOpPair: PUBLIC PROC [h: Handle, c1, c2: CHAR];
  -- Makes the sequence c1c2 into a double-character lexeme for h; e.g. -> and ..
  -- Both c1 and c2 must be of type op (see CharType).
  -- NOTE(review): presumably the pair is recorded in h.opList; confirm in the implementation.

Lex: PUBLIC PROC[h: Handle];
  -- Reads from the stream h.in, with one of three outcomes:
  --   sets h.a to the next lexeme,
  --   OR sets h.eof to TRUE and h.a to NIL,
  --   OR sets h.error to a non-NIL message and h.a to NIL.
  -- In the last case (h.error # NIL), the characters removed
  -- from h.in are put back, so you can reread them.

-- The char types in h.type are set by the procedure DefaultCharTypes, declared below.
 
-- One entry of HandleRec.opList.  opname holds the two characters of a double-character
-- lexeme.  NOTE(review): op is presumably the ATOM that Lex yields for that pair;
-- confirm in the implementation.
OpRec: TYPE = RECORD [opname: RECORD [CHAR, CHAR], op: ATOM];

DefaultCharTypes: PUBLIC PROC[h: Handle];
-- Sets h.type as follows:
--   letters to letter; digits to digit; '" to quote;
--   cr, sp, lf, ff, nul, and tab to blank;
--   and all other characters to op.

END.