HerculesParserImpl.mesa (was LexerImpl + ParseTableImpl + ParserImpl)

Last Edited by: Stolfi, February 22, 1984 8:29 am

Copied from ParserImpl.mesa
Coded by GNelson (?), October 18, 1982 10:05 pm (was ParserImpl.mesa)
Last Edited by: GNelson, January 10, 1983 5:53 pm

Merged with Parser.mesa
Coded by GNelson (?), September 9, 1982 5:57 pm
Last Edited by: GNelson, January 10, 1983 5:50 pm

Merged with ParseTableImpl.mesa
Last Edited by GNelson, March 1, 1983 6:11 pm (was ParseTableImpl.mesa)

Merged with ParseTable.mesa
Coded by GNelson (?) September 6, 1982 12:25 am
Last Edited by: GNelson, August 24, 1983 10:30 pm

Merged with LexerImpl.mesa
Coded by GNelson (?), December 7, 1982 3:26 pm

Merged with Lexer.mesa
Coded by by GNelson (?), September 9, 1982 4:13 pm

DIRECTORY
HerculesAlgebra USING [Se, Car, Cdr],
HerculesParseUnparse,
Atom USING [MakeAtom, PutProp, GetProp, MakeAtomFromChar],
IO USING [CR, LF, SP, NUL, FF, TAB],
Rope USING [ROPE, Cat, Fetch, Substr, FromChar, Length];
HerculesParserImpl: MONITOR
IMPORTS
Rope, Atom, HerculesAlgebra
EXPORTS
HerculesParseUnparse =
BEGIN OPEN HerculesParseUnparse, Alg: HerculesAlgebra;
ROPE: TYPE = Rope.ROPE;

- - - - - PARSE/UNPARSE TABLES

NewSyntax: PUBLIC PROC RETURNS [syntax: Syntax] =

BEGIN
syntax ← NEW [SyntaxRec ←
[maxAtomLength: 200,
maxStringLength: 200,
chType: ALL [op],
twoCharOps: NIL,

defaultAtomProps:
[prebreak: TRUE,
prearg: TRUE,
postbreak: TRUE,
postarg: TRUE,
alpha: TRUE],

defaultLitProps:
[prebreak: TRUE,
prearg: TRUE,
postbreak: TRUE,
postarg: TRUE]
]];

SetDefaultCharTypes[syntax]
END;

SetCharType: PUBLIC PROC[syntax: Syntax, from, to: CHAR, chType: CharType] =

BEGIN
FOR c: CHAR IN [from..to] DO syntax.chType[c] ← chType ENDLOOP
END;

SetDefaultCharTypes: PUBLIC PROC[syntax: Syntax] =

BEGIN OPEN syntax;
ops: Rope.ROPE = "+-*/|\\()[]{}<>=~@#$%&^←:;,.!?";
FOR c: CHAR IN [0C .. 255C] DO chType[c] ← other ENDLOOP;
FOR c: CHAR IN ['a .. 'z] DO chType[c] ← letter ENDLOOP;
FOR c: CHAR IN ['A .. 'Z] DO chType[c] ← letter ENDLOOP;
FOR c: CHAR IN ['0 .. '9] DO chType[c] ← digit ENDLOOP;
FOR i: INT IN [0..ops.Length[]) DO chType[ops.Fetch[i]] ← op ENDLOOP;
chType['"] ← quote;
chType[IO.CR] ← blank;
chType[IO.SP] ← blank;
chType[IO.LF] ← blank;
chType[IO.FF] ← blank;
chType[IO.NUL] ← blank;
chType[IO.TAB] ← blank
END;

EnterTwoCharOp: PUBLIC PROC [syntax: Syntax, c1, c2: CHAR] =

BEGIN
atom: ATOM = Atom.MakeAtom[Rope.Cat[Rope.FromChar[c1], Rope.FromChar[c2]]];
syntax.twoCharOps ← CONS[[chars: [c1, c2], atom: atom], syntax.twoCharOps]
END;

EnterAlias: PUBLIC ENTRY PROC [syntax: Syntax, alias, standard: ATOM] =

BEGIN
Atom.PutProp[alias, syntax, standard]
END;

EnterAtomProps: PUBLIC ENTRY PROC [syntax: Syntax, name: ATOM, props: Props] =

BEGIN

-- openfix operators must be just prebreak
IF props.openfix AND
(props.closefix OR props.prefix OR props.prearg OR NOT props.prebreak) THEN ERROR;

-- closefix operators must be just postbreak
IF props.closefix AND
(props.openfix OR props.postfix OR props.postarg OR NOT props.postbreak) THEN ERROR;

-- The next test prevents ambiguous unparsing:
IF props.prefix AND (props.prearg OR props.prebreak) AND
props.postfix AND (props.postarg OR props.postbreak) THEN ERROR;

Atom.PutProp[name, syntax, NEW [Props ← props]]
END;

- - - - - LEXER

Simple lexer for alpha-numeric identifiers, unsigned ints, unsigned reals w/o trailing E's, single and double-character operators, and quoted strings with no funny business embedded in them. No two-line strings, either.

Lexeme: TYPE = Se; -- Lexical item (REF INT, REF REAL, ATOM, or ROPE)

Lexer: TYPE = REF LexerRec;

Stream: TYPE = RECORD [pos: INT, rope: ROPE, len: INT]; -- fake stream

LexerRec: TYPE = RECORD
[syntax: Syntax, -- Syntax tables
stream: Stream, -- Source of characters to be broken into lexemes.

lex: Lexeme, -- current lexeme, the result of the most recent call to Lex
eof: BOOL, -- set by Lex when lexeme was not found.
buf: ROPE, -- contains the substring that lexed to lex
error: Rope.ROPE] ; -- initially NIL, set to error message on lexical error

NewLexer: PROC [syntax: Syntax] RETURNS [lexer: Lexer] =
-- creates a new Lexer with empty input stream (eof = TRUE)
-- All of lexer's fields are initialized to default values.
-- The lexer.chType is initialized by SetDefaultCharTypes below.
-- To use the lexer,
-- 1) set up lexer.chType, and call
EnterTwoCharOp, EnterAlias, and SetMaxLengths
-- as needed, to define the lexeme syntax
-- 2) attach a stream to the lexer by using
StartLexer
-- 3) execute
WHILE NOT lexer.eof DO <use lexer.lex>; Lex[lexer] ENDLOOP.
-- The same lexer can be reused for as many streams as needed; the
-- lexical tables and aliases need to be set up just once.

BEGIN
lexer ← NEW [LexerRec ←
[syntax: syntax,
stream: [0, NIL, 0],
lex: NIL,
eof: TRUE,
buf: NIL,
error: NIL]]
END;

StartLexer: PROC [lexer: Lexer, rope: ROPE] =
-- Attaches the specified stream to the lexer, and puts the first lexeme into lexer.lex
-- (lexer.eof and lexer.error have same meaning as for Lex[]).
-- The lexeme syntax and aliases must have been set up before calling StartLexer.
-- Further lexemes are obtained by calling Lex[lexer] (see below)

BEGIN
lexer.stream ← [0, rope, rope.Length[]];
lexer.eof ← FALSE;
Lex[lexer]
END;

Lex: PUBLIC PROC[lexer: Lexer] =
-- Skips any blanks from stream lexer.stream, and parses the longest prefix
-- of the same that is a valid lexeme, returning it in lexer.lex
-- If the stream is exausted or contains only blanks, then
-- returns lexer.lex = NIL, lexer.eof = TRUE.
-- If the stream is not exhausted, but no prefix of it is a valid lexeme,
-- returns lexer.lex = NIL, lexer.error = some non-nil message. In
-- this case, the characters removed from lexer.stream are put back, so
-- you can reread them.

BEGIN OPEN lexer;

EndOfStream: PROC RETURNS [BOOL] = INLINE
-- returns true if next char in lexer.stream is c

BEGIN
RETURN [stream.pos >= stream.len]
END;

CurChar: PROC RETURNS [c: CHAR] = INLINE
-- returns next char in lexer.stream

BEGIN
RETURN [stream.rope.Fetch[stream.pos]]
END;

SeeChar: PROC [c: CHAR] RETURNS [BOOL] = INLINE
-- returns true if next char in lexer.stream is c

BEGIN
RETURN [NOT EndOfStream[] AND CurChar[] = c]
END;

SeeType: PROC [t: CharType] RETURNS [BOOL] = INLINE
-- returns true if next char in lexer.stream is of type t

BEGIN
RETURN [NOT EndOfStream[] AND syntax.chType[CurChar[]] = t]
END;

Skip: PROC = INLINE
-- advances lexer.stream.

BEGIN
stream.pos ← stream.pos+1
END;

first: INT; -- position of first char of current lexeme in lexer.stream

NextID: PROC =

BEGIN
Skip[];
WHILE SeeType[letter] OR SeeType[digit] DO
IF stream.pos-first >= syntax.maxAtomLength THEN
{error ← "identifier too long"; RETURN};
Skip[]
ENDLOOP;
lex ← MapAlias
[syntax, Atom.MakeAtom[Rope.Substr[stream.rope, first, stream.pos-first]]];
END;

NextNumber: PROC =

BEGIN
d: INT;
n: INT ← CurChar[] - '0; Skip[];
WHILE SeeType[digit] DO
d ← CurChar[] - '0;
IF n > (LAST[INT] - d)/10 THEN
{error ← "number too big"; RETURN};
n ← n * 10 + d;
Skip[]
ENDLOOP;
IF SeeChar['.] THEN -- get decimal part
{x: REAL ← n;
y: REAL ← 0.1;
Skip[];
WHILE SeeType[digit] DO
x ← x + y * (CurChar[] - '0);
y ← y/10;
Skip[]
ENDLOOP;
lex ← NEW[REAL ← x]}
ELSE
{lex ← NEW[INT ← n]}
END;

NextOp: PROC =

BEGIN
c1: CHAR ← CurChar[]; Skip[];
-- Check if it is a two-character op
IF SeeType[op] THEN
{c2: CHAR = CurChar[];
FOR ops: LIST OF TwoCharOp ← syntax.twoCharOps, ops.rest UNTIL ops = NIL DO
IF ops.first.chars = [c1, c2] THEN
{lex ← MapAlias[syntax, ops.first.atom]; Skip[]; RETURN}
ENDLOOP};
-- Nope - single char op
lex ← MapAlias[syntax, Atom.MakeAtomFromChar[c1]]
END;

NextString: PROC =

BEGIN
delim: CHAR ← CurChar[] ; Skip[];
WHILE NOT EndOfStream[] AND CurChar[] # delim DO
IF CurChar[] = IO.CR OR CurChar[] = IO.LF
OR SeeType[invalid] THEN
{error ← "invalid character in string"; RETURN};
IF stream.pos-first >= syntax.maxStringLength THEN
{error ← "string too long"; RETURN};
Skip[]
ENDLOOP;
IF NOT SeeChar[delim] THEN
{error ← "runaway string"; RETURN};
lex ← Rope.Substr[stream.rope, first+1, stream.pos-first-1];
Skip[]
END;

buf ← NIL;
error ← NIL;
lex ← NIL;
-- Skip blanks
WHILE SeeType[blank] DO stream.pos←stream.pos+1 ENDLOOP;
-- Decide type of next token
IF EndOfStream[] THEN
{lexer.eof ← TRUE; RETURN};
first ← stream.pos;
SELECT syntax.chType[CurChar[]] FROM
letter => NextID[];
digit => NextNumber[];
op => NextOp[];
quote => NextString[];
ENDCASE => {error ← "invalid character"};
IF error # NIL THEN -- backup stream to beginning of parsed string
{stream.pos ← first}
ELSE
{buf ← Rope.Substr[stream.rope, first, stream.pos-first]}

END
;

MapAlias: PUBLIC ENTRY PROC [syntax: Syntax, atom: ATOM] RETURNS [standard: ATOM] =

BEGIN
pref: REF ← Atom.GetProp[atom, syntax];
RETURN [IF pref = NIL THEN atom ELSE NARROW[pref]]
END;

- - - - - PARSER

Simple operator precedence parser for operators of the following binding types: infix, prefix, postfix, selfix (i.e., variables or literals), nullfix ('noise' lexemes to be ignored) openfix (like "begin" or left parenthesis), and closefix (like "end" or right parenthesis).

Parser: TYPE = REF ParserRec;

ParserRec: TYPE = RECORD
[syntax: Syntax,
lexer: Lexer, -- Source of lexemes to be parsed.
lexProps: Props, -- of lexer.lex (if lexer.eof or lexer.error then [postbreak: TRUE].)

tree: Se, -- contains last thing produced by Parse.
eof: BOOL, -- set to TRUE when there is nothing more to parse.
error: Rope.ROPE -- set to error message on lexical error
];

NewParser: PROC [syntax: Syntax] RETURNS [parser: Parser] =
-- Creates a new Parser with empty lexeme stream (eof = TRUE)
-- All of parser's fields are initialized to default values.
-- To use the lexer,
-- 1) call
EnterAtomProps as needed, to define operator types and precedences
-- 2) set up a Lexer, attach some character stream to it (using StartLexer),
-- and attach that Lexer to the parser by using
StartParser
-- 3) execute
WHILE NOT parser.eof DO <use parser.tree>; Parse[parser] ENDLOOP.
-- The same parser can be reused for as many lexeme streams as needed; the
-- operator properties need to be defined just once.

BEGIN
parser ← NEW[ParserRec ←
[syntax: syntax,
lexer: NIL,
lexProps: [postbreak: TRUE],
tree: NIL,
eof: TRUE,
error: NIL
]]
END;

GetProps: PUBLIC PROC [lex: Se, syntax: Syntax] RETURNS [Props] =
-- returns properties of atom or literal (explicitly defined or default

BEGIN
IF ISTYPE [lex, ATOM] THEN
{r: REF ANY ← Atom.GetProp[NARROW[lex], syntax];
RETURN [IF r # NIL THEN NARROW[r, REF Props]^ ELSE syntax.defaultAtomProps]}
ELSE
RETURN [syntax.defaultLitProps]
END;

GetLexProps: PROC [parser: Parser] =

BEGIN
IF parser.lexer.eof OR parser.lexer.error # NIL THEN
{parser.lexProps ← [postbreak: TRUE]}
ELSE
{parser.lexProps ← GetProps[parser.lexer.lex, parser.syntax]}
END;

OpenSe: TYPE = LIST OF Se; -- last cell of an incomplete tree node; cdr points to root

PrefixRec: TYPE = RECORD
[op: OpenSe, -- prefix (or infix with left argument in place)
power: BindingPower -- binding power
];

StackPtr: TYPE = LIST OF PrefixRec;

stackAvail: StackPtr ← NIL;

Push: ENTRY PROC [data: PrefixRec, stack: StackPtr] RETURNS [new: StackPtr] =

BEGIN
IF stackAvail = NIL THEN
{new ← CONS[data, stack]}
ELSE
{new ← stackAvail; stackAvail ← stackAvail.rest;
new.rest ← stack; new.first ← data}
END;

Pop: ENTRY PROC [stack: StackPtr] RETURNS [data: PrefixRec, new: StackPtr] =

BEGIN
data ← stack.first; new ← stack.rest;
stack.rest ← stackAvail; stackAvail ← stack
END;

AddOperand: PROC [op: OpenSe, arg: Se] RETURNS [new: OpenSe] = INLINE

{new ← CONS [arg, op.rest]; op.rest ← new};

CloseSe: PROC [op: OpenSe] RETURNS [expr: Se] = INLINE

{expr ← op.rest; op.rest ← NIL};

ApplySe: PROC [op1: OpenSe, arg: Se] RETURNS [new: Se] = INLINE

{new ← CloseSe[AddOperand [op1, arg]]};

ComposeSe: PROC [op1, op2: OpenSe] RETURNS [new: OpenSe] = INLINE

{new ← AddOperand [op1, op2];
op2.rest ← op1.rest; op1.rest ← NIL; new ← op2.rest};

missing: PUBLIC Se ← Atom.MakeAtom["<<MISSING>>"];

Parse: PUBLIC PROC [rope: ROPE, syntax: Syntax]
RETURNS [expr: Se, error: ROPE, rest: ROPE]=

BEGIN

lexer: Lexer ← NewLexer [syntax];

parser: Parser ← NewParser[syntax];

DoParse
: PROC =
-- Parses lexeme stream until next break, returns result in parser.tree as
-- closed expression.
-- In case of parsing error, sets parser.error # NIL. The parser.tree
-- will be the parsed piece (as a closed Se); it may be NIL, or some operators
-- in it may have NILs as operands.

BEGIN

Bond: TYPE = RECORD -- bond between two atomic sub-expressions A, B
[type: BondType, power: BindingPower ← 0];

BondType: TYPE = {pre, post, break};
-- pre: A applies to B,
-- post: B applies to A,
-- break: neither (expression being parsed ends after A)

expr: Se ← NIL; -- Last atomic sub-expr (lexeme or bracketed expression) parsed.
openfix: BOOLFALSE; -- set by GetNextLexeme if expr is openfix
bond: Bond ← [break, 0]; -- bond between expr and next atomic sub-expression.  

GetNextLexeme: PROC =

BEGIN
-- Sets expr ← current lexeme, and gets the next one into lexer.lex
-- Sets bond ← bond between expr and next sub-expression.
-- In case of error, returns either
-- expr = <missing>, bond = break,
-- or expr = lexeme, bond = pre (if prefix) or break (if prearg or prebreak),
-- to minimize unparsing chaos.

exprProps: Props;
tryPre, tryPost, tryBreak: BOOLFALSE;

-- Assert: NOT lexer.eof AND not lex is a closefix
IF parser.lexer.eof OR parser.lexProps.closefix THEN ERROR;

openfix ← FALSE;
IF parser.lexer.error # NIL THEN
{parser.error ← Rope.Cat["Lexer error: ", parser.lexer.error]};
-- We may have decided for a break without examining lexProps.
-- Finish checking if it is compatible with current bond.

IF bond.type = break AND NOT parser.lexProps.postbreak THEN
{parser.error ← "an expression cannot start with the next symbol"};
IF parser.error # NIL THEN
{expr ← missing; bond ← [break, 0];
RETURN};
expr ← parser.lexer.lex; openfix ← parser.lexProps.openfix;
-- Save "pre" properties of expr
exprProps ← parser.lexProps;
Lex[parser.lexer]; GetLexProps[parser];

-- determine bond between expr and lex, based on exprProps and lexProps
-- Assume lexProps is defined even if lexer.eof or lexer.error (for next lexeme)
tryBreak ← exprProps.prebreak; -- next call (if any) checks lexProps.postbreak
tryPre ← exprProps.prefix AND parser.lexProps.postarg;
tryPost ← exprProps.prearg AND parser.lexProps.postfix;
IF tryPre AND tryPost THEN
{parser.error ← "operator type ambiguity";
bond ← [pre, exprProps.prepower]} -- for unparser's sake
ELSE IF tryPre THEN
{bond ← [pre, exprProps.prepower]}
ELSE IF tryPost THEN
{bond ← [post, parser.lexProps.postpower]}
ELSE IF tryBreak THEN
{} -- bond ← [break, 0]
ELSE
{parser.error ← "argument or postfix operator expected";
IF exprProps.prefix AND NOT (exprProps.prearg OR exprProps.prebreak) THEN
{bond ← [pre, exprProps.prepower]} -- for unparser's sake
}; -- bond ← [break, 0]
END;

GetNextAtomicExpr: PROC =

BEGIN
-- Sets expr ← next atomic expr (lexeme or bracketed expression) as normal Se
-- Sets bond ← bond between expr and next sub-expression.
-- In case of error, returns either
-- expr = <missing>, bond = break,
-- or expr = lexeme, bond = pre (if prefix) or break (if prearg or prebreak),
-- or expr = ( <openfix> <arg> ... <arg> <arg*> <missing>), bond = break
-- where <arg*> is of one of the abovedescribed forms. This is
-- to minimize unparsing chaos.

-- Assert: NOT lexer.eof AND not lex is a closefix
GetNextLexeme[];
--check for openfix
IF openfix THEN -- collect expressions until matching closefix
BEGIN

open: Lexeme ← expr;
temp: OpenSe ← LIST[expr]; temp.rest ← temp;
UNTIL parser.lexer.eof OR parser.lexProps.closefix OR parser.error # NIL DO
DoParse[]; -- this checks also whether lexer.error#NIL
temp ← AddOperand[temp, parser.tree]; -- even if parse.error
ENDLOOP;
-- check for closefix
IF lexer.eof OR
NOT (parser.lexProps.closefix AND parser.lexProps.matches # open) THEN
{IF parser.error = NIL THEN
{parser.error ← "missing matching delimiter"};
-- flag missing closefix (for unparser's sake)
temp ← AddOperand[temp, missing]};
GetNextLexeme[];
expr ← CloseSe[temp];
IF parser.error # NIL THEN RETURN; -- with bond = break

END;

END;

stack: StackPtr ← NIL; -- stack of incomplete formulas

op: LIST OF Se ← NIL;

GetNextAtomicExpr[];
UNTIL bond.type = break DO -- parse loop

-- note: parse.error may be set here, but in that case bond#post
IF bond.type = pre THEN -- make into open Se
{op ← LIST[expr];
op.rest ← op}
ELSE -- collect all postfix/infix operators and make into close/open Se
BEGIN
WHILE bond.type = post DO
WHILE stack # NIL AND stack.first.power > bond.power DO
expr ← ApplySe [stack.first.op, expr];
stack ← Pop[stack].new
ENDLOOP;
IF stack# NIL AND stack.first.power = bond.power THEN
{parser.error ← "ambiguous binding";
bond ← [break, 0]}
ELSE
{arg: Se ← expr;
GetNextAtomicExpr[]; -- may turn on error
op ← LIST [expr, arg];
IF bond.type = pre THEN -- expr was infix operator, treat op as prefix
{op.rest.rest ← op;
op ← op.rest}
ELSE -- expr was pure postfix operator
{expr ← op}}
ENDLOOP
END;
-- Now either bond = pre and op is the open expr. for the prefix, or
-- bond = break and expr is the closed form of the argument.
IF bond.type = pre THEN
{WHILE stack#NIL AND stack.first.power >= bond.power DO
op ← ComposeSe[stack.first.op, op];
stack ← Pop[stack].new
ENDLOOP;
stack ← Push[[op: op, power: bond.power], stack];
IF NOT lexer.eof AND parser.error = NIL THEN
{GetNextAtomicExpr[]}}
ELSE
{WHILE stack#NIL DO
expr ← ApplySe[stack.first.op, expr];
stack ← Pop[stack].new
ENDLOOP}

ENDLOOP;
parser.tree ← expr

END;

StartLexer[lexer, rope];
parser.lexer ← lexer;
GetLexProps[parser];
parser.eof ← FALSE;
parser.error ← NIL;
IF lexer.eof THEN
{parser.eof ← TRUE; parser.tree ← missing}
ELSE IF parser.lexProps.closefix THEN
{parser.error ← "expression begins with closing delimiter";
parser.tree ← missing}
ELSE
{DoParse[]};
RETURN
[parser.tree,
parser.error,
Rope.Cat[lexer.buf, Rope.Substr[lexer.stream.rope, lexer.stream.pos]]]
END;

- - - - SYNTAX CHECKING

Or: PUBLIC PROC [aw1, aw2: VerdictAndCulprit]
RETURNS [r: VerdictAndCulprit] =

BEGIN
SELECT TRUE FROM
aw1.verdict = Yes OR aw2.verdict = Yes => r ← [Yes, NIL];
aw1.verdict = No => r ← aw1;
aw2.verdict = No => r ← aw2;
aw1.verdict = OfCourseNot AND aw2.verdict = OfCourseNot
=> r ← aw1
ENDCASE => ERROR
END;

HasForm: PUBLIC PROC[expr: Se, op: ATOM, Arg1, Arg2: SyntacticPredicate ← NIL]
RETURNS [VerdictAndCulprit] =

BEGIN
WITH Alg.Car[NARROW[expr]] SELECT FROM
g: LIST OF REF ANY =>
{IF Alg.Car[g] # op THEN RETURN [[OfCourseNot, g]];
IF (Arg2 = NIL) # (Alg.Cdr[Alg.Cdr[g]] = NIL)
THEN RETURN[[OfCourseNot, expr]];
{aw: VerdictAndCulprit ← Arg1[Alg.Cdr[g]];
IF aw.verdict # Yes THEN RETURN [[No, aw.culprit]];
IF Alg.Cdr[Alg.Cdr[g]] = NIL THEN RETURN [[Yes, NIL]];
aw ← Arg2[Alg.Cdr[Alg.Cdr[g]]];
IF aw.verdict # Yes THEN RETURN [[No, aw.culprit]];
RETURN [[Yes, NIL]]}}
ENDCASE => RETURN [[OfCourseNot, expr]]
END;

END
.
Edited on February 21, 1984 1:35 am, by Stolfi
Parser rewritten, using flexible op properties.