- - - - - PARSE/UNPARSE TABLES
NewSyntax:
PUBLIC
PROC
RETURNS [syntax: Syntax] =
BEGIN
syntax ← NEW [SyntaxRec ←
[maxAtomLength: 200,
maxStringLength: 200,
chType: ALL [op],
twoCharOps: NIL,
defaultAtomProps:
[prebreak: TRUE,
prearg: TRUE,
postbreak: TRUE,
postarg: TRUE,
alpha: TRUE],
defaultLitProps:
[prebreak: TRUE,
prearg: TRUE,
postbreak: TRUE,
postarg: TRUE]
]];
SetDefaultCharTypes[syntax]
END;
SetCharType:
PUBLIC
PROC[syntax: Syntax, from, to:
CHAR, chType: CharType] =
BEGIN
FOR c: CHAR IN [from..to] DO syntax.chType[c] ← chType ENDLOOP
END;
SetDefaultCharTypes:
PUBLIC
PROC[syntax: Syntax] =
BEGIN OPEN syntax;
ops: Rope.ROPE = "+-*/|\\()[]{}<>=~@#$%&^←:;,.!?";
FOR c: CHAR IN [0C .. 255C] DO chType[c] ← other ENDLOOP;
FOR c: CHAR IN ['a .. 'z] DO chType[c] ← letter ENDLOOP;
FOR c: CHAR IN ['A .. 'Z] DO chType[c] ← letter ENDLOOP;
FOR c: CHAR IN ['0 .. '9] DO chType[c] ← digit ENDLOOP;
FOR i: INT IN [0..ops.Length[]) DO chType[ops.Fetch[i]] ← op ENDLOOP;
chType['"] ← quote;
chType[IO.CR] ← blank;
chType[IO.SP] ← blank;
chType[IO.LF] ← blank;
chType[IO.FF] ← blank;
chType[IO.NUL] ← blank;
chType[IO.TAB] ← blank
END;
EnterTwoCharOp:
PUBLIC
PROC [syntax: Syntax, c1, c2:
CHAR] =
BEGIN
atom: ATOM = Atom.MakeAtom[Rope.Cat[Rope.FromChar[c1], Rope.FromChar[c2]]];
syntax.twoCharOps ← CONS[[chars: [c1, c2], atom: atom], syntax.twoCharOps]
END;
EnterAlias:
PUBLIC
ENTRY PROC [syntax: Syntax, alias, standard:
ATOM] =
BEGIN
Atom.PutProp[alias, syntax, standard]
END;
EnterAtomProps:
PUBLIC
ENTRY
PROC [syntax: Syntax, name:
ATOM, props: Props] =
BEGIN
-- openfix operators must be just prebreak
IF props.openfix AND
(props.closefix OR props.prefix OR props.prearg OR NOT props.prebreak) THEN ERROR;
-- closefix operators must be just postbreak
IF props.closefix AND
(props.openfix OR props.postfix OR props.postarg OR NOT props.postbreak) THEN ERROR;
-- The next test prevents ambiguous unparsing:
IF props.prefix AND (props.prearg OR props.prebreak) AND
props.postfix AND (props.postarg OR props.postbreak) THEN ERROR;
Atom.PutProp[name, syntax, NEW [Props ← props]]
END;
- - - - - LEXER
Simple lexer for alpha-numeric identifiers, unsigned ints, unsigned reals w/o trailing E's, single and double-character operators, and quoted strings with no funny business embedded in them. No two-line strings, either.
Lexeme: TYPE = Se; -- Lexical item (REF INT, REF REAL, ATOM, or ROPE)
Lexer: TYPE = REF LexerRec;
Stream: TYPE = RECORD [pos: INT, rope: ROPE, len: INT]; -- fake stream
LexerRec: TYPE = RECORD
[syntax: Syntax, -- Syntax tables
stream: Stream, -- Source of characters to be broken into lexemes.
lex: Lexeme, -- current lexeme, the result of the most recent call to Lex
eof: BOOL, -- set by Lex when lexeme was not found.
buf: ROPE, -- contains the substring that lexed to lex
error: Rope.ROPE] ; -- initially NIL, set to error message on lexical error
NewLexer:
PROC [syntax: Syntax]
RETURNS [lexer: Lexer] =
-- creates a new
Lexer with empty input stream (eof = TRUE)
-- All of
lexer's fields are initialized to default values.
-- The
lexer.
chType is initialized by SetDefaultCharTypes below.
-- To use the
lexer,
-- 1) set up
lexer.
chType, and call
EnterTwoCharOp,
EnterAlias
, and
SetMaxLengths
-- as needed, to define the lexeme syntax
-- 2) attach a stream to the
lexer by using
StartLexer
-- 3) execute
WHILE NOT lexer.eof
DO
<use lexer.lex
>; Lex[lexer]
ENDLOOP.
-- The same lexer can be reused for as many streams as needed; the
-- lexical tables and aliases need to be set up just once.
BEGIN
lexer ← NEW [LexerRec ←
[syntax: syntax,
stream: [0, NIL, 0],
lex: NIL,
eof: TRUE,
buf: NIL,
error: NIL]]
END;
StartLexer:
PROC [lexer: Lexer, rope:
ROPE] =
-- Attaches the specified
stream to the
lexer, and puts the first lexeme into lexer.lex
-- (lexer.eof and lexer.error have same meaning as for Lex[]).
-- The lexeme syntax and aliases must have been set up before calling
StartLexer
.
-- Further lexemes are obtained by calling Lex[lexer] (see below)
BEGIN
lexer.stream ← [0, rope, rope.Length[]];
lexer.eof ← FALSE;
Lex[lexer]
END;
Lex:
PUBLIC
PROC[lexer: Lexer] =
-- Skips any blanks from stream lexer.stream, and parses the longest prefix
-- of the same that is a valid lexeme, returning it in lexer.lex
-- If the stream is exausted or contains only blanks, then
-- returns lexer.lex = NIL, lexer.eof = TRUE.
-- If the stream is not exhausted, but no prefix of it is a valid lexeme,
-- returns lexer.lex = NIL, lexer.error = some non-nil message. In
-- this case, the characters removed from lexer.stream are put back, so
-- you can reread them.
BEGIN
OPEN lexer;
EndOfStream:
PROC
RETURNS [
BOOL] =
INLINE
-- returns true if next char in lexer.stream is c
BEGIN
RETURN [stream.pos >= stream.len]
END;
CurChar:
PROC
RETURNS [c:
CHAR] =
INLINE
-- returns next char in lexer.stream
BEGIN
RETURN [stream.rope.Fetch[stream.pos]]
END;
SeeChar:
PROC [c:
CHAR]
RETURNS [
BOOL] =
INLINE
-- returns true if next char in lexer.stream is c
BEGIN
RETURN [NOT EndOfStream[] AND CurChar[] = c]
END;
SeeType:
PROC [t: CharType]
RETURNS [
BOOL] =
INLINE
-- returns true if next char in lexer.stream is of type t
BEGIN
RETURN [NOT EndOfStream[] AND syntax.chType[CurChar[]] = t]
END;
Skip:
PROC =
INLINE
-- advances lexer.stream.
BEGIN
stream.pos ← stream.pos+1
END;
first: INT; -- position of first char of current lexeme in lexer.stream
NextID:
PROC =
BEGIN
Skip[];
WHILE SeeType[letter] OR SeeType[digit] DO
IF stream.pos-first >= syntax.maxAtomLength THEN
{error ← "identifier too long"; RETURN};
Skip[]
ENDLOOP;
lex ← MapAlias
[syntax, Atom.MakeAtom[Rope.Substr[stream.rope, first, stream.pos-first]]];
END;
NextNumber:
PROC =
BEGIN
d: INT;
n: INT ← CurChar[] - '0; Skip[];
WHILE SeeType[digit] DO
d ← CurChar[] - '0;
IF n > (LAST[INT] - d)/10 THEN
{error ← "number too big"; RETURN};
n ← n * 10 + d;
Skip[]
ENDLOOP;
IF SeeChar['.] THEN -- get decimal part
{x: REAL ← n;
y: REAL ← 0.1;
Skip[];
WHILE SeeType[digit] DO
x ← x + y * (CurChar[] - '0);
y ← y/10;
Skip[]
ENDLOOP;
lex ← NEW[REAL ← x]}
ELSE
{lex ← NEW[INT ← n]}
END;
NextOp:
PROC =
BEGIN
c1: CHAR ← CurChar[]; Skip[];
-- Check if it is a two-character op
IF SeeType[op] THEN
{c2: CHAR = CurChar[];
FOR ops: LIST OF TwoCharOp ← syntax.twoCharOps, ops.rest UNTIL ops = NIL DO
IF ops.first.chars = [c1, c2] THEN
{lex ← MapAlias[syntax, ops.first.atom]; Skip[]; RETURN}
ENDLOOP};
-- Nope - single char op
lex ← MapAlias[syntax, Atom.MakeAtomFromChar[c1]]
END;
NextString:
PROC =
BEGIN
delim: CHAR ← CurChar[] ; Skip[];
WHILE NOT EndOfStream[] AND CurChar[] # delim DO
IF CurChar[] = IO.CR OR CurChar[] = IO.LF
OR SeeType[invalid] THEN
{error ← "invalid character in string"; RETURN};
IF stream.pos-first >= syntax.maxStringLength THEN
{error ← "string too long"; RETURN};
Skip[]
ENDLOOP;
IF NOT SeeChar[delim] THEN
{error ← "runaway string"; RETURN};
lex ← Rope.Substr[stream.rope, first+1, stream.pos-first-1];
Skip[]
END;
buf ← NIL;
error ← NIL;
lex ← NIL;
-- Skip blanks
WHILE SeeType[blank] DO stream.pos←stream.pos+1 ENDLOOP;
-- Decide type of next token
IF EndOfStream[] THEN
{lexer.eof ← TRUE; RETURN};
first ← stream.pos;
SELECT syntax.chType[CurChar[]] FROM
letter => NextID[];
digit => NextNumber[];
op => NextOp[];
quote => NextString[];
ENDCASE => {error ← "invalid character"};
IF error # NIL THEN -- backup stream to beginning of parsed string
{stream.pos ← first}
ELSE
{buf ← Rope.Substr[stream.rope, first, stream.pos-first]}
END;
MapAlias:
PUBLIC ENTRY PROC [syntax: Syntax, atom:
ATOM]
RETURNS [standard:
ATOM] =
BEGIN
pref: REF ← Atom.GetProp[atom, syntax];
RETURN [IF pref = NIL THEN atom ELSE NARROW[pref]]
END;
- - - - - PARSER
Simple operator precedence parser for operators of the following binding types: infix, prefix, postfix, selfix (i.e., variables or literals), nullfix ('noise' lexemes to be ignored) openfix (like "begin" or left parenthesis), and closefix (like "end" or right parenthesis).
Parser: TYPE = REF ParserRec;
ParserRec: TYPE = RECORD
[syntax: Syntax,
lexer: Lexer, -- Source of lexemes to be parsed.
lexProps: Props, -- of lexer.lex (if lexer.eof or lexer.error then [postbreak: TRUE].)
tree: Se, -- contains last thing produced by Parse.
eof: BOOL, -- set to TRUE when there is nothing more to parse.
error: Rope.ROPE -- set to error message on lexical error
];
NewParser:
PROC [syntax: Syntax]
RETURNS [parser: Parser] =
-- Creates a new
Parser
with empty lexeme stream (eof = TRUE)
-- All of parser
's fields are initialized to default values.
-- To use the
lexer,
-- 1) call
EnterAtomProps
as needed, to define operator types and precedences
-- 2) set up a
Lexer, attach some character stream to it (using
StartLexer),
-- and attach that
Lexer to the
parser by using
StartParser
-- 3) execute
WHILE NOT parser.eof
DO
<use parser.tree
>;
Parse[parser]
ENDLOOP.
-- The same parser
can be reused for as many lexeme streams as needed; the
-- operator properties need to be defined just once.
BEGIN
parser ← NEW[ParserRec ←
[syntax: syntax,
lexer: NIL,
lexProps: [postbreak: TRUE],
tree: NIL,
eof: TRUE,
error: NIL
]]
END;
GetProps:
PUBLIC PROC [lex: Se, syntax: Syntax]
RETURNS [Props] =
-- returns properties of atom or literal (explicitly defined or default
BEGIN
IF ISTYPE [lex, ATOM] THEN
{r: REF ANY ← Atom.GetProp[NARROW[lex], syntax];
RETURN [IF r # NIL THEN NARROW[r, REF Props]^ ELSE syntax.defaultAtomProps]}
ELSE
RETURN [syntax.defaultLitProps]
END;
GetLexProps:
PROC [parser: Parser] =
BEGIN
IF parser.lexer.eof OR parser.lexer.error # NIL THEN
{parser.lexProps ← [postbreak: TRUE]}
ELSE
{parser.lexProps ← GetProps[parser.lexer.lex, parser.syntax]}
END;
OpenSe: TYPE = LIST OF Se; -- last cell of an incomplete tree node; cdr points to root
PrefixRec: TYPE = RECORD
[op: OpenSe, -- prefix (or infix with left argument in place)
power: BindingPower -- binding power
];
StackPtr: TYPE = LIST OF PrefixRec;
stackAvail: StackPtr ← NIL;
Push:
ENTRY
PROC [data: PrefixRec, stack: StackPtr]
RETURNS [new: StackPtr] =
BEGIN
IF stackAvail = NIL THEN
{new ← CONS[data, stack]}
ELSE
{new ← stackAvail; stackAvail ← stackAvail.rest;
new.rest ← stack; new.first ← data}
END;
Pop:
ENTRY
PROC [stack: StackPtr]
RETURNS [data: PrefixRec, new: StackPtr] =
BEGIN
data ← stack.first; new ← stack.rest;
stack.rest ← stackAvail; stackAvail ← stack
END;
AddOperand:
PROC [op: OpenSe, arg: Se]
RETURNS [new: OpenSe] =
INLINE
{new ← CONS [arg, op.rest]; op.rest ← new};
CloseSe:
PROC [op: OpenSe]
RETURNS [expr: Se] =
INLINE
{expr ← op.rest; op.rest ← NIL};
ApplySe:
PROC [op1: OpenSe, arg: Se]
RETURNS [new: Se] =
INLINE
{new ← CloseSe[AddOperand [op1, arg]]};
ComposeSe:
PROC [op1, op2: OpenSe]
RETURNS [new: OpenSe] =
INLINE
{new ← AddOperand [op1, op2];
op2.rest ← op1.rest; op1.rest ← NIL; new ← op2.rest};
missing: PUBLIC Se ← Atom.MakeAtom["<<MISSING>>"];
Parse:
PUBLIC
PROC [rope:
ROPE, syntax: Syntax]
RETURNS [expr: Se, error:
ROPE, rest:
ROPE]=
BEGIN
lexer: Lexer ← NewLexer [syntax];
parser: Parser ← NewParser[syntax];
DoParse:
PROC =
-- Parses lexeme stream until next break, returns result in parser.tree as
-- closed expression.
-- In case of parsing error, sets parser.error # NIL. The parser.tree
-- will be the parsed piece (as a closed Se); it may be NIL, or some operators
-- in it may have NILs as operands.
BEGIN
Bond: TYPE = RECORD -- bond between two atomic sub-expressions A, B
[type: BondType, power: BindingPower ← 0];
BondType: TYPE = {pre, post, break};
-- pre: A applies to B,
-- post: B applies to A,
-- break: neither (expression being parsed ends after A)
expr: Se ← NIL; -- Last atomic sub-expr (lexeme or bracketed expression) parsed.
openfix: BOOL ← FALSE; -- set by GetNextLexeme if expr is openfix
bond: Bond ← [break, 0]; -- bond between expr and next atomic sub-expression.
GetNextLexeme:
PROC =
BEGIN
-- Sets expr ← current lexeme, and gets the next one into lexer.lex
-- Sets bond ← bond between expr and next sub-expression.
-- In case of error, returns either
-- expr = <missing>, bond = break,
-- or expr = lexeme, bond = pre (if prefix) or break (if prearg or prebreak),
-- to minimize unparsing chaos.
exprProps: Props;
tryPre, tryPost, tryBreak: BOOL ← FALSE;
-- Assert: NOT lexer.eof AND not lex is a closefix
IF parser.lexer.eof OR parser.lexProps.closefix THEN ERROR;
openfix ← FALSE;
IF parser.lexer.error # NIL THEN
{parser.error ← Rope.Cat["Lexer error: ", parser.lexer.error]};
-- We may have decided for a break without examining lexProps.
-- Finish checking if it is compatible with current bond.
IF bond.type = break AND NOT parser.lexProps.postbreak THEN
{parser.error ← "an expression cannot start with the next symbol"};
IF parser.error # NIL THEN
{expr ← missing; bond ← [break, 0];
RETURN};
expr ← parser.lexer.lex; openfix ← parser.lexProps.openfix;
-- Save "pre" properties of expr
exprProps ← parser.lexProps;
Lex[parser.lexer]; GetLexProps[parser];
-- determine bond between expr and lex, based on exprProps and lexProps
-- Assume lexProps is defined even if lexer.eof or lexer.error (for next lexeme)
tryBreak ← exprProps.prebreak; -- next call (if any) checks lexProps.postbreak
tryPre ← exprProps.prefix AND parser.lexProps.postarg;
tryPost ← exprProps.prearg AND parser.lexProps.postfix;
IF tryPre AND tryPost THEN
{parser.error ← "operator type ambiguity";
bond ← [pre, exprProps.prepower]} -- for unparser's sake
ELSE IF tryPre THEN
{bond ← [pre, exprProps.prepower]}
ELSE IF tryPost THEN
{bond ← [post, parser.lexProps.postpower]}
ELSE IF tryBreak THEN
{} -- bond ← [break, 0]
ELSE
{parser.error ← "argument or postfix operator expected";
IF exprProps.prefix AND NOT (exprProps.prearg OR exprProps.prebreak) THEN
{bond ← [pre, exprProps.prepower]} -- for unparser's sake
}; -- bond ← [break, 0]
END;
GetNextAtomicExpr:
PROC =
BEGIN
-- Sets expr ← next atomic expr (lexeme or bracketed expression) as normal Se
-- Sets bond ← bond between expr and next sub-expression.
-- In case of error, returns either
-- expr = <missing>, bond = break,
-- or expr = lexeme, bond = pre (if prefix) or break (if prearg or prebreak),
-- or expr = ( <openfix> <arg> ... <arg> <arg*> <missing>), bond = break
-- where <arg*> is of one of the abovedescribed forms. This is
-- to minimize unparsing chaos.
-- Assert: NOT lexer.eof AND not lex is a closefix
GetNextLexeme[];
--check for openfix
IF openfix
THEN
-- collect expressions until matching closefix
BEGIN
open: Lexeme ← expr;
temp: OpenSe ← LIST[expr]; temp.rest ← temp;
UNTIL parser.lexer.eof OR parser.lexProps.closefix OR parser.error # NIL DO
DoParse[]; -- this checks also whether lexer.error#NIL
temp ← AddOperand[temp, parser.tree]; -- even if parse.error
ENDLOOP;
-- check for closefix
IF lexer.eof OR
NOT (parser.lexProps.closefix AND parser.lexProps.matches # open) THEN
{IF parser.error = NIL THEN
{parser.error ← "missing matching delimiter"};
-- flag missing closefix (for unparser's sake)
temp ← AddOperand[temp, missing]};
GetNextLexeme[];
expr ← CloseSe[temp];
IF parser.error # NIL THEN RETURN; -- with bond = break
END;
END;
stack: StackPtr ← NIL; -- stack of incomplete formulas
op: LIST OF Se ← NIL;
GetNextAtomicExpr[];
UNTIL bond.type = break
DO
-- parse loop
-- note: parse.error may be set here, but in that case bond#post
IF bond.type = pre THEN -- make into open Se
{op ← LIST[expr];
op.rest ← op}
ELSE -- collect all postfix/infix operators and make into close/open Se
BEGIN
WHILE bond.type = post DO
WHILE stack # NIL AND stack.first.power > bond.power DO
expr ← ApplySe [stack.first.op, expr];
stack ← Pop[stack].new
ENDLOOP;
IF stack# NIL AND stack.first.power = bond.power THEN
{parser.error ← "ambiguous binding";
bond ← [break, 0]}
ELSE
{arg: Se ← expr;
GetNextAtomicExpr[]; -- may turn on error
op ← LIST [expr, arg];
IF bond.type = pre THEN -- expr was infix operator, treat op as prefix
{op.rest.rest ← op;
op ← op.rest}
ELSE -- expr was pure postfix operator
{expr ← op}}
ENDLOOP
END;
-- Now either bond = pre and op is the open expr. for the prefix, or
-- bond = break and expr is the closed form of the argument.
IF bond.type = pre THEN
{WHILE stack#NIL AND stack.first.power >= bond.power DO
op ← ComposeSe[stack.first.op, op];
stack ← Pop[stack].new
ENDLOOP;
stack ← Push[[op: op, power: bond.power], stack];
IF NOT lexer.eof AND parser.error = NIL THEN
{GetNextAtomicExpr[]}}
ELSE
{WHILE stack#NIL DO
expr ← ApplySe[stack.first.op, expr];
stack ← Pop[stack].new
ENDLOOP}
ENDLOOP;
parser.tree ← expr
END;
StartLexer[lexer, rope];
parser.lexer ← lexer;
GetLexProps[parser];
parser.eof ← FALSE;
parser.error ← NIL;
IF lexer.eof THEN
{parser.eof ← TRUE; parser.tree ← missing}
ELSE IF parser.lexProps.closefix THEN
{parser.error ← "expression begins with closing delimiter";
parser.tree ← missing}
ELSE
{DoParse[]};
RETURN
[parser.tree,
parser.error,
Rope.Cat[lexer.buf, Rope.Substr[lexer.stream.rope, lexer.stream.pos]]]
END;
- - - - SYNTAX CHECKING
Or:
PUBLIC
PROC [aw1, aw2: VerdictAndCulprit]
RETURNS [r: VerdictAndCulprit] =
BEGIN
SELECT TRUE FROM
aw1.verdict = Yes OR aw2.verdict = Yes => r ← [Yes, NIL];
aw1.verdict = No => r ← aw1;
aw2.verdict = No => r ← aw2;
aw1.verdict = OfCourseNot AND aw2.verdict = OfCourseNot
=> r ← aw1
ENDCASE => ERROR
END;
HasForm:
PUBLIC
PROC[expr: Se, op:
ATOM, Arg1, Arg2: SyntacticPredicate ←
NIL]
RETURNS [VerdictAndCulprit] =
BEGIN
WITH Alg.Car[NARROW[expr]] SELECT FROM
g: LIST OF REF ANY =>
{IF Alg.Car[g] # op THEN RETURN [[OfCourseNot, g]];
IF (Arg2 = NIL) # (Alg.Cdr[Alg.Cdr[g]] = NIL)
THEN RETURN[[OfCourseNot, expr]];
{aw: VerdictAndCulprit ← Arg1[Alg.Cdr[g]];
IF aw.verdict # Yes THEN RETURN [[No, aw.culprit]];
IF Alg.Cdr[Alg.Cdr[g]] = NIL THEN RETURN [[Yes, NIL]];
aw ← Arg2[Alg.Cdr[Alg.Cdr[g]]];
IF aw.verdict # Yes THEN RETURN [[No, aw.culprit]];
RETURN [[Yes, NIL]]}}
ENDCASE => RETURN [[OfCourseNot, expr]]
END;