GVMailParseImpl.mesa
Copyright © 1985 by Xerox Corporation. All rights reserved.
Based on ArpaMailParser.mesa by Brotz, March 6, 1983 3:29 PM
Last Edited by: Willie-sue, May 30, 1985 1:15:20 pm PDT
Doug Wyatt, March 7, 1985 11:07:34 am PST
DIRECTORY
GVMailParse USING [endOfInput, endOfList, ParseErrorCode],
IO,
Rope;
GVMailParseImpl: CEDAR PROGRAM
IMPORTS IO, Rope
EXPORTS GVMailParse
= BEGIN OPEN GVMailParse, Rope, IO;
Types
Some special characters specified by RFC 822
openComment: CHAR = '(;
closeComment: CHAR = ');
quoteNext: CHAR = '\\;
quotes: CHAR = '";
comma: CHAR = ',;
colon: CHAR = ':;
semiColon: CHAR = ';;
openRoute: CHAR = '<;
closeRoute: CHAR = '>;
openSquareBracket: CHAR = '[;
closeSquareBracket: CHAR = '];
dot: CHAR = '.;
atSign: CHAR = '@;
TokenType: TYPE =
{atom, dot, atSign, comma, colon, semiColon, openRoute, closeRoute, domainLiteral,
endOfLine, endOfList, endOfInput};
ParseInfo: PUBLIC TYPE = RECORD
[ strm, tokenStrm: IO.STREAM,
giveACR: BOOLFALSE,
char: CHARNUL];
ParseHandle: TYPE = REF ParseInfo;
ParseError: PUBLIC ERROR [code: ParseErrorCode] = CODE;
SyntaxError: ERROR = CODE;
InitializeParse: PUBLIC PROC RETURNS [ph: ParseHandle] =
Initializes the header parser, and returns a ParseHandle which is to be passed to all other
procedures of this interface. Subsequent invocations of GetFieldName, GetFieldBody,
and ParseNameList will obtain their input using "next". If "notifyTruncation" is TRUE,
GetFieldName and GetFieldBody will raise ParseError[truncated] if the string they are
collecting overflows the string provided. (The signal is not raised until the entire field
name or body has been scanned.) If "notifyTruncation" is FALSE, this signal is
suppressed.
{ ph ← NEW[ParseInfo← [strm: ROS[], tokenStrm: ROS[]]] }; -- of InitializeParse
FinalizeParse: PUBLIC PROC [ph: ParseHandle] = {
Finalizes the parser instance specified by "pH". This procedure must be called when the
client has finished parsing, either because of normal completion or because some error
has occurred. After calling this procedure, "pH" is no longer meaningful and must not
be reused. Note: FinalizeParse may not be called while a call to ParseNameList is
pending (for the same ParseHandle).
ph.strm.Close[];
ph.tokenStrm.Close[];
}; -- of FinalizeParse --
GetFieldName: PUBLIC PROC [ph: ParseHandle, next: PROC RETURNS [CHAR]]
RETURNS [fieldName: ROPE, found: BOOL] = {
GetFieldName presumes that "next" (see InitializeParse) is positioned to read the first
character of a field name and returns the field name, without the terminating colon,
as "fieldName". GetFieldName leaves "next" ready to return the first character
following the colon (or, if the end of the message header has been reached, the
character (if any) after the two CRs that normally terminate the header).
Upon return, "found" is FALSE if no field names remain in the header.
If the header field ends prematurely or illegal header characters are encountered,
ParseError[badFieldName] is raised.
char: CHAR;
blanks: BOOLFALSE;
ph.strm.Reset[];
DO
SELECT char ← Get[ph, next] FROM
CR, endOfInput =>
{ fieldName← RopeFromROS[ph.strm, FALSE];
IF fieldName.Length[] = 0 THEN RETURN[fieldName, FALSE]
ELSE ParseError[badFieldName];
};
': => RETURN[RopeFromROS[ph.strm, FALSE], TRUE];
SP, TAB => blanks ← TRUE;
endOfList, < 40C => ERROR ParseError[badFieldName];
ENDCASE =>
SELECT TRUE FROM
blanks => ERROR ParseError[badFieldName];
ENDCASE => ph.strm.PutChar[char];
ENDLOOP;
}; -- of GetFieldName --
GetFieldBody: PUBLIC PROC
[ph: ParseHandle, next: PROC RETURNS [CHAR],
suppressBody: BOOLFALSE, suppressWhiteSpace: BOOLFALSE]
RETURNS [fieldBody: ROPE] = {
The (remainder of the) current field body is read using "next" (see InitializeParse)
and is returned as "fieldBody". If the field body terminates before a CR is seen,
ParseError[badFieldBody] is raised. Upon return, "fieldBody" has no initial or
terminal white space (blanks and tabs) and, if "suppressWhiteSpace" is TRUE,
each internal run of white space has been replaced by a single blank.
ArpaNet folding conventions are also observed.
IF suppressBody is TRUE, then no fieldBody is generated
char: CHAR;
spaceSeen: BOOLTRUE; -- TRUE means ignore leading spaces
RemoveTrailingSpace: PROC = INLINE
BEGIN
len: INT← fieldBody.Length[];
WHILE len > 0 AND fieldBody.Fetch[len-1] = SP DO
len← len - 1;
ENDLOOP;
fieldBody← fieldBody.Substr[0, len];
END; -- of RemoveTrailingSpace --
IF ~suppressBody THEN ph.strm.Reset[];
DO
SELECT char ← Get[ph, next] FROM
SP, TAB =>
BEGIN
IF spaceSeen THEN LOOP;
IF suppressWhiteSpace THEN {char ← SP; spaceSeen ← TRUE};
END;
endOfInput => GO TO Trouble;
endOfList, CR => EXIT;
ENDCASE => spaceSeen ← FALSE;
IF ~suppressBody THEN ph.strm.PutChar[char];
ENDLOOP;
IF ~suppressBody THEN
{ fieldBody← RopeFromROS[ph.strm, FALSE]; RemoveTrailingSpace[]};
EXITS
Trouble => ERROR ParseError[badFieldBody];
}; -- of GetFieldBody --
ParseNameList: PUBLIC PROC
[ph: ParseHandle,
next: PROC RETURNS[CHAR],
process: PROC [ROPE, ROPE, BOOL, BOOL] RETURNS [ROPE, BOOL],
process PROC [simpleName, registry, isFile, isNested] RETURNS [reg, write] --
write: PROC [CHAR] ← NIL ] = {
ParseNameList expects to read characters using "next" (see InitializeParse) for a structured
field body consisting of a list of recipient names. For each such name encountered, it
will call "process", passing it two string arguments that designate the simple name and
registry. The simple name is always non-empty. If the registry is absent, a string of
length zero (not NIL) is passed. If the simple name contains an Arpanet host name,
the registry passed is "Arpa". The rope parameters are free from leading, trailing
and excess internal white space The "process" routine has a third parameter that indicates,
if TRUE, that the simple name is a file name, if FALSE, that the simple name and
registry combine to form a normal name. The fourth parameter supplied to "process"
indicates, if TRUE, that the name was "nested", i.e., it occurred within brackets or
within a group. This is useful to the Answer client who may wish to suppress
duplicate elimination in such cases.
If any syntax errors are detected during parsing, ParseError[badFieldBody] is raised. It is
legitimate for the "process" routine to raise a signal that causes ParseNameList to be
unwound.
error: ParseErrorCode ← none;
name, registry, token: ROPE;
reg: ROPE;
doWrite: BOOL;
outputStrm: IO.STREAMIF write = NIL THEN NIL ELSE ph.strm;
tokenStrm: IO.STREAM← ph.tokenStrm;
dotIndex, lastAtomIndex: CARDINAL ← 0;
Local procedure ParseList does all the work. It is a local procedure so we can catch
its ERRORs easily.
ParseList: PROC = {
lookingFor: {name, delim, registry, groupContents, routeAddress};
Semantics of states:
name: expect the first atom of a name or the first atom of a domain.
these can be distinguished by the seenAtSign Boolean.
delim: have just seen an atom, expect a delimiter: dot, atSign, closeRoute, semiColon,
endOfLine, or endOfList.
registry: have just seen a dot, now expect an atom that is a registry candidate.
groupContents: have just seen a group opening colon. This state is ephemeral-we are
looking for an immediate semiColon that will indicate that the name accumulated so far
is a filename. If the next token is not a semiColon, treat as lookingFor=name (but clear
the accumulated name first).
routeAddress: have just seen an openRoute. Handle the awful route syntax:
@domain,@domain, ... @domain:
If not immediately followed by an atSign, treat as lookingFor=name.
Typical sequence of states is
name, delim, (if see dot:)[ registry, delim, (if see atSign:)[name, delim,[registry, delim]*]]
inRoute, inGroup, needAtSign, seenAtSign: BOOL;
oldRegistryLength: CARDINAL;
haveAlreadyWritten: BOOLFALSE;
AppendNameAndRegistry: PROC =
{ IF registry.Length[] > 0 THEN {name← name.Cat[".", registry]; registry← NIL} };
CheckForArpa: PROC = {
arpaRegistry: ROPE = "ARPA";
IF registry.Length[] = 0 THEN {
dotIndex ← 0;
IF seenAtSign THEN registry ← arpaRegistry;
};
oldRegistryLength ← registry.Length[];
}; -- of CheckForArpa --
ProcessPhrase: PROC = {
IF inRoute THEN SyntaxError;
name← registry← NIL;
IF outputStrm # NIL THEN {outputStrm.PutChar[SP]; outputStrm.PutRope[token]};
DO -- Flush till colon or angle bracket.
SELECT GetToken[] FROM
colon =>
BEGIN
inGroup ← TRUE;
lookingFor ← groupContents;
IF outputStrm # NIL THEN outputStrm.PutChar[':];
EXIT;
END;
openRoute =>
BEGIN
lookingFor ← routeAddress;
inRoute ← TRUE;
IF outputStrm # NIL THEN {outputStrm.PutChar[SP]; outputStrm.PutChar['<]};
EXIT;
END;
atom => IF outputStrm # NIL THEN
{outputStrm.PutChar[SP]; outputStrm.PutRope[token]};
dot => IF outputStrm # NIL THEN outputStrm.PutChar['.];
ENDCASE => SyntaxError;
ENDLOOP;
}; -- of ProcessPhrase --
ProcessRoute: PROC = {
name← name.Concat["@"];
needAtSign ← TRUE;
IF outputStrm # NIL THEN outputStrm.PutChar['@];
DO
SELECT GetToken[] FROM
atom, domainLiteral =>
BEGIN
name← name.Concat[token];
IF outputStrm # NIL THEN outputStrm.PutRope[token];
END;
ENDCASE => SyntaxError;
SELECT GetToken[] FROM
dot =>
BEGIN
name← name.Concat["."];
IF outputStrm # NIL THEN outputStrm.PutChar['.];
END;
comma =>
IF GetToken[] = atSign THEN
BEGIN
name← name.Concat[",@"];
IF outputStrm # NIL THEN outputStrm.PutRope[",@"];
END
ELSE SyntaxError;
colon =>
BEGIN
name← name.Concat[":"];
IF outputStrm # NIL THEN outputStrm.PutChar[':];
lookingFor ← name;
EXIT;
END;
ENDCASE => SyntaxError;
ENDLOOP;
}; -- of ProcessRoute --
GetAtom: PROC = {
tooLong: BOOLFALSE;
DO
char: CHAR;
SELECT char ← Get[ph, next] FROM
SP, TAB => EXIT;
dot, atSign, comma, openRoute, closeRoute, endOfList, colon, semiColon,
endOfInput, openSquareBracket, openComment, quotes, closeSquareBracket,
closeComment
=> {ph.char ← char; EXIT};
CR => {ph.giveACR ← TRUE; EXIT};
< 40C, quoteNext, DEL => ERROR SyntaxError;
ENDCASE => tokenStrm.PutChar[char];
ENDLOOP;
}; -- of GetAtom --
GetQuotedString: PROC = {
tooLong: BOOLFALSE;
DO
char: CHAR;
tokenStrm.PutChar[char ← Get[ph, next]];
SELECT char FROM
quoteNext =>
SELECT char ← Get[ph, next] FROM
CR, endOfList, endOfInput => ERROR SyntaxError;
ENDCASE => tokenStrm.PutChar[char];
quotes => EXIT;
CR, endOfList, endOfInput => ERROR SyntaxError;
ENDCASE;
ENDLOOP;
}; -- of GetQuotedString --
GetDomainLiteral: PROC = {
DO
char: CHAR;
tokenStrm.PutChar[char ← Get[ph, next]];
SELECT char FROM
quoteNext =>
SELECT char ← Get[ph, next] FROM
CR, endOfList, endOfInput => ERROR SyntaxError;
ENDCASE => tokenStrm.PutChar[char];
closeSquareBracket => RETURN;
CR, endOfList, endOfInput => ERROR SyntaxError;
ENDCASE;
ENDLOOP;
EXITS
}; -- of GetDomainLiteral --
GetToken: PROC RETURNS [tokenType: TokenType] = {
char: CHAR;
tokenStrm.Reset[];
DO
char ← Get[ph, next];
SELECT char FROM
SP, TAB => LOOP;
openComment => {FlushComment[]; LOOP};
ENDCASE => EXIT;
ENDLOOP;
SELECT char FROM
dot => RETURN[dot];
atSign => RETURN[atSign];
comma => RETURN[comma];
openRoute => RETURN[openRoute];
closeRoute => RETURN[closeRoute];
CR => RETURN[endOfLine];
endOfList => RETURN[endOfList];
endOfInput => RETURN[endOfInput];
colon => RETURN[colon];
semiColon => RETURN[semiColon];
closeSquareBracket, closeComment => ERROR SyntaxError;
openSquareBracket =>
{tokenStrm.PutChar[char]; GetDomainLiteral[]; tokenType← domainLiteral};
quotes => {tokenStrm.PutChar[char]; GetQuotedString[]; tokenType← atom};
ENDCASE => {tokenStrm.PutChar[char]; GetAtom[]; tokenType← atom};
token← RopeFromROS[tokenStrm, FALSE];
}; -- of GetToken --
FlushComment: PROC = {
char: CHAR;
len: INT;
IF outputStrm # NIL THEN
BEGIN
r: ROPE← RopeFromROS[outputStrm, FALSE];
IF (len← r.Length[]) > 0 THEN IF r.Fetch[len-1] # SP THEN outputStrm.PutChar[SP];
outputStrm.PutChar['(];
END;
DO
SELECT (char ← Get[ph, next]) FROM
quoteNext =>
SELECT (char ← Get[ph, next]) FROM
CR, endOfList, endOfInput => ERROR SyntaxError;
ENDCASE => {outputStrm.PutChar['\\]; outputStrm.PutChar[char]};
closeComment =>
{IF outputStrm # NIL THEN outputStrm.PutRope[") "]; RETURN};
openComment => FlushComment[];
CR, endOfList, endOfInput => ERROR SyntaxError;
ENDCASE => IF outputStrm # NIL THEN outputStrm.PutChar[char];
ENDLOOP;
}; -- of FlushComment --
WriteOutput: PROC = {
(dotIndex # 0) = registry exists in the output string.
locate the registry in the output string
output: ROPE← RopeFromROS[outputStrm, FALSE];
len, regLen: INT;
registryIndex: CARDINAL
IF dotIndex # 0 THEN lastAtomIndex - oldRegistryLength ELSE 0;
IF haveAlreadyWritten THEN {write[',]; write[SP]};
haveAlreadyWritten ← TRUE;
write up to the dot
FOR i: INT IN [0 .. IF dotIndex = 0 THEN lastAtomIndex ELSE dotIndex) DO
write[output.Fetch[i]];
ENDLOOP;
write the dot
IF (regLen← reg.Length[]) > 0 THEN write['.];
write the registry
FOR i: INT IN [0 .. regLen) DO
write[reg.Fetch[i]];
ENDLOOP;
write the comments between the dot and the registry
IF dotIndex # 0 THEN
FOR i: CARDINAL IN (dotIndex .. registryIndex) DO
write[output.Fetch[i]];
ENDLOOP;
write the rest
len← output.Length[];
WHILE len > 0 AND output.Fetch[len-1] = SP DO
len← len - 1;
ENDLOOP;
FOR i: INT IN [lastAtomIndex .. len) DO
write[output.Fetch[i]];
ENDLOOP;
}; -- of WriteOutput --
inGroup ← FALSE;
DO -- for each list element.
name← registry← NIL;
dotIndex ← lastAtomIndex ← 0;
lookingFor ← name;
inRoute ← needAtSign ← seenAtSign ← FALSE;
DO -- for tokens within a list element.
BEGIN -- for EXITS --
tokenType: TokenType;
SELECT (tokenType ← GetToken[]) FROM
atom =>
SELECT lookingFor FROM
name, registry =>
BEGIN
IF lookingFor = name THEN
name← name.Concat[token] ELSE registry← registry.Concat[token];
lookingFor ← delim;
IF outputStrm # NIL THEN
{ outputStrm.PutRope[token];
lastAtomIndex ← RopeFromROS[outputStrm, FALSE].Length[]
};
END;
groupContents, routeAddress =>
BEGIN
name← registry← NIL;
needAtSign ← seenAtSign ← FALSE;
name← name.Concat[token];
lookingFor ← delim;
IF outputStrm # NIL THEN
BEGIN
IF lookingFor = groupContents THEN outputStrm.PutChar[SP];
outputStrm.PutRope[token];
lastAtomIndex← RopeFromROS[outputStrm, FALSE].Length[];
END;
END;
abnormal cases follow.
delim => GO TO SecondChance;
ENDCASE;
dot =>
IF lookingFor = delim THEN
BEGIN
AppendNameAndRegistry[];
lookingFor ← registry;
IF outputStrm # NIL THEN
{ dotIndex← RopeFromROS[outputStrm, FALSE].Length[];
outputStrm.PutChar['.]
};
END
ELSE SyntaxError;
atSign =>
SELECT lookingFor FROM
delim =>
BEGIN
IF seenAtSign THEN SyntaxError;
seenAtSign ← TRUE;
AppendNameAndRegistry[];
name← name.Concat["@"];
lookingFor ← name;
dotIndex ← lastAtomIndex ← 0;
IF outputStrm # NIL THEN outputStrm.PutChar['@];
END;
routeAddress =>
ProcessRoute[];
name, registry, groupContents => SyntaxError;
ENDCASE;
comma, endOfLine, endOfList, endOfInput =>
BEGIN
IF inRoute OR (tokenType # comma AND inGroup) THEN SyntaxError;
SELECT lookingFor FROM
delim =>
BEGIN
CheckForArpa[];
[reg, doWrite]← process[name, registry, FALSE, inGroup];
IF doWrite AND outputStrm # NIL THEN WriteOutput[];
IF outputStrm # NIL THEN outputStrm.Reset[];
IF tokenType = comma THEN EXIT ELSE RETURN;
END;
name, groupContents =>
BEGIN
IF inRoute OR seenAtSign THEN SyntaxError;
IF registry.Length[] > 0 THEN ERROR;
IF tokenType = comma THEN {lookingFor ← name; EXIT}
ELSE IF inGroup THEN SyntaxError ELSE RETURN;
END;
registry, routeAddress => SyntaxError;
ENDCASE;
END;
colon =>
SELECT lookingFor FROM
delim, registry =>
BEGIN
IF inRoute OR inGroup OR seenAtSign THEN SyntaxError;
AppendNameAndRegistry[];
inGroup ← TRUE;
lookingFor ← groupContents;
IF outputStrm # NIL THEN outputStrm.PutChar[':];
dotIndex ← lastAtomIndex ← 0;
END;
name, groupContents, routeAddress => SyntaxError;
ENDCASE;
semiColon =>
BEGIN
emptyGroup: BOOLFALSE;
IF ~inGroup OR inRoute THEN SyntaxError;
inGroup ← FALSE;
SELECT lookingFor FROM
groupContents => {IF name.Length[] = 0 THEN SyntaxError; emptyGroup ← TRUE};
delim => {CheckForArpa[]; emptyGroup ← FALSE};
name => IF name.Length[] > 0 THEN SyntaxError;
registry, routeAddress => SyntaxError;
ENDCASE;
IF outputStrm # NIL THEN outputStrm.PutChar[';];
SELECT (tokenType ← GetToken[]) FROM
comma, endOfLine, endOfList, endOfInput =>
BEGIN
IF name.Length[] > 0 THEN
BEGIN
be careful, might have no name if extra trailing comma preceded the semi.
ASSERT: CheckForArpa was called if emptyGroup is FALSE.
[reg, doWrite]← process[name, registry, emptyGroup, TRUE];
IF doWrite AND outputStrm # NIL THEN WriteOutput[];
END
ELSE IF outputStrm # NIL THEN
BEGIN
registry← NIL;
lastAtomIndex ← dotIndex ← 0;
WriteOutput[];
END;
IF outputStrm # NIL THEN outputStrm.Reset[];
IF tokenType = comma THEN EXIT ELSE RETURN;
END;
ENDCASE => SyntaxError;
END;
openRoute =>
BEGIN
IF inRoute THEN SyntaxError;
inRoute ← TRUE;
name← registry← NIL;
dotIndex ← lastAtomIndex ← 0;
lookingFor ← routeAddress;
IF outputStrm # NIL THEN
BEGIN
len: INT;
r: ROPE;
IF (len← (r← RopeFromROS[outputStrm, FALSE]).Length[]) > 0 THEN
IF r.Fetch[len-1] # SP THEN outputStrm.PutChar[SP];
outputStrm.PutChar['<];
END;
END;
closeRoute =>
BEGIN
IF ~inRoute OR lookingFor # delim THEN SyntaxError;
inRoute ← FALSE;
IF needAtSign AND ~seenAtSign THEN SyntaxError;
CheckForArpa[];
IF outputStrm # NIL THEN outputStrm.PutChar['>];
SELECT (tokenType ← GetToken[]) FROM
comma, endOfLine, endOfList, endOfInput, semiColon =>
BEGIN
IF inGroup AND ((tokenType # comma) AND (tokenType # semiColon))
THEN SyntaxError;
this next line is a kludge of the worse kind!!!
IF outputStrm # NIL AND tokenType=semiColon THEN outputStrm.PutChar[';];
[reg, doWrite]← process[name, registry, FALSE, TRUE];
IF doWrite AND outputStrm # NIL THEN WriteOutput[];
IF outputStrm # NIL THEN outputStrm.Reset[];
IF tokenType = semiColon THEN {inGroup← FALSE; EXIT};
IF tokenType = comma THEN EXIT ELSE RETURN;
END;
ENDCASE => SyntaxError;
END;
domainLiteral =>
SELECT lookingFor FROM
name, registry =>
BEGIN
IF ~seenAtSign THEN SyntaxError;
IF lookingFor = name THEN
name← name.Concat[token] ELSE registry← registry.Concat[token];
lookingFor← delim;
IF outputStrm # NIL THEN
{ outputStrm.PutRope[token];
dotIndex← 0;
lastAtomIndex← RopeFromROS[outputStrm, FALSE].Length[]
};
END;
delim, groupContents, routeAddress => SyntaxError;
ENDCASE;
ENDCASE => ERROR;
EXITS
SecondChance => ProcessPhrase[];
END; -- of EXITS block --
ENDLOOP;
ENDLOOP; -- for each list element.
}; -- of ParseList --
Body of ParseNameList
IF outputStrm # NIL THEN outputStrm.Reset[];
ParseList[ ! SyntaxError => {error ← badFieldBody; CONTINUE}];
IF error # none THEN ERROR ParseError[error];
}; -- of ParseNameList --
Get: PROC [ph: ParseHandle, next: PROC RETURNS[CHAR]] RETURNS [char: CHAR] = {
Obtains next input character and smoothes over a few lexical quirks. This procedure
deals with Arpa-standard line-folding.
IF ph.giveACR THEN {ph.giveACR ← FALSE; RETURN[CR]};
IF ph.char = NUL THEN
BEGIN
IF (char ← next[]) = CR THEN
BEGIN
ph.char ← next[];
SELECT ph.char FROM
SP, TAB => {ph.char ← NUL; RETURN[SP]};
ENDCASE;
END;
END
ELSE {char ← ph.char; ph.char ← NUL};
}; -- of Get --
END.