-- File: GVMailParseImpl.mesa -- Based on ArpaMailParser.mesa by Brotz, March 6, 1983 3:29 PM -- Last Edited by: Willie-sue, May 16, 1983 9:31 am -- Last Edited by: Woosh, April 21, 1983 4:57 pm DIRECTORY GVMailParse USING [endOfInput, endOfList, ParseErrorCode], IO, Rope; GVMailParseImpl: CEDAR PROGRAM IMPORTS IO, Rope EXPORTS GVMailParse = BEGIN OPEN GVMailParse, Rope, IO; -- Types -- -- Some special characters specified by RFC 822 -- openComment: CHAR = '(; closeComment: CHAR = '); quoteNext: CHAR = '\\; quotes: CHAR = '"; comma: CHAR = ',; colon: CHAR = ':; semiColon: CHAR = ';; openRoute: CHAR = '<; closeRoute: CHAR = '>; openSquareBracket: CHAR = '[; closeSquareBracket: CHAR = ']; dot: CHAR = '.; atSign: CHAR = '@; TokenType: TYPE = {atom, dot, atSign, comma, colon, semiColon, openRoute, closeRoute, domainLiteral, endOfLine, endOfList, endOfInput}; ParseInfo: PUBLIC TYPE = RECORD [ strm, tokenStrm: IO.STREAM, giveACR: BOOL _ FALSE, char: CHAR _ NUL]; ParseHandle: TYPE = REF ParseInfo; ParseError: PUBLIC ERROR [code: ParseErrorCode] = CODE; SyntaxError: ERROR = CODE; InitializeParse: PUBLIC PROC RETURNS [ph: ParseHandle] = -- Initializes the header parser, and returns a ParseHandle which is to be passed to all other -- procedures of this interface. Subsequent invocations of GetFieldName, GetFieldBody, -- and ParseNameList will obtain their input using "next". If "notifyTruncation" is TRUE, -- GetFieldName and GetFieldBody will raise ParseError[truncated] if the string they are -- collecting overflows the string provided. (The signal is not raised until the entire field -- name or body has been scanned.) If "notifyTruncation" is FALSE, this signal is -- suppressed. BEGIN ph _ NEW[ParseInfo_ [strm: CreateOutputStreamToRope[], tokenStrm: CreateOutputStreamToRope[]]]; END; -- of InitializeParse -- FinalizeParse: PUBLIC PROC [ph: ParseHandle] = -- Finalizes the parser instance specified by "pH". This procedure must be called when the -- client has finished parsing, either because of normal completion or because some error -- has occurred. After calling this procedure, "pH" is no longer meaningful and must not -- be reused. Note: FinalizeParse may not be called while a call to ParseNameList is -- pending (for the same ParseHandle). BEGIN ph.strm.Close[]; ph.tokenStrm.Close[]; END; -- of FinalizeParse -- GetFieldName: PUBLIC PROC [ph: ParseHandle, next: PROC RETURNS [CHAR]] RETURNS [fieldName: ROPE, found: BOOL] = -- GetFieldName presumes that "next" (see InitializeParse) is positioned to read the first -- character of a field name and returns the field name, without the terminating colon, -- as "fieldName". GetFieldName leaves "next" ready to return the first character -- following the colon (or, if the end of the message header has been reached, the -- character (if any) after the two CRs that normally terminate the header). -- Upon return, "found" is FALSE if no field names remain in the header. -- If the header field ends prematurely or illegal header characters are encountered, -- ParseError[badFieldName] is raised. BEGIN char: CHAR; blanks: BOOL _ FALSE; ph.strm.Reset[]; DO SELECT char _ Get[ph, next] FROM CR, endOfInput => { fieldName_ GetOutputStreamRope[ph.strm]; IF fieldName.Length[] = 0 THEN RETURN[fieldName, FALSE] ELSE ParseError[badFieldName]; }; ': => RETURN[GetOutputStreamRope[ph.strm], TRUE]; SP, TAB => blanks _ TRUE; endOfList, < 40C => ERROR ParseError[badFieldName]; ENDCASE => SELECT TRUE FROM blanks => ERROR ParseError[badFieldName]; ENDCASE => ph.strm.PutChar[char]; ENDLOOP; END; -- of GetFieldName -- GetFieldBody: PUBLIC PROC [ph: ParseHandle, next: PROC RETURNS [CHAR], suppressBody: BOOL_ FALSE, suppressWhiteSpace: BOOL _ FALSE] RETURNS [fieldBody: ROPE] = -- The (remainder of the) current field body is read using "next" (see InitializeParse) -- and is returned as "fieldBody". If the field body terminates before a CR is seen, -- ParseError[badFieldBody] is raised. Upon return, "fieldBody" has no initial or -- terminal white space (blanks and tabs) and, if "suppressWhiteSpace" is TRUE, -- each internal run of white space has been replaced by a single blank. -- ArpaNet folding conventions are also observed. -- IF suppressBody is TRUE, then no fieldBody is generated BEGIN char: CHAR; spaceSeen: BOOL _ TRUE; -- TRUE means ignore leading spaces RemoveTrailingSpace: PROC = INLINE BEGIN len: INT_ fieldBody.Length[]; WHILE len > 0 AND fieldBody.Fetch[len-1] = SP DO len_ len - 1; ENDLOOP; fieldBody_ fieldBody.Substr[0, len]; END; -- of RemoveTrailingSpace -- IF ~suppressBody THEN ph.strm.Reset[]; DO SELECT char _ Get[ph, next] FROM SP, TAB => BEGIN IF spaceSeen THEN LOOP; IF suppressWhiteSpace THEN {char _ SP; spaceSeen _ TRUE}; END; endOfInput => GO TO Trouble; endOfList, CR => EXIT; ENDCASE => spaceSeen _ FALSE; IF ~suppressBody THEN ph.strm.PutChar[char]; ENDLOOP; IF ~suppressBody THEN { fieldBody_ GetOutputStreamRope[ph.strm]; RemoveTrailingSpace[]}; EXITS Trouble => ERROR ParseError[badFieldBody]; END; -- of GetFieldBody -- ParseNameList: PUBLIC PROC [ph: ParseHandle, next: PROC RETURNS[CHAR], process: PROC [ROPE, ROPE, BOOL, BOOL] RETURNS [ROPE, BOOL], -- process PROC [simpleName, registry, isFile, isNested] RETURNS [reg, write] -- write: PROC [CHAR] _ NIL ] = -- ParseNameList expects to read characters using "next" (see InitializeParse) for a structured -- field body consisting of a list of recipient names. For each such name encountered, it -- will call "process", passing it two string arguments that designate the simple name and -- registry. The simple name is always non-empty. If the registry is absent, a string of -- length zero (not NIL) is passed. If the simple name contains an Arpanet host name, -- the registry passed is "Arpa". The rope parameters are free from leading, trailing -- and excess internal white space The "process" routine has a third parameter that indicates, -- if TRUE, that the simple name is a file name, if FALSE, that the simple name and -- registry combine to form a normal name. The fourth parameter supplied to "process" -- indicates, if TRUE, that the name was "nested", i.e., it occurred within brackets or -- within a group. This is useful to the Answer client who may wish to suppress -- duplicate elimination in such cases. -- If any syntax errors are detected during parsing, ParseError[badFieldBody] is raised. It is -- legitimate for the "process" routine to raise a signal that causes ParseNameList to be -- unwound. BEGIN error: ParseErrorCode _ none; name, registry, token: ROPE; reg: ROPE; doWrite: BOOL; outputStrm: IO.STREAM_ IF write = NIL THEN NIL ELSE ph.strm; tokenStrm: IO.STREAM_ ph.tokenStrm; dotIndex, lastAtomIndex: CARDINAL _ 0; -- Local procedure ParseList does all the work. It is a local procedure so we can catch -- its ERRORs easily. ParseList: PROC = BEGIN lookingFor: {name, delim, registry, groupContents, routeAddress}; -- Semantics of states: -- name: expect the first atom of a name or the first atom of a domain. -- these can be distinguished by the seenAtSign Boolean. -- delim: have just seen an atom, expect a delimiter: dot, atSign, closeRoute, semiColon, -- endOfLine, or endOfList. -- registry: have just seen a dot, now expect an atom that is a registry candidate. -- groupContents: have just seen a group opening colon. This state is ephemeral-we are -- looking for an immediate semiColon that will indicate that the name accumulated so far -- is a filename. If the next token is not a semiColon, treat as lookingFor=name (but clear -- the accumulated name first). -- routeAddress: have just seen an openRoute. Handle the awful route syntax: -- @domain,@domain, ... @domain: -- If not immediately followed by an atSign, treat as lookingFor=name. -- Typical sequence of states is -- name, delim, (if see dot:)[ registry, delim, (if see atSign:)[name, delim,[registry, delim]*]] inRoute, inGroup, needAtSign, seenAtSign: BOOL; oldRegistryLength: CARDINAL; haveAlreadyWritten: BOOL _ FALSE; AppendNameAndRegistry: PROC = BEGIN IF registry.Length[] > 0 THEN {name_ name.Cat[".", registry]; registry_ NIL}; END; -- of AppendNameAndRegistry -- CheckForArpa: PROC = BEGIN numArpaAliases: CARDINAL = 3; arpaAliases: ARRAY [0..numArpaAliases) OF ROPE = [ -- The first one is the preferred registry for ARPA recipients, but any of -- the others is acceptable and is left unchanged if present. Note that -- the name "ARPA" is overloaded: it is used as both a Xerox registry name -- and a top-level ARPA domain name. "AG", "ArpaGateway", "ARPA"]; IF seenAtSign THEN FOR i: CARDINAL IN [0..numArpaAliases) DO IF registry.Equal[arpaAliases[i], FALSE] THEN EXIT; REPEAT FINISHED => BEGIN AppendNameAndRegistry[]; registry_ arpaAliases[0]; dotIndex _ 0; END; ENDLOOP ELSE IF registry.Length[] = 0 THEN dotIndex _ 0; oldRegistryLength _ registry.Length[]; END; -- of CheckForArpa -- ProcessPhrase: PROC = BEGIN IF inRoute THEN SyntaxError; name_ registry_ NIL; IF outputStrm # NIL THEN {outputStrm.PutChar[SP]; outputStrm.PutRope[token]}; DO -- Flush till colon or angle bracket. SELECT GetToken[] FROM colon => BEGIN inGroup _ TRUE; lookingFor _ groupContents; IF outputStrm # NIL THEN outputStrm.PutChar[':]; EXIT; END; openRoute => BEGIN lookingFor _ routeAddress; inRoute _ TRUE; IF outputStrm # NIL THEN {outputStrm.PutChar[SP]; outputStrm.PutChar['<]}; EXIT; END; atom => IF outputStrm # NIL THEN {outputStrm.PutChar[SP]; outputStrm.PutRope[token]}; dot => IF outputStrm # NIL THEN outputStrm.PutChar['.]; ENDCASE => SyntaxError; ENDLOOP; END; -- of ProcessPhrase -- ProcessRoute: PROC = BEGIN name_ name.Concat["@"]; needAtSign _ TRUE; IF outputStrm # NIL THEN outputStrm.PutChar['@]; DO SELECT GetToken[] FROM atom, domainLiteral => BEGIN name_ name.Concat[token]; IF outputStrm # NIL THEN outputStrm.PutRope[token]; END; ENDCASE => SyntaxError; SELECT GetToken[] FROM dot => BEGIN name_ name.Concat["."]; IF outputStrm # NIL THEN outputStrm.PutChar['.]; END; comma => IF GetToken[] = atSign THEN BEGIN name_ name.Concat[",@"]; IF outputStrm # NIL THEN outputStrm.PutRope[",@"]; END ELSE SyntaxError; colon => BEGIN name_ name.Concat[":"]; IF outputStrm # NIL THEN outputStrm.PutChar[':]; lookingFor _ name; EXIT; END; ENDCASE => SyntaxError; ENDLOOP; END; -- of ProcessRoute -- GetAtom: PROC = BEGIN tooLong: BOOL _ FALSE; DO char: CHAR; SELECT char _ Get[ph, next] FROM SP, TAB => EXIT; dot, atSign, comma, openRoute, closeRoute, endOfList, colon, semiColon, endOfInput, openSquareBracket, openComment, quotes, closeSquareBracket, closeComment => {ph.char _ char; EXIT}; CR => {ph.giveACR _ TRUE; EXIT}; < 40C, quoteNext, DEL => ERROR SyntaxError; ENDCASE => tokenStrm.PutChar[char]; ENDLOOP; END; -- of GetAtom -- GetQuotedString: PROC = BEGIN tooLong: BOOL _ FALSE; DO char: CHAR; tokenStrm.PutChar[char _ Get[ph, next]]; SELECT char FROM quoteNext => SELECT char _ Get[ph, next] FROM CR, endOfList, endOfInput => ERROR SyntaxError; ENDCASE => tokenStrm.PutChar[char]; quotes => EXIT; CR, endOfList, endOfInput => ERROR SyntaxError; ENDCASE; ENDLOOP; END; -- of GetQuotedString -- GetDomainLiteral: PROC = BEGIN DO char: CHAR; tokenStrm.PutChar[char _ Get[ph, next]]; SELECT char FROM quoteNext => SELECT char _ Get[ph, next] FROM CR, endOfList, endOfInput => ERROR SyntaxError; ENDCASE => tokenStrm.PutChar[char]; closeSquareBracket => RETURN; CR, endOfList, endOfInput => ERROR SyntaxError; ENDCASE; ENDLOOP; EXITS END; -- of GetDomainLiteral -- GetToken: PROC RETURNS [tokenType: TokenType] = BEGIN char: CHAR; tokenStrm.Reset[]; DO char _ Get[ph, next]; SELECT char FROM SP, TAB => LOOP; openComment => {FlushComment[]; LOOP}; ENDCASE => EXIT; ENDLOOP; SELECT char FROM dot => RETURN[dot]; atSign => RETURN[atSign]; comma => RETURN[comma]; openRoute => RETURN[openRoute]; closeRoute => RETURN[closeRoute]; CR => RETURN[endOfLine]; endOfList => RETURN[endOfList]; endOfInput => RETURN[endOfInput]; colon => RETURN[colon]; semiColon => RETURN[semiColon]; closeSquareBracket, closeComment => ERROR SyntaxError; openSquareBracket => {tokenStrm.PutChar[char]; GetDomainLiteral[]; tokenType_ domainLiteral}; quotes => {tokenStrm.PutChar[char]; GetQuotedString[]; tokenType_ atom}; ENDCASE => {tokenStrm.PutChar[char]; GetAtom[]; tokenType_ atom}; token_ GetOutputStreamRope[tokenStrm]; END; -- of GetToken -- FlushComment: PROC = BEGIN char: CHAR; len: INT; IF outputStrm # NIL THEN BEGIN r: ROPE_ GetOutputStreamRope[outputStrm]; IF (len_ r.Length[]) > 0 THEN IF r.Fetch[len-1] # SP THEN outputStrm.PutChar[SP]; outputStrm.PutChar['(]; END; DO SELECT (char _ Get[ph, next]) FROM quoteNext => SELECT (char _ Get[ph, next]) FROM CR, endOfList, endOfInput => ERROR SyntaxError; ENDCASE => {outputStrm.PutChar['\\]; outputStrm.PutChar[char]}; closeComment => {IF outputStrm # NIL THEN outputStrm.PutRope[") "]; RETURN}; openComment => FlushComment[]; CR, endOfList, endOfInput => ERROR SyntaxError; ENDCASE => IF outputStrm # NIL THEN outputStrm.PutChar[char]; ENDLOOP; END; -- of FlushComment -- WriteOutput: PROC = BEGIN -- (dotIndex # 0) = registry exists in the output string. -- locate the registry in the output string output: ROPE_ GetOutputStreamRope[outputStrm]; len, regLen: INT; registryIndex: CARDINAL _ IF dotIndex # 0 THEN lastAtomIndex - oldRegistryLength ELSE 0; IF haveAlreadyWritten THEN {write[',]; write[SP]}; haveAlreadyWritten _ TRUE; -- write up to the dot FOR i: CARDINAL IN [0 .. IF dotIndex = 0 THEN lastAtomIndex ELSE dotIndex) DO write[output.Fetch[i]]; ENDLOOP; -- write the dot IF (regLen_ reg.Length[]) > 0 THEN write['.]; -- write the registry FOR i: CARDINAL IN [0 .. regLen) DO write[reg.Fetch[i]]; ENDLOOP; -- write the comments between the dot and the registry IF dotIndex # 0 THEN FOR i: CARDINAL IN (dotIndex .. registryIndex) DO write[output.Fetch[i]]; ENDLOOP; -- write the rest len_ output.Length[]; WHILE len > 0 AND output.Fetch[len-1] = SP DO len_ len - 1; ENDLOOP; FOR i: CARDINAL IN [lastAtomIndex .. len) DO write[output.Fetch[i]]; ENDLOOP; END; -- of WriteOutput -- inGroup _ FALSE; DO -- for each list element. name_ registry_ NIL; dotIndex _ lastAtomIndex _ 0; lookingFor _ name; inRoute _ needAtSign _ seenAtSign _ FALSE; DO -- for tokens within a list element. BEGIN -- for EXITS -- tokenType: TokenType; SELECT (tokenType _ GetToken[]) FROM atom => SELECT lookingFor FROM name, registry => BEGIN IF lookingFor = name THEN name_ name.Concat[token] ELSE registry_ registry.Concat[token]; lookingFor _ delim; IF outputStrm # NIL THEN { outputStrm.PutRope[token]; lastAtomIndex _ GetOutputStreamRope[outputStrm].Length[] }; END; groupContents, routeAddress => BEGIN name_ registry_ NIL; needAtSign _ seenAtSign _ FALSE; name_ name.Concat[token]; lookingFor _ delim; IF outputStrm # NIL THEN BEGIN IF lookingFor = groupContents THEN outputStrm.PutChar[SP]; outputStrm.PutRope[token]; lastAtomIndex_ GetOutputStreamRope[outputStrm].Length[]; END; END; -- abnormal cases follow. delim => GO TO SecondChance; ENDCASE; dot => IF lookingFor = delim THEN BEGIN AppendNameAndRegistry[]; lookingFor _ registry; IF outputStrm # NIL THEN { dotIndex_ GetOutputStreamRope[outputStrm].Length[]; outputStrm.PutChar['.] }; END ELSE SyntaxError; atSign => SELECT lookingFor FROM delim => BEGIN IF seenAtSign THEN SyntaxError; seenAtSign _ TRUE; AppendNameAndRegistry[]; name_ name.Concat["@"]; lookingFor _ name; dotIndex _ lastAtomIndex _ 0; IF outputStrm # NIL THEN outputStrm.PutChar['@]; END; routeAddress => ProcessRoute[]; name, registry, groupContents => SyntaxError; ENDCASE; comma, endOfLine, endOfList, endOfInput => BEGIN IF inRoute OR (tokenType # comma AND inGroup) THEN SyntaxError; SELECT lookingFor FROM delim => BEGIN CheckForArpa[]; [reg, doWrite]_ process[name, registry, FALSE, inGroup]; IF doWrite AND outputStrm # NIL THEN WriteOutput[]; IF outputStrm # NIL THEN outputStrm.Reset[]; IF tokenType = comma THEN EXIT ELSE RETURN; END; name, groupContents => BEGIN IF inRoute OR seenAtSign THEN SyntaxError; IF registry.Length[] > 0 THEN ERROR; IF tokenType = comma THEN {lookingFor _ name; EXIT} ELSE IF inGroup THEN SyntaxError ELSE RETURN; END; registry, routeAddress => SyntaxError; ENDCASE; END; colon => SELECT lookingFor FROM delim, registry => BEGIN IF inRoute OR inGroup OR seenAtSign THEN SyntaxError; AppendNameAndRegistry[]; inGroup _ TRUE; lookingFor _ groupContents; IF outputStrm # NIL THEN outputStrm.PutChar[':]; dotIndex _ lastAtomIndex _ 0; END; name, groupContents, routeAddress => SyntaxError; ENDCASE; semiColon => BEGIN emptyGroup: BOOL _ FALSE; IF ~inGroup OR inRoute THEN SyntaxError; inGroup _ FALSE; SELECT lookingFor FROM groupContents => {IF name.Length[] = 0 THEN SyntaxError; emptyGroup _ TRUE}; delim => {CheckForArpa[]; emptyGroup _ FALSE}; name => IF name.Length[] > 0 THEN SyntaxError; registry, routeAddress => SyntaxError; ENDCASE; IF outputStrm # NIL THEN outputStrm.PutChar[';]; SELECT (tokenType _ GetToken[]) FROM comma, endOfLine, endOfList, endOfInput => BEGIN IF name.Length[] > 0 THEN BEGIN -- be careful, might have no name if extra trailing comma preceded the semi. -- ASSERT: CheckForArpa was called if emptyGroup is FALSE. [reg, doWrite]_ process[name, registry, emptyGroup, TRUE]; IF doWrite AND outputStrm # NIL THEN WriteOutput[]; END ELSE IF outputStrm # NIL THEN BEGIN registry_ NIL; lastAtomIndex _ dotIndex _ 0; WriteOutput[]; END; IF outputStrm # NIL THEN outputStrm.Reset[]; IF tokenType = comma THEN EXIT ELSE RETURN; END; ENDCASE => SyntaxError; END; openRoute => BEGIN IF inRoute THEN SyntaxError; inRoute _ TRUE; name_ registry_ NIL; dotIndex _ lastAtomIndex _ 0; lookingFor _ routeAddress; IF outputStrm # NIL THEN BEGIN len: INT; r: ROPE; IF (len_ (r_ GetOutputStreamRope[outputStrm]).Length[]) > 0 THEN IF r.Fetch[len-1] # SP THEN outputStrm.PutChar[SP]; outputStrm.PutChar['<]; END; END; closeRoute => BEGIN IF ~inRoute OR lookingFor # delim THEN SyntaxError; inRoute _ FALSE; IF needAtSign AND ~seenAtSign THEN SyntaxError; CheckForArpa[]; IF outputStrm # NIL THEN outputStrm.PutChar['>]; SELECT (tokenType _ GetToken[]) FROM comma, endOfLine, endOfList, endOfInput, semiColon => BEGIN IF inGroup AND ((tokenType # comma) AND (tokenType # semiColon)) THEN SyntaxError; -- this next line is a kludge of the worse kind!!! IF outputStrm # NIL AND tokenType=semiColon THEN outputStrm.PutChar[';]; [reg, doWrite]_ process[name, registry, FALSE, TRUE]; IF doWrite AND outputStrm # NIL THEN WriteOutput[]; IF outputStrm # NIL THEN outputStrm.Reset[]; IF tokenType = semiColon THEN {inGroup_ FALSE; EXIT}; IF tokenType = comma THEN EXIT ELSE RETURN; END; ENDCASE => SyntaxError; END; domainLiteral => SELECT lookingFor FROM name, registry => BEGIN IF ~seenAtSign THEN SyntaxError; IF lookingFor = name THEN name_ name.Concat[token] ELSE registry_ registry.Concat[token]; lookingFor_ delim; IF outputStrm # NIL THEN { outputStrm.PutRope[token]; dotIndex_ 0; lastAtomIndex_ GetOutputStreamRope[outputStrm].Length[] }; END; delim, groupContents, routeAddress => SyntaxError; ENDCASE; ENDCASE => ERROR; EXITS SecondChance => ProcessPhrase[]; END; -- of EXITS block -- ENDLOOP; ENDLOOP; -- for each list element. END; -- of ParseList -- -- Body of ParseNameList -- IF outputStrm # NIL THEN outputStrm.Reset[]; ParseList[ ! SyntaxError => {error _ badFieldBody; CONTINUE}]; IF error # none THEN ERROR ParseError[error]; END; -- of ParseNameList -- Get: PROC [ph: ParseHandle, next: PROC RETURNS[CHAR]] RETURNS [char: CHAR] = -- Obtains next input character and smoothes over a few lexical quirks. This procedure -- deals with Arpa-standard line-folding. BEGIN IF ph.giveACR THEN {ph.giveACR _ FALSE; RETURN[CR]}; IF ph.char = NUL THEN BEGIN IF (char _ next[]) = CR THEN BEGIN ph.char _ next[]; SELECT ph.char FROM SP, TAB => {ph.char _ NUL; RETURN[SP]}; ENDCASE; END; END ELSE {char _ ph.char; ph.char _ NUL}; END; -- of Get -- END. -- of MailParseImpl --