<> <> <> <> <> <> <> DIRECTORY Ascii USING [Lower, Upper], List USING [Append, Nconc, Reverse], LooksReader USING [Create, Get, SetPosition], RegularExpression USING [altSepToken, anyToken, beginAllToken, beginAltToken, beginClassToken, beginFieldToken, beginNodeToken, boundSepToken, CharClass, CharClassContent, ClassArray, closureToken, Compile, endAllToken, endAltToken, endClassToken, endFieldToken, endPatternToken, fieldSepToken, FinderRecord, greedyClosureToken, greedyPlusToken, IgnoreLooks, Index, LegalInputCharacters, NameArray, nodeBreakToken, notToken, OptimizeBackwardSearch, OptimizeForwardSearch, ParseTree, ParseTreeContent, ParseTypes, PatternErrorCode, PatternStackArray, plusToken, powerToken, ReturnCodeArray, StackContent, subRangeToken, TextStackArray], Rope USING [Concat, Equal, Fetch, FromChar, ROPE, Size], RopeReader USING [Create, Get, GetIndex, ReadOffEnd, SetPosition], RunReader USING [NoMoreRuns], RuntimeError USING [BoundsFault], TextEdit USING [GetRope, GetRuns], TextLooks USING [Looks, noLooks, Runs], TextNode USING [Offset, RefTextNode]; RegExpFindImpl: CEDAR PROGRAM IMPORTS Ascii, List, LooksReader, RegularExpression, Rope, RopeReader, RunReader, RuntimeError, TextEdit EXPORTS RegularExpression = { OPEN RegularExpression; RegExpPatternErrorCode: TYPE = { tooBig, -- The pattern is too big. illegalCharacter, improperAltSeparator, notInsideAlt, notInsideField, moreThanOneBeginAll, noMatchingBeginAll, nameMustBeAString, theAllNameIsReserved, secondOccurenceOfFieldMustNotContainPattern, expectedEndOfField, unexpectedEndOfPattern, noClosingEndAll, invalidNot, illegalOctalNumber, unknownAbbreviation }; MalformedPattern: PUBLIC ERROR [ec: RegularExpression.PatternErrorCode] = CODE; MaxLen: Offset = LAST[Offset]; Finder: TYPE = REF FinderRec; FinderRec: PUBLIC TYPE = FinderRecord; Offset: TYPE = TextNode.Offset; ROPE: TYPE = Rope.ROPE; RefTextNode: TYPE = TextNode.RefTextNode; <<***** Operations *****>> NameLoc: PUBLIC PROC [finder: Finder, name: ROPE] RETURNS [at, atEnd: Offset] = { nameArray: REF NameArray _ IF finder # NIL THEN finder.nameArray ELSE NIL; at _ atEnd _ 0; IF nameArray = NIL THEN RETURN; IF Rope.Equal["all", name, FALSE] THEN RETURN [nameArray[0].at, nameArray[0].atEnd]; FOR i:NAT IN [0..nameArray.length) DO IF Rope.Equal[nameArray[i].name, name] THEN RETURN [nameArray[i].at, nameArray[i].atEnd]; ENDLOOP; }; NameLooks: PUBLIC PROC [finder: Finder, name: ROPE] RETURNS [looks: TextLooks.Looks] = { nameArray: REF NameArray _ IF finder # NIL THEN finder.nameArray ELSE NIL; looks _ TextLooks.noLooks; IF nameArray = NIL THEN RETURN; IF Rope.Equal["all", name, FALSE] THEN RETURN [nameArray[0].looks]; FOR i:NAT IN [0..nameArray.length) DO IF Rope.Equal[nameArray[i].name, name] THEN RETURN [nameArray[i].looks]; ENDLOOP; }; Create: PUBLIC PROC [pattern: RefTextNode, literal, word, ignoreLooks, ignoreCase, addBounds: BOOLEAN, patternStart: INT _ 0, patternLen: INT _ LAST[INT]] RETURNS [finder: Finder] = { patternRope: ROPE _ TextEdit.GetRope[pattern]; patternRuns: TextLooks.Runs _ TextEdit.GetRuns[pattern]; RETURN [CreateFromParts[patternRope,patternRuns,literal,word, ignoreLooks,ignoreCase,addBounds,patternStart,patternLen]]; }; CreateFromRope: PUBLIC PROC [pattern: ROPE, literal, word, ignoreCase, addBounds: BOOLEAN, patternStart: Offset, patternLen: Offset] RETURNS [finder: Finder] = { RETURN [CreateFromParts[pattern,NIL,literal,word,TRUE, ignoreCase,addBounds,patternStart,patternLen]] }; CreateFromParts: PROC [patternRope: ROPE, patternRuns: TextLooks.Runs, literal, word, ignoreLooks, ignoreCase, addBounds: BOOLEAN, patternStart: Offset, patternLen: Offset] RETURNS [finder: Finder] = { ENABLE RuntimeError.BoundsFault => ERROR MalformedPattern[toobig]; SimpleSymbolTableEntry: TYPE = RECORD[name: ROPE, number: Index]; SimpleSymbolTable: TYPE = LIST OF REF SimpleSymbolTableEntry; nameList: SimpleSymbolTable; numberOfFields: Index; parsedPatternList: LIST OF ParseTree; forwardPattern, backwardPattern: ParseTree; char, patternChar: CHAR _ 377C; pEnd, pPos: Offset; numCharClasses: Index _ 0; charClassList: LIST OF ParseTree _ NIL; insideNamedPat: BOOLEAN _ FALSE; lastPhysicalCharUnread: BOOL _ FALSE; theLastPhysicalCharUnread: CHAR; inAbbreviation: BOOL _ FALSE; abbreviationPos: Offset; abbreviation: ROPE; lastCharacterRead: CHAR; looksRead: TextLooks.Looks _ TextLooks.noLooks; GetNextChar: PROC [eofOK: BOOL] RETURNS [c: CHAR] = { IF lastPhysicalCharUnread THEN { lastPhysicalCharUnread _ FALSE; RETURN[theLastPhysicalCharUnread]; }; IF inAbbreviation THEN { abbreviationPos _ abbreviationPos + 1; IF abbreviationPos < abbreviation.Size[] THEN c _ abbreviation.Fetch[abbreviationPos] ELSE inAbbreviation _ FALSE; }; IF ~inAbbreviation THEN { IF pPos >= pEnd THEN GOTO gotEnd; pPos _ pPos + 1; c _ finder.ropeReader.Get[! RopeReader.ReadOffEnd => GOTO gotEnd]; IF finder.lksReader = NIL THEN looksRead _ IgnoreLooks ELSE looksRead _ LooksReader.Get[finder.lksReader ! RunReader.NoMoreRuns => {looksRead _ TextLooks.noLooks; CONTINUE }]; }; IF ignoreCase THEN c _ Ascii.Upper[c]; EXITS gotEnd => { c _ endPatternToken; looksRead _ TextLooks.noLooks; IF ~eofOK THEN SyntaxError[unexpectedEndOfPattern]; }; }; UnReadLastPhysicalChar: PROC [c: CHAR] = { IF lastPhysicalCharUnread THEN ERROR; theLastPhysicalCharUnread _ c; lastPhysicalCharUnread _ TRUE; }; AbbreviationRec: TYPE = RECORD[char: CHAR, abbreviation: ROPE]; abbreviations: LIST OF AbbreviationRec _ LIST[ ['A, "([a..zA..Z0..9]++)"], ['B, "([ '011..'015]++)"], ['D, "([0..9]+.[0..9]**|[0..9]*.[0..9]++|[0..9]++)"], ['N, "('015)"], ['Q, "(\"[~\"]*\"|``[~'']*''''|`[~'']*'')"], ['S, "([~ '011..'015]++)"], ['W, "([a..zA..Z]++)"], ['^, "['001..'037]"] ]; SetUpAbbreviation: PROC [c: CHAR] = { IF inAbbreviation THEN ERROR; c _ Ascii.Upper[c]; FOR l: LIST OF AbbreviationRec _ abbreviations, l.rest UNTIL l = NIL DO IF c = l.first.char THEN { inAbbreviation _ TRUE; abbreviationPos _ -1; abbreviation _ l.first.abbreviation; RETURN; }; ENDLOOP; SyntaxError[unknownAbbreviation]; }; charUnRead: BOOL _ FALSE; GetToken: PROC [] RETURNS [char: CHAR] = { IF charUnRead THEN { charUnRead _ FALSE; RETURN[lastCharacterRead]; }; lastCharacterRead _ char _ GetNextChar[TRUE]; IF literal THEN IF char = endPatternToken OR char IN LegalInputCharacters THEN RETURN[char] ELSE SyntaxError[illegalCharacter]; IF char IN ['A..'Z] OR char IN ['a..'z] OR char IN ['0..'9] THEN RETURN[char]; SELECT char FROM '[ => char _ beginClassToken; '] => char _ endClassToken; '~ => char _ notToken; '# => char _ anyToken; '$ => char _ nodeBreakToken; '^ => char _ beginNodeToken; '! => char _ powerToken; '( => char _ beginAltToken; ') => char _ endAltToken; '| => char _ altSepToken; '< => char _ beginFieldToken; '> => char _ endFieldToken; ': => char _ fieldSepToken; ', => char _ boundSepToken; '{ => char _ beginAllToken; '} => char _ endAllToken; '' => { char _ GetNextChar[FALSE]; IF char IN ['0..'7] THEN { c2: CHAR _ GetNextChar[FALSE]; c3: CHAR _ GetNextChar[FALSE]; octalIndex: CARDINAL; IF ~(c2 IN ['0..'7]) OR ~(c3 IN ['0..'7]) THEN SyntaxError[illegalOctalNumber]; octalIndex _ (char-'0)*64 + (c2-'0)*8+(c3-'0); IF octalIndex > 127 THEN SyntaxError[illegalOctalNumber]; char _ VAL[octalIndex]; }; IF ~(char IN LegalInputCharacters) THEN SyntaxError[illegalCharacter]; }; '* => { c: CHAR _ GetNextChar[TRUE]; IF c = '* THEN char _ greedyClosureToken ELSE { UnReadLastPhysicalChar[c]; char _ closureToken; }; }; '+ => { c: CHAR _ GetNextChar[TRUE]; IF c = '+ THEN char _ greedyPlusToken ELSE { UnReadLastPhysicalChar[c]; char _ plusToken; }; }; '. => { c: CHAR _ GetNextChar[TRUE]; IF c = '. THEN char _ subRangeToken ELSE { UnReadLastPhysicalChar[c]; char _ '.; }; }; '\\ => { c: CHAR _ GetNextChar[FALSE]; SetUpAbbreviation[c]; RETURN[GetToken[]]; }; endPatternToken => NULL; ENDCASE => IF ~(char IN LegalInputCharacters) THEN SyntaxError[illegalCharacter]; lastCharacterRead _ char; RETURN[char]; }; UnReadToken: PROC[] = { IF charUnRead THEN ERROR; charUnRead _ TRUE; }; SyntaxError: PROC[kind: RegExpPatternErrorCode] = { ERROR MalformedPattern[toobig]; <> }; <> ParseCharClass: PROC [] RETURNS [r: ParseTree] = { ccr: REF ParseTreeContent.class _ NEW[ParseTreeContent.class]; complement: BOOL _ FALSE; c, lastChar: CHAR _ beginClassToken; charClass: CharClass _ NEW[CharClassContent _ ALL[FALSE]]; charClassList _ CONS[ccr, charClassList]; ccr.classNumber _ numCharClasses; numCharClasses _ numCharClasses + 1; IF GetToken[] = notToken THEN complement _ TRUE ELSE UnReadToken[]; ccr.looks _ looksRead; ccr.class _ charClass; r _ ccr; WHILE (c _ GetToken[]) # endClassToken DO SELECT c FROM subRangeToken => { IF ~(lastChar IN LegalInputCharacters) THEN SyntaxError[illegalCharacter]; c _ GetToken[]; IF ~(c IN LegalInputCharacters) THEN SyntaxError[illegalCharacter]; FOR x: CHAR IN [lastChar..c] DO charClass[x] _ TRUE; ENDLOOP; lastChar _ subRangeToken; }; NOT IN LegalInputCharacters => SyntaxError[illegalCharacter] ENDCASE => { lastChar _ c; charClass[c] _ TRUE; }; ENDLOOP; IF ignoreCase THEN FOR x: CHAR IN ['a..'z] DO SELECT TRUE FROM charClass[x] => charClass[x-'a+'A] _ TRUE; charClass[x-'a+'A] => charClass[x] _ TRUE; ENDCASE => NULL; ENDLOOP; IF complement THEN FOR x: CHAR IN LegalInputCharacters DO charClass[x] _ ~charClass[x]; ENDLOOP; }; <> <\ .. >> <<'special character One of the above. (Handled in tokenizer)>> <<'nnn octal control characters. (Handled in tokenizer)>> <<[character class] Character class notation, A..Z means ASCII interval A..Z>> <<[~character class] Not the characters in this class.>> <<# Any character.>> <<$ Node break.>> <<^ Beginning of node.>> <<\x Predefined patterns; x is an alphanumeric character, or . (Handled in tokenizer)>> ParseX: PROC [] RETURNS [p: ParseTree _ NIL] = { c: CHAR _ GetToken[]; SELECT c FROM IN LegalInputCharacters => IF ignoreCase AND c IN ['A..'Z] THEN p _ NEW[ParseTreeContent.charIC _ [charIC[c, looksRead]]] ELSE p _ NEW[ParseTreeContent.char _ [char[c, looksRead]]]; beginClassToken => p _ ParseCharClass[]; anyToken => p _ NEW[ParseTreeContent.anyChar _ [anyChar[looksRead]]]; nodeBreakToken => p _ NEW[ParseTreeContent.nodeBreak]; beginNodeToken => p _ NEW[ParseTreeContent.beginNode]; ENDCASE => UnReadToken[]; }; <> <> <<(P|P|...|P) Alternation.>> ParseZ: PROC [] RETURNS [p: ParseTree _ NIL] = { c: CHAR _ GetToken[]; SELECT c FROM beginAltToken => { l: LIST OF ParseTree _ NIL; DO q: ParseTree _ ParseP[]; IF q = NIL THEN q _ NEW[ParseTreeContent.noOp]; l _ CONS[q, l]; c _ GetToken[]; SELECT c FROM endAltToken => EXIT; altSepToken => {}; ENDCASE => SyntaxError[improperAltSeparator]; ENDLOOP; SELECT TRUE FROM l = NIL => RETURN [NIL]; l.rest = NIL => RETURN [l.first]; ENDCASE => TRUSTED { RETURN[NEW[ParseTreeContent.alt _ [alt[LOOPHOLE[List.Reverse[LOOPHOLE[l]]]]]]]; }; }; endAltToken, altSepToken => { UnReadToken[]; RETURN[NIL] } ENDCASE => { UnReadToken[]; p _ ParseX[]; }; }; <<>> <> <

> <> <> <> <> <> <> <<~ZP Deterministically match anything up to but not including P.>> ParseP: PROC [] RETURNS [ParseTree] = { l: LIST OF ParseTree _ NIL; p: ParseTree; DO c: CHAR _ GetToken[]; IF c = notToken THEN p _ NEW[ParseTreeContent.skipTo _ [skipTo[ParseZ[]]]] ELSE { UnReadToken[]; p _ ParseZ[]; IF p = NIL THEN EXIT; c _ GetToken[]; SELECT c FROM closureToken => p _ NEW[ParseTreeContent.closure _ [closure[p]]]; greedyClosureToken => p _ NEW[ParseTreeContent.greedyClosure _ [greedyClosure[p]]]; plusToken => p _ NEW[ParseTreeContent.concat _ [concat[LIST[p, NEW[ParseTreeContent.closure _ [closure[p]]]]]]]; greedyPlusToken => p _ NEW[ParseTreeContent.concat _ [concat[LIST[p, NEW[ParseTreeContent.greedyClosure _ [greedyClosure[p]]]]]]]; powerToken => { l: LIST OF ParseTree _ NIL; iterations: Index _ ParseNumber[FALSE]; WHILE iterations > 0 DO l _ CONS[p, l]; iterations _ iterations - 1; ENDLOOP; p _ NEW[ParseTreeContent.concat _ [concat[l]]]; }; ENDCASE => UnReadToken[]; }; IF p.type = concat THEN { pp: REF ParseTreeContent.concat _ NARROW[p]; FOR ll: LIST OF ParseTree _ pp.concats, ll.rest UNTIL ll = NIL DO l _ CONS[ll.first, l]; ENDLOOP; } ELSE l _ CONS[p, l]; IF l.first.type = closure THEN { q: REF ParseTreeContent.closure _ NARROW[l.first]; IF q.p.type = closure OR q.p.type = greedyClosure THEN l.first _ q.p; } ELSE IF l.first.type = greedyClosure THEN { q: REF ParseTreeContent.greedyClosure _ NARROW[l.first]; IF q.p.type = closure THEN { qq: REF ParseTreeContent.closure _ NARROW[q.p]; q.p _ qq.p; } ELSE IF q.p.type = greedyClosure THEN l.first _ q.p; }; ENDLOOP; SELECT TRUE FROM l = NIL => RETURN[NIL]; l.rest = NIL => RETURN[l.first]; ENDCASE => TRUSTED { RETURN[NEW[ParseTreeContent.concat _ [concat[LOOPHOLE[List.Reverse[LOOPHOLE[l]]]]]]]; }; }; <> <> <> <<T Named portions. Valid only at top level. Reserved name ALL.>> <<T Bound the number of CR's matched by P to number.>> <> <

> <> <> <> <.>> ParseTopLevel: PROC [] RETURNS [l: LIST OF ParseTree _ NIL, nameList: SimpleSymbolTable _ NIL, numberFields: Index _ 0] = { seenBeginAllToken, seenEndAllToken: BOOL _ FALSE; DO c: CHAR _ GetToken[]; SELECT c FROM beginFieldToken => { field: ParseTree _ NIL; [field, nameList, numberFields] _ ParseField[nameList, numberFields]; l _ CONS[field, l]; }; endFieldToken => SyntaxError[notInsideField]; beginAllToken => { IF seenBeginAllToken THEN SyntaxError[moreThanOneBeginAll]; seenBeginAllToken _ TRUE; l _ CONS[NEW[ParseTreeContent.beginALL], l]; }; endAllToken => { IF ~seenBeginAllToken OR seenEndAllToken THEN SyntaxError[noMatchingBeginAll]; seenEndAllToken _ TRUE; l _ CONS[NEW[ParseTreeContent.endALL], l]; }; endPatternToken => { IF ~seenBeginAllToken THEN l _ CONS[NEW[ParseTreeContent.endALL], l]; IF seenBeginAllToken AND ~seenEndAllToken THEN SyntaxError[noClosingEndAll]; l _ CONS[NEW[ParseTreeContent.endAll], l]; EXIT; }; ENDCASE => { p: ParseTree; UnReadToken[]; p _ ParseP[]; IF p = NIL THEN SyntaxError[illegalCharacter]; WITH p SELECT FROM z: REF ParseTreeContent.concat => TRUSTED { l _ LOOPHOLE[List.Nconc[List.Reverse[LOOPHOLE[z.concats]], LOOPHOLE[l]]]; }; ENDCASE => l _ CONS[p, l]; }; ENDLOOP; TRUSTED {l _ LOOPHOLE[List.Reverse[LOOPHOLE[l]]]}; IF ~seenBeginAllToken THEN l _ CONS[NEW[ParseTreeContent.beginALL], l]; l _ CONS[NEW[ParseTreeContent.beginAll], l]; }; ParseField: PROC [names: SimpleSymbolTable, number: Index] RETURNS [field: ParseTree, newNames: SimpleSymbolTable, newNumber: Index] = { <. The name is a sequence of alphabetic characters; the bound is a non-negative integer, and the pattern is a P. The bound and pattern are optional. By default, there is no bound and the pattern is #* with whatever looks the name began with. >> s: ParseTree _ NIL; name: ROPE _ NIL; c: CHAR _ Ascii.Lower[GetToken[]]; nameLooks: TextLooks.Looks _ looksRead; bound: INT _ -1; newNames _ names; DO IF ~(c IN ['a..'z]) THEN EXIT; name _ Rope.Concat[name, Rope.FromChar[c]]; c _ Ascii.Lower[GetToken[]]; ENDLOOP; IF name = NIL THEN SyntaxError[nameMustBeAString]; IF Rope.Equal[name, "all", FALSE] THEN SyntaxError[theAllNameIsReserved]; FOR l: SimpleSymbolTable _ names, l.rest UNTIL l = NIL DO IF Rope.Equal[name, l.first.name, FALSE] THEN { IF c # endFieldToken THEN SyntaxError[secondOccurenceOfFieldMustNotContainPattern]; IF ignoreCase THEN field _ NEW[ParseTreeContent.fieldEqualsIC _ [fieldEqualsIC[l.first.number]]] ELSE field _ NEW[ParseTreeContent.fieldEquals _ [fieldEquals[l.first.number]]]; RETURN[field, names, number]; }; ENDLOOP; newNumber _ number + 1; newNames _ CONS[NEW[SimpleSymbolTableEntry _ [name, newNumber]], newNames]; IF c = boundSepToken THEN bound _ ParseNumber[FALSE]; SELECT c FROM fieldSepToken => { s _ ParseP[]; IF GetToken[] # endFieldToken THEN SyntaxError[expectedEndOfField]; }; endFieldToken => { s _ NEW[ParseTreeContent.closure _ [closure[NEW[ParseTreeContent.anyChar _ [anyChar[nameLooks]]]]]]; }; ENDCASE => SyntaxError[expectedEndOfField]; field _ NEW[ParseTreeContent.field _ [field[newNumber, bound, s]]]; }; ParseNumber: PROC [octal: BOOL] RETURNS [number: Index _ 0] = { c: CHAR; IF octal THEN WHILE (c _ GetToken[]) IN ['0..'7] DO number _ 8*number + c - '0; ENDLOOP ELSE WHILE (c _ GetToken[]) IN ['0..'9] DO number _ 10*number + c - '0; ENDLOOP; IF c # '. THEN UnReadToken[]; }; IF addBounds THEN patternRope _ Rope.Concat["^", Rope.Concat[patternRope, "$"]]; pEnd _ MIN[Rope.Size[patternRope], patternStart+patternLen]; patternStart _ MIN[patternStart,pEnd]; pPos _ patternStart; finder _ NEW[FinderRec]; IF word THEN finder.wordSearch _ TRUE; <> <<>> finder.ropeReader _ RopeReader.Create[]; RopeReader.SetPosition[finder.ropeReader, patternRope, patternStart]; IF patternRuns # NIL AND ~ignoreLooks THEN { finder.lksReader _ LooksReader.Create[]; LooksReader.SetPosition[finder.lksReader, patternRuns, patternStart] } ELSE finder.lksReader _ NIL; [parsedPatternList, nameList, numberOfFields] _ ParseTopLevel[]; finder.nameArray _ NEW[NameArray[numberOfFields+1]]; FOR l: SimpleSymbolTable _ nameList, l.rest UNTIL l = NIL DO finder.nameArray[l.first.number].name _ l.first.name; ENDLOOP; forwardPattern _ NEW[ParseTreeContent.concat _ [concat[CONS[NEW[ParseTreeContent.closure _ [closure[NEW[ParseTreeContent.anyChar _ [anyChar[IgnoreLooks]]]]]], parsedPatternList]]]]; [forwardPattern, charClassList, numCharClasses] _ RegularExpression.OptimizeForwardSearch[forwardPattern, charClassList, numCharClasses]; TRUSTED { backwardPattern _ NEW[ParseTreeContent.concat _ [concat[LOOPHOLE[List.Append[ LOOPHOLE[parsedPatternList], LOOPHOLE[LIST[NEW[ParseTreeContent.closure _ [closure[NEW[ParseTreeContent.anyChar _ [anyChar[IgnoreLooks]]]]]]]]]]]]]; }; [backwardPattern, charClassList, numCharClasses] _ RegularExpression.OptimizeBackwardSearch[backwardPattern, charClassList, numCharClasses]; finder.classes _ NEW[ClassArray[numCharClasses]]; FOR l: LIST OF ParseTree _ charClassList, l.rest UNTIL l = NIL DO WITH l.first SELECT FROM x: REF ParseTreeContent.class => finder.classes[x.classNumber] _ x.class; x: REF ParseTreeContent.skipOverClass => finder.classes[x.classNumber] _ x.class; ENDCASE => ERROR; ENDLOOP; finder.stack _ NEW[StackContent _ [0, NEW[PatternStackArray[100]], NEW[TextStackArray[100]], NEW[ReturnCodeArray[100]]]]; -- Interim hack. finder.forwardProgram _ RegularExpression.Compile[forwardPattern]; finder.backwardProgram _ RegularExpression.Compile[backwardPattern]; finder.wordSearch _ word; }; }.