<> <> <> <> DIRECTORY Ascii USING [Lower, Upper], Basics USING [BITAND, BITOR, BITNOT], List USING [Append, DReverse, Reverse], TextLooks USING [Looks, noLooks, allLooks], TextLooksSupport USING [LooksOR], RegularExpression USING [CharClass, CharClassContent, Index, LegalInputCharacters, ParseTree, ParseTreeContent, ParseTypes]; RegExpFindOptimizeImpl: CEDAR MONITOR IMPORTS Ascii, List, Basics, TextLooksSupport EXPORTS RegularExpression = { OPEN RegularExpression; emptyCharClass: CharClassContent = ALL[FALSE]; allCharClass: CharClassContent = [FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE]; charClassList: LIST OF ParseTree _ NIL; numCharClasses: Index; NodeBreakType: TYPE = {beginning, end, both, none}; OptimizeForwardSearch: PUBLIC ENTRY PROC[p: ParseTree, ccl: LIST OF ParseTree, numCcl: Index] RETURNS [np: ParseTree, newCcl: LIST OF ParseTree, newNumCcl: Index] = { ENABLE UNWIND => NULL; follow: CharClass _ NEW[CharClassContent _ allCharClass]; charClassList _ ccl; numCharClasses _ numCcl; np _ Optimize[TRUE, p, follow, ALL[TRUE], end].np; newCcl _ charClassList; charClassList _ NIL; newNumCcl _ numCharClasses; }; OptimizeBackwardSearch: PUBLIC ENTRY PROC[p: ParseTree, ccl: LIST OF ParseTree, numCcl: Index] RETURNS [np: ParseTree, newCcl: LIST OF ParseTree, newNumCcl: Index] = { ENABLE UNWIND => NULL; follow: CharClass _ NEW[CharClassContent _ allCharClass]; charClassList _ ccl; numCharClasses _ numCcl; np _ Optimize[FALSE, p, follow, ALL[TRUE], beginning].np; newCcl _ charClassList; charClassList _ NIL; newNumCcl _ numCharClasses; }; Optimize: PROC [forwards: BOOL, p: ParseTree, follow: CharClass, looksFollow: TextLooks.Looks, nodeBreak: NodeBreakType] RETURNS [np: ParseTree, looksStart: TextLooks.Looks, nodeBreakStart: NodeBreakType, matchesNull: BOOL] = { np _ p; matchesNull _ FALSE; WITH p SELECT FROM x: REF ParseTreeContent.concat => { l: LIST OF ParseTree; [l, looksStart, nodeBreakStart, matchesNull] _ IF forwards THEN OptimizeConcatForwards[x.concats, follow, looksFollow, nodeBreak] ELSE OptimizeConcatBackwards[x.concats, follow, looksFollow, nodeBreak]; np _ NEW[ParseTreeContent.concat _ [concat[l]]]; }; x: REF ParseTreeContent.alt => { ll: LIST OF ParseTree; xp: ParseTree; xStart: CharClass _ NEW[CharClassContent]; start: CharClass _ NEW[CharClassContent _ emptyCharClass]; xLooksStart: TextLooks.Looks; xNodeBreakStart: NodeBreakType; start^ _ follow^; [xp, looksStart, nodeBreakStart, matchesNull] _ Optimize[forwards, x.alts.first, start, looksFollow, nodeBreak]; ll _ CONS[xp, NIL]; FOR l: LIST OF ParseTree _ x.alts.rest, l.rest UNTIL l = NIL DO oneAltMatchesNull: BOOL _ FALSE; xStart^ _ follow^; [xp, xLooksStart, xNodeBreakStart, oneAltMatchesNull] _ Optimize[forwards, l.first, xStart, looksFollow, nodeBreak]; matchesNull _ matchesNull OR oneAltMatchesNull; ll _ CONS[xp, ll]; CharClassUnion[start, xStart]; looksStart _ TextLooksSupport.LooksOR[looksStart, xLooksStart]; IF nodeBreakStart = none THEN nodeBreakStart _ xNodeBreakStart ELSE IF xNodeBreakStart # none AND nodeBreakStart # xNodeBreakStart THEN nodeBreakStart _ both; ENDLOOP; follow^ _ start^; TRUSTED {ll _ LOOPHOLE[List.DReverse[LOOPHOLE[ll]]]}; np _ NEW[ParseTreeContent.alt _ [alt[ll]]]; }; x: REF ParseTreeContent.field => { xp: ParseTree; [xp, looksStart, nodeBreakStart, matchesNull] _ Optimize[forwards, x.p, follow, looksFollow, nodeBreak]; np _ NEW[ParseTreeContent.field _ [field[x.number, x.bound, xp]]]; }; x: REF ParseTreeContent.char => { follow^ _ ALL[FALSE]; follow[x.char] _ TRUE; looksStart _ x.looks; nodeBreakStart _ none; }; x: REF ParseTreeContent.charIC => { follow^ _ ALL[FALSE]; follow[x.char] _ TRUE; follow[Ascii.Lower[x.char]] _ TRUE; looksStart _ x.looks; nodeBreakStart _ none; }; x: REF ParseTreeContent.class => { follow^ _ x.class^; looksStart _ x.looks; nodeBreakStart _ none; }; x: REF ParseTreeContent.anyChar => { follow^ _ allCharClass; looksStart _ x.looks; nodeBreakStart _ none; }; x: REF ParseTreeContent.nodeBreak => { follow^ _ ALL[FALSE]; looksStart _ TextLooks.noLooks; nodeBreakStart _ end; matchesNull _ TRUE; }; x: REF ParseTreeContent.beginNode => { follow^ _ ALL[FALSE]; looksStart _ TextLooks.noLooks; nodeBreakStart _ beginning; matchesNull _ TRUE; }; x: REF ParseTreeContent.succeed => { follow^ _ allCharClass; looksStart _ TextLooks.allLooks; nodeBreakStart _ none; }; x: REF ParseTreeContent.skipToChar => { follow^ _ allCharClass; looksStart _ TextLooksSupport.LooksOR[x.looks, looksFollow]; nodeBreakStart _ nodeBreak; matchesNull _ TRUE; }; x: REF ParseTreeContent.skipOverChar => { follow[x.char] _ TRUE; looksStart _ TextLooksSupport.LooksOR[x.looks, looksFollow]; nodeBreakStart _ nodeBreak; matchesNull _ TRUE; }; x: REF ParseTreeContent.skipToCharIC => { follow^ _ allCharClass; looksStart _ TextLooksSupport.LooksOR[x.looks, looksFollow]; nodeBreakStart _ nodeBreak; matchesNull _ TRUE; }; x: REF ParseTreeContent.skipOverCharIC => { follow[x.char] _ TRUE; follow[Ascii.Lower[x.char]] _ TRUE; looksStart _ TextLooksSupport.LooksOR[x.looks, looksFollow]; nodeBreakStart _ nodeBreak; matchesNull _ TRUE; }; x: REF ParseTreeContent.skipTo => { xp: ParseTree; [xp, looksStart, nodeBreakStart, matchesNull] _ Optimize[forwards, x.p, follow, looksFollow, nodeBreak]; np _ OptimizeSkipTo[NEW[ParseTreeContent.skipTo _ [skipTo[xp]]]]; follow^ _ allCharClass; looksStart _ TextLooks.allLooks; nodeBreakStart _ both; matchesNull _ TRUE; }; x: REF ParseTreeContent.skipOver => { xp: ParseTree; start: CharClass _ NEW[CharClassContent _ follow^]; [xp, looksStart, nodeBreakStart, matchesNull] _ Optimize[forwards, x.p, start, looksFollow, nodeBreak]; np _ NEW[ParseTreeContent.skipOver _ [skipOver[xp]]]; CharClassUnion[follow, start]; looksStart _ TextLooksSupport.LooksOR[looksStart, looksFollow]; IF nodeBreakStart = none THEN nodeBreakStart _ nodeBreak ELSE IF nodeBreak # none AND nodeBreakStart # nodeBreak THEN nodeBreakStart _ both; matchesNull _ TRUE; }; x: REF ParseTreeContent.closure => [np, looksStart, nodeBreakStart, matchesNull] _ OptimizeClosure[closure, forwards, x.p, follow, looksFollow, nodeBreak]; x: REF ParseTreeContent.greedyClosure => [np, looksStart, nodeBreakStart, matchesNull] _ OptimizeClosure[greedyClosure, forwards, x.p, follow, looksFollow, nodeBreak]; ENDCASE => { SELECT p.type FROM fieldEquals, fieldEqualsIC, fail => { follow^ _ allCharClass; looksStart _ TextLooks.allLooks; nodeBreakStart _ both; matchesNull _ TRUE; }; noOp, beginAll, endAll, beginALL, endALL => { <> looksStart _ looksFollow; nodeBreakStart _ nodeBreak; matchesNull _ TRUE; }; ENDCASE => ERROR; }; }; OptimizeConcatForwards: PROC [l: LIST OF ParseTree, follow: CharClass, looksFollow: TextLooks.Looks, nodeBreak: NodeBreakType] RETURNS [lp: LIST OF ParseTree, looksStart: TextLooks.Looks, nodeBreakStart: NodeBreakType, matchesNull: BOOL] = TRUSTED { [lp, looksStart, nodeBreakStart, matchesNull] _ OptimizeConcat[TRUE, LOOPHOLE[List.Reverse[LOOPHOLE[l]]], follow, looksFollow, nodeBreak]; }; OptimizeConcatBackwards: PROC [l: LIST OF ParseTree, follow: CharClass, looksFollow: TextLooks.Looks, nodeBreak: NodeBreakType] RETURNS [lp: LIST OF ParseTree, looksStart: TextLooks.Looks, nodeBreakStart: NodeBreakType, matchesNull: BOOL] = TRUSTED { [lp, looksStart, nodeBreakStart, matchesNull] _ OptimizeConcat[FALSE, LOOPHOLE[List.Append[LOOPHOLE[l]]], follow, looksFollow, nodeBreak]; }; OptimizeConcat: PROC [forwards: BOOL, l: LIST OF ParseTree, follow: CharClass, looksFollow: TextLooks.Looks, nodeBreak: NodeBreakType] RETURNS [lp: LIST OF ParseTree, looksStart: TextLooks.Looks, nodeBreakStart: NodeBreakType, matchesNull: BOOL _ TRUE] = { looksStart _ looksFollow; nodeBreakStart _ nodeBreak; lp _ NIL; WHILE l # NIL DO first: LIST OF ParseTree _ l; thisOneMatchesNull: BOOL; l _ l.rest; first.rest _ lp; lp _ first; [lp.first, looksStart, nodeBreakStart, thisOneMatchesNull] _ Optimize[forwards, lp.first, follow, looksStart, nodeBreakStart]; matchesNull _ matchesNull AND thisOneMatchesNull; ENDLOOP; }; OptimizeClosure: PROC [kind: ParseTypes, forwards: BOOL, p: ParseTree, follow: CharClass, looksFollow: TextLooks.Looks, nodeBreak: NodeBreakType] RETURNS [np: ParseTree, looksStart: TextLooks.Looks, nodeBreakStart: NodeBreakType, matchesNull: BOOL _ TRUE] = { Make: PROC[p: ParseTree] RETURNS [cl: ParseTree] = { SELECT kind FROM closure => cl _ NEW[ParseTreeContent.closure _ [closure[p]]]; greedyClosure => cl _ NEW[ParseTreeContent.greedyClosure _ [greedyClosure[p]]]; ENDCASE => ERROR; }; MakeCareful: PROC[p: ParseTree] RETURNS [cl: ParseTree] = { SELECT kind FROM closure => cl _ NEW[ParseTreeContent.carefulClosure _ [carefulClosure[p]]]; greedyClosure => cl _ NEW[ParseTreeContent.carefulGreedyClosure _ [carefulGreedyClosure[p]]]; ENDCASE => ERROR; }; WITH p SELECT FROM z: REF ParseTreeContent.anyChar => { IF follow^ = emptyCharClass THEN { IF (nodeBreak = beginning AND forwards) OR (nodeBreak = end AND ~forwards) THEN np _ NEW[ParseTreeContent.noOp] ELSE IF (nodeBreak = end AND forwards) OR (nodeBreak = beginning AND ~forwards) THEN np _ NEW[ParseTreeContent.skipToNodeBreak _ [skipToNodeBreak[z.looks]]] ELSE np _ Make[p]; } ELSE IF follow^ # allCharClass THEN { start: CharClass _ NEW[CharClassContent _ allCharClass]; rest: CharClass _ NEW[CharClassContent _ follow^]; q: ParseTree; CharClassDifference[start, rest]; q _ SkipOverClassNode[start, z.looks]; np _ NEW[ParseTreeContent.concat _ [concat[LIST[q, Make[NEW[ParseTreeContent.concat _ [concat[LIST[MatchClassNode[rest, z.looks], q]]]]]]]]]; } ELSE np _ Make[p]; follow^ _ allCharClass; looksStart _ TextLooksSupport.LooksOR[z.looks, looksFollow]; nodeBreakStart _ nodeBreak; }; z: REF ParseTreeContent.char => { IF ~follow[z.char] THEN np _ NEW[ParseTreeContent.skipOverChar _ [skipOverChar[z.char, z.looks]]] ELSE np _ Make[p]; follow[z.char] _ TRUE; looksStart _ TextLooksSupport.LooksOR[z.looks, looksFollow]; nodeBreakStart _ nodeBreak; }; z: REF ParseTreeContent.charIC => { IF ~follow[z.char] AND ~follow[Ascii.Lower[z.char]] THEN np _ NEW[ParseTreeContent.skipOverCharIC _ [skipOverCharIC[z.char, z.looks]]] ELSE np _ Make[p]; follow[z.char] _ TRUE; follow[Ascii.Lower[z.char]] _ TRUE; looksStart _ TextLooksSupport.LooksOR[z.looks, looksFollow]; nodeBreakStart _ nodeBreak; }; z: REF ParseTreeContent.class => { IF follow^ = emptyCharClass THEN np _ NEW[ParseTreeContent.skipOverClass _ [skipOverClass[z.class, z.looks, z.classNumber]]] ELSE { thisClass: CharClass _ NEW[CharClassContent _ z.class^]; rest: CharClass _ NEW[CharClassContent _ follow^]; CharClassIntersection[rest, thisClass]; CharClassDifference[thisClass, rest]; IF rest^ = emptyCharClass THEN np _ SkipOverClassNode[thisClass, z.looks] ELSE IF thisClass^ # emptyCharClass THEN { q: ParseTree _ SkipOverClassNode[thisClass, z.looks]; np _ NEW[ParseTreeContent.concat _ [concat[LIST[q, Make[NEW[ParseTreeContent.concat _ [concat[LIST[MatchClassNode[rest, z.looks], q]]]]]]]]]; } ELSE np _ Make[p]; CharClassUnion[follow, thisClass]; }; CharClassUnion[follow, z.class]; looksStart _ TextLooksSupport.LooksOR[z.looks, looksFollow]; nodeBreakStart _ nodeBreak; }; ENDCASE => { xp: ParseTree; thisOneMatchesNull: BOOL; [xp, looksStart, nodeBreakStart, thisOneMatchesNull] _ Optimize[forwards, p, follow, looksFollow, nodeBreak]; IF thisOneMatchesNull THEN np _ MakeCareful[xp] ELSE np _ Make[xp]; looksStart _ TextLooksSupport.LooksOR[looksStart, looksFollow]; IF nodeBreakStart = none THEN nodeBreakStart _ nodeBreak ELSE IF nodeBreak # none AND nodeBreakStart # nodeBreak THEN nodeBreakStart _ both; }; }; OptimizeSkipTo: PROC [x: REF ParseTreeContent.skipTo] RETURNS [ParseTree] = { IF x.p.type = concat THEN { q: REF ParseTreeContent.concat _ NARROW[x.p]; FOR l: LIST OF ParseTree _ q.concats, l.rest UNTIL l = NIL DO SELECT l.first.type FROM char, charIC, class, anyChar => NULL; ENDCASE => RETURN[x]; ENDLOOP; RETURN[NEW[ParseTreeContent.skipToString _ [skipToString[x.p]]]]; }; RETURN[x]; }; ClassType: TYPE = {wholeSet, char, charIC, notChar, notCharIC}; AnalyzeClass: PROC [x: CharClass] RETURNS [kind: ClassType, c: CHAR] = { classPopulation: NAT = LegalInputCharacters.LAST - LegalInputCharacters.FIRST + 1; lastC, lastNotC: CHAR; population: NAT _ 0; c _ 'a; FOR chars: CHAR IN LegalInputCharacters DO IF x[chars] THEN { population _ population + 1; lastC _ chars; } ELSE lastNotC _ chars; ENDLOOP; SELECT population FROM 0 => ERROR; 1 => { kind _ char; c _ lastC; }; 2 => { IF lastC IN ['a..'z] AND x[Ascii.Upper[lastC]] THEN { kind _ charIC; c _ Ascii.Upper[lastC]; } ELSE kind _ wholeSet; }; classPopulation-2 => { IF lastNotC IN ['a..'z] AND ~x[Ascii.Upper[lastNotC]] THEN { kind _ notCharIC; c _ Ascii.Upper[lastNotC]; } ELSE kind _ wholeSet; }; classPopulation-1 => { kind _ notChar; c _ lastNotC; }; classPopulation => ERROR; ENDCASE => kind _ wholeSet; }; SkipOverClassNode: PROC [class: CharClass, looks: TextLooks.Looks] RETURNS [p: ParseTree] = { classType: ClassType; c: CHAR; [classType, c] _ AnalyzeClass[class]; SELECT classType FROM wholeSet => { p _ NEW[ParseTreeContent.skipOverClass _ [skipOverClass[NEW[CharClassContent _ class^], looks, numCharClasses]]]; charClassList _ CONS[p, charClassList]; numCharClasses _ numCharClasses + 1; }; char => p _ NEW[ParseTreeContent.skipOverChar _ [skipOverChar[char: c, looks: looks]]]; charIC => p _ NEW[ParseTreeContent.skipOverCharIC _ [skipOverCharIC[char: c, looks: looks]]]; notChar => p _ NEW[ParseTreeContent.skipToChar _ [skipToChar[char: c, looks: looks]]]; notCharIC => p _ NEW[ParseTreeContent.skipToCharIC _ [skipToCharIC[char: c, looks: looks]]]; ENDCASE => ERROR; }; MatchClassNode: PROC [class: CharClass, looks: TextLooks.Looks] RETURNS [p: ParseTree] = { classType: ClassType; c: CHAR; [classType, c] _ AnalyzeClass[class]; SELECT classType FROM wholeSet, notChar, notCharIC => { p _ NEW[ParseTreeContent.class _ [class[NEW[CharClassContent _ class^], looks, numCharClasses]]]; charClassList _ CONS[p, charClassList]; numCharClasses _ numCharClasses + 1; }; char => p _ NEW[ParseTreeContent.char _ [char[c, looks]]]; charIC => p _ NEW[ParseTreeContent.charIC _ [charIC[c, looks]]]; ENDCASE => ERROR; }; PseudoSet: TYPE = REF ARRAY [0..8) OF CARDINAL; CharClassUnion: PROC[c1, c2: CharClass] = TRUSTED { ps1: PseudoSet _ LOOPHOLE[c1]; ps2: PseudoSet _ LOOPHOLE[c2]; FOR i: CARDINAL IN [0..8) DO ps1[i] _ Basics.BITOR[ps1[i], ps2[i]]; ENDLOOP; }; CharClassIntersection: PROC[c1, c2: CharClass] = TRUSTED { ps1: PseudoSet _ LOOPHOLE[c1]; ps2: PseudoSet _ LOOPHOLE[c2]; FOR i: CARDINAL IN [0..8) DO ps1[i] _ Basics.BITAND[ps1[i], ps2[i]]; ENDLOOP; }; CharClassDifference: PROC[c1, c2: CharClass] = TRUSTED { ps1: PseudoSet _ LOOPHOLE[c1]; ps2: PseudoSet _ LOOPHOLE[c2]; FOR i: CARDINAL IN [0..8) DO ps1[i] _ Basics.BITAND[ps1[i], Basics.BITNOT[ps2[i]]]; ENDLOOP; c1[0C] _ FALSE; }; }.