RegExpFindOptimizeImpl.mesa
Copyright © 1985 by Xerox Corporation. All rights reserved.
Nix, December 21, 1983 5:27 pm
Russ Atkinson (RRA) March 11, 1985 7:43:02 pm PST
DIRECTORY
Ascii USING [Lower, Upper],
Basics USING [BITAND, BITOR, BITNOT],
List USING [Append, DReverse, Reverse],
TextLooks USING [Looks, noLooks, allLooks],
TextLooksSupport USING [LooksOR],
RegularExpression USING [CharClass, CharClassContent, Index, LegalInputCharacters, ParseTree, ParseTreeContent, ParseTypes];
RegExpFindOptimizeImpl: CEDAR MONITOR
IMPORTS Ascii, List, Basics, TextLooksSupport
EXPORTS RegularExpression = {
OPEN RegularExpression;
emptyCharClass: CharClassContent = ALL[FALSE];
allCharClass: CharClassContent = [FALSE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE];
charClassList: LIST OF ParseTree ← NIL;
numCharClasses: Index;
NodeBreakType: TYPE = {beginning, end, both, none};
OptimizeForwardSearch: PUBLIC ENTRY PROC[p: ParseTree, ccl: LIST OF ParseTree, numCcl: Index] RETURNS [np: ParseTree, newCcl: LIST OF ParseTree, newNumCcl: Index] = {
ENABLE UNWIND => NULL;
follow: CharClass ← NEW[CharClassContent ← allCharClass];
charClassList ← ccl;
numCharClasses ← numCcl;
np ← Optimize[TRUE, p, follow, ALL[TRUE], end].np;
newCcl ← charClassList;
charClassList ← NIL;
newNumCcl ← numCharClasses;
};
OptimizeBackwardSearch: PUBLIC ENTRY PROC[p: ParseTree, ccl: LIST OF ParseTree, numCcl: Index] RETURNS [np: ParseTree, newCcl: LIST OF ParseTree, newNumCcl: Index] = {
ENABLE UNWIND => NULL;
follow: CharClass ← NEW[CharClassContent ← allCharClass];
charClassList ← ccl;
numCharClasses ← numCcl;
np ← Optimize[FALSE, p, follow, ALL[TRUE], beginning].np;
newCcl ← charClassList;
charClassList ← NIL;
newNumCcl ← numCharClasses;
};
Optimize: PROC [forwards: BOOL, p: ParseTree, follow: CharClass, looksFollow: TextLooks.Looks, nodeBreak: NodeBreakType] RETURNS [np: ParseTree, looksStart: TextLooks.Looks, nodeBreakStart: NodeBreakType, matchesNull: BOOL] = {
np ← p;
matchesNull ← FALSE;
WITH p SELECT FROM
x: REF ParseTreeContent.concat => {
l: LIST OF ParseTree;
[l, looksStart, nodeBreakStart, matchesNull] ←
IF forwards THEN OptimizeConcatForwards[x.concats, follow, looksFollow, nodeBreak]
ELSE OptimizeConcatBackwards[x.concats, follow, looksFollow, nodeBreak];
np ← NEW[ParseTreeContent.concat ← [concat[l]]];
};
x: REF ParseTreeContent.alt => {
ll: LIST OF ParseTree;
xp: ParseTree;
xStart: CharClass ← NEW[CharClassContent];
start: CharClass ← NEW[CharClassContent ← emptyCharClass];
xLooksStart: TextLooks.Looks;
xNodeBreakStart: NodeBreakType;
start^ ← follow^;
[xp, looksStart, nodeBreakStart, matchesNull] ← Optimize[forwards, x.alts.first, start, looksFollow, nodeBreak];
ll ← CONS[xp, NIL];
FOR l: LIST OF ParseTree ← x.alts.rest, l.rest UNTIL l = NIL DO
oneAltMatchesNull: BOOLFALSE;
xStart^ ← follow^;
[xp, xLooksStart, xNodeBreakStart, oneAltMatchesNull] ← Optimize[forwards, l.first, xStart, looksFollow, nodeBreak];
matchesNull ← matchesNull OR oneAltMatchesNull;
ll ← CONS[xp, ll];
CharClassUnion[start, xStart];
looksStart ← TextLooksSupport.LooksOR[looksStart, xLooksStart];
IF nodeBreakStart = none THEN
nodeBreakStart ← xNodeBreakStart
ELSE IF xNodeBreakStart # none AND nodeBreakStart # xNodeBreakStart THEN
nodeBreakStart ← both;
ENDLOOP;
follow^ ← start^;
TRUSTED {ll ← LOOPHOLE[List.DReverse[LOOPHOLE[ll]]]};
np ← NEW[ParseTreeContent.alt ← [alt[ll]]];
};
x: REF ParseTreeContent.field => {
xp: ParseTree;
[xp, looksStart, nodeBreakStart, matchesNull] ← Optimize[forwards, x.p, follow, looksFollow, nodeBreak];
np ← NEW[ParseTreeContent.field ← [field[x.number, x.bound, xp]]];
};
x: REF ParseTreeContent.char => {
follow^ ← ALL[FALSE];
follow[x.char] ← TRUE;
looksStart ← x.looks;
nodeBreakStart ← none;
};
x: REF ParseTreeContent.charIC => {
follow^ ← ALL[FALSE];
follow[x.char] ← TRUE;
follow[Ascii.Lower[x.char]] ← TRUE;
looksStart ← x.looks;
nodeBreakStart ← none;
};
x: REF ParseTreeContent.class => {
follow^ ← x.class^;
looksStart ← x.looks;
nodeBreakStart ← none;
};
x: REF ParseTreeContent.anyChar => {
follow^ ← allCharClass;
looksStart ← x.looks;
nodeBreakStart ← none;
};
x: REF ParseTreeContent.nodeBreak => {
follow^ ← ALL[FALSE];
looksStart ← TextLooks.noLooks;
nodeBreakStart ← end;
matchesNull ← TRUE;
};
x: REF ParseTreeContent.beginNode => {
follow^ ← ALL[FALSE];
looksStart ← TextLooks.noLooks;
nodeBreakStart ← beginning;
matchesNull ← TRUE;
};
x: REF ParseTreeContent.succeed => {
follow^ ← allCharClass;
looksStart ← TextLooks.allLooks;
nodeBreakStart ← none;
};
x: REF ParseTreeContent.skipToChar => {
follow^ ← allCharClass;
looksStart ← TextLooksSupport.LooksOR[x.looks, looksFollow];
nodeBreakStart ← nodeBreak;
matchesNull ← TRUE;
};
x: REF ParseTreeContent.skipOverChar => {
follow[x.char] ← TRUE;
looksStart ← TextLooksSupport.LooksOR[x.looks, looksFollow];
nodeBreakStart ← nodeBreak;
matchesNull ← TRUE;
};
x: REF ParseTreeContent.skipToCharIC => {
follow^ ← allCharClass;
looksStart ← TextLooksSupport.LooksOR[x.looks, looksFollow];
nodeBreakStart ← nodeBreak;
matchesNull ← TRUE;
};
x: REF ParseTreeContent.skipOverCharIC => {
follow[x.char] ← TRUE;
follow[Ascii.Lower[x.char]] ← TRUE;
looksStart ← TextLooksSupport.LooksOR[x.looks, looksFollow];
nodeBreakStart ← nodeBreak;
matchesNull ← TRUE;
};
x: REF ParseTreeContent.skipTo => {
xp: ParseTree;
[xp, looksStart, nodeBreakStart, matchesNull] ← Optimize[forwards, x.p, follow, looksFollow, nodeBreak];
np ← OptimizeSkipTo[NEW[ParseTreeContent.skipTo ← [skipTo[xp]]]];
follow^ ← allCharClass;
looksStart ← TextLooks.allLooks;
nodeBreakStart ← both;
matchesNull ← TRUE;
};
x: REF ParseTreeContent.skipOver => {
xp: ParseTree;
start: CharClass ← NEW[CharClassContent ← follow^];
[xp, looksStart, nodeBreakStart, matchesNull] ← Optimize[forwards, x.p, start, looksFollow, nodeBreak];
np ← NEW[ParseTreeContent.skipOver ← [skipOver[xp]]];
CharClassUnion[follow, start];
looksStart ← TextLooksSupport.LooksOR[looksStart, looksFollow];
IF nodeBreakStart = none THEN
nodeBreakStart ← nodeBreak
ELSE IF nodeBreak # none AND nodeBreakStart # nodeBreak THEN
nodeBreakStart ← both;
matchesNull ← TRUE;
};
x: REF ParseTreeContent.closure =>
[np, looksStart, nodeBreakStart, matchesNull] ←
OptimizeClosure[closure, forwards, x.p, follow, looksFollow, nodeBreak];
x: REF ParseTreeContent.greedyClosure =>
[np, looksStart, nodeBreakStart, matchesNull] ←
OptimizeClosure[greedyClosure, forwards, x.p, follow, looksFollow, nodeBreak];
ENDCASE => {
SELECT p.type FROM
fieldEquals, fieldEqualsIC, fail => {
follow^ ← allCharClass;
looksStart ← TextLooks.allLooks;
nodeBreakStart ← both;
matchesNull ← TRUE;
};
noOp, beginAll, endAll, beginALL, endALL => {
preserve follow^
looksStart ← looksFollow;
nodeBreakStart ← nodeBreak;
matchesNull ← TRUE;
};
ENDCASE => ERROR;
};
};
OptimizeConcatForwards: PROC [l: LIST OF ParseTree, follow: CharClass, looksFollow: TextLooks.Looks, nodeBreak: NodeBreakType] RETURNS [lp: LIST OF ParseTree, looksStart: TextLooks.Looks, nodeBreakStart: NodeBreakType, matchesNull: BOOL] = TRUSTED {
[lp, looksStart, nodeBreakStart, matchesNull] ← OptimizeConcat[TRUE, LOOPHOLE[List.Reverse[LOOPHOLE[l]]], follow, looksFollow, nodeBreak];
};
OptimizeConcatBackwards: PROC [l: LIST OF ParseTree, follow: CharClass, looksFollow: TextLooks.Looks, nodeBreak: NodeBreakType] RETURNS [lp: LIST OF ParseTree, looksStart: TextLooks.Looks, nodeBreakStart: NodeBreakType, matchesNull: BOOL] = TRUSTED {
[lp, looksStart, nodeBreakStart, matchesNull] ← OptimizeConcat[FALSE, LOOPHOLE[List.Append[LOOPHOLE[l]]], follow, looksFollow, nodeBreak];
};
OptimizeConcat: PROC [forwards: BOOL, l: LIST OF ParseTree, follow: CharClass, looksFollow: TextLooks.Looks, nodeBreak: NodeBreakType] RETURNS [lp: LIST OF ParseTree, looksStart: TextLooks.Looks, nodeBreakStart: NodeBreakType, matchesNull: BOOLTRUE] = {
looksStart ← looksFollow;
nodeBreakStart ← nodeBreak;
lp ← NIL;
WHILE l # NIL DO
first: LIST OF ParseTree ← l;
thisOneMatchesNull: BOOL;
l ← l.rest;
first.rest ← lp;
lp ← first;
[lp.first, looksStart, nodeBreakStart, thisOneMatchesNull] ← Optimize[forwards, lp.first, follow, looksStart, nodeBreakStart];
matchesNull ← matchesNull AND thisOneMatchesNull;
ENDLOOP;
};
OptimizeClosure: PROC [kind: ParseTypes, forwards: BOOL, p: ParseTree, follow: CharClass, looksFollow: TextLooks.Looks, nodeBreak: NodeBreakType] RETURNS [np: ParseTree, looksStart: TextLooks.Looks, nodeBreakStart: NodeBreakType, matchesNull: BOOLTRUE] = {
Make: PROC[p: ParseTree] RETURNS [cl: ParseTree] = {
SELECT kind FROM
closure => cl ← NEW[ParseTreeContent.closure ← [closure[p]]];
greedyClosure => cl ← NEW[ParseTreeContent.greedyClosure ← [greedyClosure[p]]];
ENDCASE => ERROR;
};
MakeCareful: PROC[p: ParseTree] RETURNS [cl: ParseTree] = {
SELECT kind FROM
closure => cl ← NEW[ParseTreeContent.carefulClosure ← [carefulClosure[p]]];
greedyClosure => cl ← NEW[ParseTreeContent.carefulGreedyClosure ← [carefulGreedyClosure[p]]];
ENDCASE => ERROR;
};
WITH p SELECT FROM
z: REF ParseTreeContent.anyChar => {
IF follow^ = emptyCharClass THEN {
IF (nodeBreak = beginning AND forwards) OR (nodeBreak = end AND ~forwards) THEN
np ← NEW[ParseTreeContent.noOp]
ELSE IF (nodeBreak = end AND forwards) OR (nodeBreak = beginning AND ~forwards) THEN
np ← NEW[ParseTreeContent.skipToNodeBreak ← [skipToNodeBreak[z.looks]]]
ELSE
np ← Make[p];
}
ELSE IF follow^ # allCharClass THEN {
start: CharClass ← NEW[CharClassContent ← allCharClass];
rest: CharClass ← NEW[CharClassContent ← follow^];
q: ParseTree;
CharClassDifference[start, rest];
q ← SkipOverClassNode[start, z.looks];
np ← NEW[ParseTreeContent.concat ←
[concat[LIST[q, Make[NEW[ParseTreeContent.concat ← [concat[LIST[MatchClassNode[rest, z.looks], q]]]]]]]]];
}
ELSE
np ← Make[p];
follow^ ← allCharClass;
looksStart ← TextLooksSupport.LooksOR[z.looks, looksFollow];
nodeBreakStart ← nodeBreak;
};
z: REF ParseTreeContent.char => {
IF ~follow[z.char] THEN
np ← NEW[ParseTreeContent.skipOverChar ← [skipOverChar[z.char, z.looks]]]
ELSE
np ← Make[p];
follow[z.char] ← TRUE;
looksStart ← TextLooksSupport.LooksOR[z.looks, looksFollow];
nodeBreakStart ← nodeBreak;
};
z: REF ParseTreeContent.charIC => {
IF ~follow[z.char] AND ~follow[Ascii.Lower[z.char]] THEN
np ← NEW[ParseTreeContent.skipOverCharIC ← [skipOverCharIC[z.char, z.looks]]]
ELSE
np ← Make[p];
follow[z.char] ← TRUE;
follow[Ascii.Lower[z.char]] ← TRUE;
looksStart ← TextLooksSupport.LooksOR[z.looks, looksFollow];
nodeBreakStart ← nodeBreak;
};
z: REF ParseTreeContent.class => {
IF follow^ = emptyCharClass THEN
np ← NEW[ParseTreeContent.skipOverClass ← [skipOverClass[z.class, z.looks, z.classNumber]]]
ELSE {
thisClass: CharClass ← NEW[CharClassContent ← z.class^];
rest: CharClass ← NEW[CharClassContent ← follow^];
CharClassIntersection[rest, thisClass];
CharClassDifference[thisClass, rest];
IF rest^ = emptyCharClass THEN
np ← SkipOverClassNode[thisClass, z.looks]
ELSE IF thisClass^ # emptyCharClass THEN {
q: ParseTree ← SkipOverClassNode[thisClass, z.looks];
np ← NEW[ParseTreeContent.concat ←
[concat[LIST[q, Make[NEW[ParseTreeContent.concat ← [concat[LIST[MatchClassNode[rest, z.looks], q]]]]]]]]];
}
ELSE
np ← Make[p];
CharClassUnion[follow, thisClass];
};
CharClassUnion[follow, z.class];
looksStart ← TextLooksSupport.LooksOR[z.looks, looksFollow];
nodeBreakStart ← nodeBreak;
};
ENDCASE => {
xp: ParseTree;
thisOneMatchesNull: BOOL;
[xp, looksStart, nodeBreakStart, thisOneMatchesNull] ← Optimize[forwards, p, follow, looksFollow, nodeBreak];
IF thisOneMatchesNull THEN
np ← MakeCareful[xp]
ELSE
np ← Make[xp];
looksStart ← TextLooksSupport.LooksOR[looksStart, looksFollow];
IF nodeBreakStart = none THEN
nodeBreakStart ← nodeBreak
ELSE IF nodeBreak # none AND nodeBreakStart # nodeBreak THEN
nodeBreakStart ← both;
};
};
OptimizeSkipTo: PROC [x: REF ParseTreeContent.skipTo] RETURNS [ParseTree] = {
IF x.p.type = concat THEN {
q: REF ParseTreeContent.concat ← NARROW[x.p];
FOR l: LIST OF ParseTree ← q.concats, l.rest UNTIL l = NIL DO
SELECT l.first.type FROM
char, charIC, class, anyChar => NULL;
ENDCASE => RETURN[x];
ENDLOOP;
RETURN[NEW[ParseTreeContent.skipToString ← [skipToString[x.p]]]];
};
RETURN[x];
};
ClassType: TYPE = {wholeSet, char, charIC, notChar, notCharIC};
AnalyzeClass: PROC [x: CharClass] RETURNS [kind: ClassType, c: CHAR] = {
classPopulation: NAT = LegalInputCharacters.LAST - LegalInputCharacters.FIRST + 1;
lastC, lastNotC: CHAR;
population: NAT ← 0;
c ← 'a;
FOR chars: CHAR IN LegalInputCharacters DO
IF x[chars] THEN {
population ← population + 1;
lastC ← chars;
}
ELSE
lastNotC ← chars;
ENDLOOP;
SELECT population FROM
0 => ERROR;
1 => {
kind ← char;
c ← lastC;
};
2 => {
IF lastC IN ['a..'z] AND x[Ascii.Upper[lastC]] THEN {
kind ← charIC;
c ← Ascii.Upper[lastC];
}
ELSE
kind ← wholeSet;
};
classPopulation-2 => {
IF lastNotC IN ['a..'z] AND ~x[Ascii.Upper[lastNotC]] THEN {
kind ← notCharIC;
c ← Ascii.Upper[lastNotC];
}
ELSE
kind ← wholeSet;
};
classPopulation-1 => {
kind ← notChar;
c ← lastNotC;
};
classPopulation => ERROR;
ENDCASE => kind ← wholeSet;
};
SkipOverClassNode: PROC [class: CharClass, looks: TextLooks.Looks] RETURNS [p: ParseTree] = {
classType: ClassType;
c: CHAR;
[classType, c] ← AnalyzeClass[class];
SELECT classType FROM
wholeSet => {
p ← NEW[ParseTreeContent.skipOverClass ← [skipOverClass[NEW[CharClassContent ← class^], looks, numCharClasses]]];
charClassList ← CONS[p, charClassList];
numCharClasses ← numCharClasses + 1;
};
char => p ← NEW[ParseTreeContent.skipOverChar ← [skipOverChar[char: c, looks: looks]]];
charIC => p ← NEW[ParseTreeContent.skipOverCharIC ← [skipOverCharIC[char: c, looks: looks]]];
notChar => p ← NEW[ParseTreeContent.skipToChar ← [skipToChar[char: c, looks: looks]]];
notCharIC => p ← NEW[ParseTreeContent.skipToCharIC ← [skipToCharIC[char: c, looks: looks]]];
ENDCASE => ERROR;
};
MatchClassNode: PROC [class: CharClass, looks: TextLooks.Looks] RETURNS [p: ParseTree] = {
classType: ClassType;
c: CHAR;
[classType, c] ← AnalyzeClass[class];
SELECT classType FROM
wholeSet, notChar, notCharIC => {
p ← NEW[ParseTreeContent.class ← [class[NEW[CharClassContent ← class^], looks, numCharClasses]]];
charClassList ← CONS[p, charClassList];
numCharClasses ← numCharClasses + 1;
};
char => p ← NEW[ParseTreeContent.char ← [char[c, looks]]];
charIC => p ← NEW[ParseTreeContent.charIC ← [charIC[c, looks]]];
ENDCASE => ERROR;
};
PseudoSet: TYPE = REF ARRAY [0..8) OF CARDINAL;
CharClassUnion: PROC[c1, c2: CharClass] = TRUSTED {
ps1: PseudoSet ← LOOPHOLE[c1];
ps2: PseudoSet ← LOOPHOLE[c2];
FOR i: CARDINAL IN [0..8) DO
ps1[i] ← Basics.BITOR[ps1[i], ps2[i]];
ENDLOOP;
};
CharClassIntersection: PROC[c1, c2: CharClass] = TRUSTED {
ps1: PseudoSet ← LOOPHOLE[c1];
ps2: PseudoSet ← LOOPHOLE[c2];
FOR i: CARDINAL IN [0..8) DO
ps1[i] ← Basics.BITAND[ps1[i], ps2[i]];
ENDLOOP;
};
CharClassDifference: PROC[c1, c2: CharClass] = TRUSTED {
ps1: PseudoSet ← LOOPHOLE[c1];
ps2: PseudoSet ← LOOPHOLE[c2];
FOR i: CARDINAL IN [0..8) DO
ps1[i] ← Basics.BITAND[ps1[i], Basics.BITNOT[ps2[i]]];
ENDLOOP;
c1[0C] ← FALSE;
};
}.