<<RegExpFindOptimizeImpl.mesa>>
        <<Copyright © 1985 by Xerox Corporation.  All rights reserved.>>
        <<Nix, December 21, 1983 5:27 pm>>
        <<Russ Atkinson (RRA) March 11, 1985 7:43:02 pm PST>>

    DIRECTORY
        Ascii USING [Lower, Upper],
        Basics USING [BITAND, BITOR, BITNOT],
        List USING [Append, DReverse, Reverse],
        TextLooks USING [Looks, noLooks, allLooks],
        TextLooksSupport USING [LooksOR],
        RegularExpression USING [CharClass, CharClassContent, Index, LegalInputCharacters, ParseTree, ParseTreeContent, ParseTypes];

    RegExpFindOptimizeImpl: CEDAR MONITOR
        IMPORTS Ascii, List, Basics, TextLooksSupport
        EXPORTS RegularExpression = {
        OPEN RegularExpression;

    emptyCharClass: CharClassContent = ALL[FALSE];
    allCharClass: CharClassContent = [FALSE,
        TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
        TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
        TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
        TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
        TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
        TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
        TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
        TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
        TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
        TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
        TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
        TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
        TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE];

    charClassList: LIST OF ParseTree _ NIL;
    numCharClasses: Index;
    NodeBreakType: TYPE = {beginning, end, both, none};

    OptimizeForwardSearch: PUBLIC ENTRY PROC[p: ParseTree, ccl: LIST OF ParseTree, numCcl: Index] RETURNS [np: ParseTree, newCcl: LIST 
    OF ParseTree, newNumCcl: Index] = {
        ENABLE UNWIND => NULL;
        follow: CharClass _ NEW[CharClassContent _ allCharClass];
        charClassList _ ccl;
        numCharClasses _ numCcl;
        np _ Optimize[TRUE, p, follow, ALL[TRUE], end].np;
        newCcl _ charClassList;
        charClassList _ NIL;
        newNumCcl _ numCharClasses;
        };

    OptimizeBackwardSearch: PUBLIC ENTRY PROC[p: ParseTree, ccl: LIST OF ParseTree, numCcl: Index] RETURNS [np: ParseTree, newCcl: LIST 
    OF ParseTree, newNumCcl: Index] = {
        ENABLE UNWIND => NULL;
        follow: CharClass _ NEW[CharClassContent _ allCharClass];
        charClassList _ ccl;
        numCharClasses _ numCcl;
        np _ Optimize[FALSE, p, follow, ALL[TRUE], beginning].np;
        newCcl _ charClassList;
        charClassList _ NIL;
        newNumCcl _ numCharClasses;
        };

    Optimize: PROC [forwards: BOOL, p: ParseTree, follow: CharClass, looksFollow: TextLooks.Looks, nodeBreak: NodeBreakType] RETURNS 
    [np: ParseTree, looksStart: TextLooks.Looks, nodeBreakStart: NodeBreakType, matchesNull: BOOL] = {
        np _ p;
        matchesNull _ FALSE;
        WITH p SELECT FROM 
            x: REF ParseTreeContent.concat => {
                l: LIST OF ParseTree; 
                [l, looksStart, nodeBreakStart, matchesNull] _ 
                    IF forwards THEN OptimizeConcatForwards[x.concats, follow, looksFollow, nodeBreak]
                    ELSE OptimizeConcatBackwards[x.concats, follow, looksFollow, nodeBreak];
                np _ NEW[ParseTreeContent.concat _ [concat[l]]];
                };
            x: REF ParseTreeContent.alt => {
                ll: LIST OF ParseTree;
                xp: ParseTree;
                xStart: CharClass _ NEW[CharClassContent];
                start: CharClass _ NEW[CharClassContent _ emptyCharClass];
                xLooksStart: TextLooks.Looks;
                xNodeBreakStart: NodeBreakType;
                start^ _ follow^;
                [xp, looksStart, nodeBreakStart, matchesNull] _ Optimize[forwards, x.alts.first, start, looksFollow, nodeBreak];
                ll _ CONS[xp, NIL];
                FOR l: LIST OF ParseTree _ x.alts.rest, l.rest UNTIL l = NIL DO
                    oneAltMatchesNull: BOOL _ FALSE;
                    xStart^ _ follow^;
                    [xp, xLooksStart, xNodeBreakStart, oneAltMatchesNull] _ Optimize[forwards, l.first, xStart, looksFollow, nodeBreak];
                    matchesNull _ matchesNull OR oneAltMatchesNull;
                    ll _ CONS[xp, ll];
                    CharClassUnion[start, xStart];
                    looksStart _ TextLooksSupport.LooksOR[looksStart, xLooksStart];
                    IF nodeBreakStart = none THEN
                        nodeBreakStart _ xNodeBreakStart
                    ELSE IF xNodeBreakStart # none AND nodeBreakStart # xNodeBreakStart THEN
                        nodeBreakStart _ both; 
                    ENDLOOP;
                follow^ _ start^;
                TRUSTED {ll _ LOOPHOLE[List.DReverse[LOOPHOLE[ll]]]};
                np _ NEW[ParseTreeContent.alt _ [alt[ll]]];
                };
            x: REF ParseTreeContent.field => {
                xp: ParseTree;
                [xp, looksStart, nodeBreakStart, matchesNull] _ Optimize[forwards, x.p, follow, looksFollow, nodeBreak];
                np _ NEW[ParseTreeContent.field _ [field[x.number, x.bound, xp]]];
                };
            x: REF ParseTreeContent.char => {
                follow^ _ ALL[FALSE];
                follow[x.char] _ TRUE;
                looksStart _ x.looks;
                nodeBreakStart _ none;
                };
            x: REF ParseTreeContent.charIC => {
                follow^ _ ALL[FALSE];
                follow[x.char] _ TRUE;
                follow[Ascii.Lower[x.char]] _ TRUE;
                looksStart _ x.looks;
                nodeBreakStart _ none;
                };
            x: REF ParseTreeContent.class => {
                follow^ _ x.class^;
                looksStart _ x.looks;
                nodeBreakStart _ none;
                };
            x: REF ParseTreeContent.anyChar => {
                follow^ _ allCharClass;
                looksStart _ x.looks;
                nodeBreakStart _ none;
                };
            x: REF ParseTreeContent.nodeBreak => {
                follow^ _ ALL[FALSE];
                looksStart _ TextLooks.noLooks;
                nodeBreakStart _ end;
                matchesNull _ TRUE;
                };
            x: REF ParseTreeContent.beginNode => {
                follow^ _ ALL[FALSE];
                looksStart _ TextLooks.noLooks;
                nodeBreakStart _ beginning;
                matchesNull _ TRUE;
                };
            x: REF ParseTreeContent.succeed => {
                follow^ _ allCharClass;
                looksStart _ TextLooks.allLooks;
                nodeBreakStart _ none;
                };
            x: REF ParseTreeContent.skipToChar => {
                follow^ _ allCharClass;
                looksStart _ TextLooksSupport.LooksOR[x.looks, looksFollow];
                nodeBreakStart _ nodeBreak;
                matchesNull _ TRUE;
                };
            x: REF ParseTreeContent.skipOverChar => {
                follow[x.char] _ TRUE;
                looksStart _ TextLooksSupport.LooksOR[x.looks, looksFollow];
                nodeBreakStart _ nodeBreak;
                matchesNull _ TRUE;
                };
            x: REF ParseTreeContent.skipToCharIC => {
                follow^ _ allCharClass;
                looksStart _ TextLooksSupport.LooksOR[x.looks, looksFollow];
                nodeBreakStart _ nodeBreak;
                matchesNull _ TRUE;
                };
            x: REF ParseTreeContent.skipOverCharIC => {
                follow[x.char] _ TRUE;
                follow[Ascii.Lower[x.char]] _ TRUE;
                looksStart _ TextLooksSupport.LooksOR[x.looks, looksFollow];
                nodeBreakStart _ nodeBreak;
                matchesNull _ TRUE;
                };
            x: REF ParseTreeContent.skipTo => {
                xp: ParseTree;
                [xp, looksStart, nodeBreakStart, matchesNull] _ Optimize[forwards, x.p, follow, looksFollow, nodeBreak];
                np _ OptimizeSkipTo[NEW[ParseTreeContent.skipTo _ [skipTo[xp]]]];
                follow^ _ allCharClass;
                looksStart _ TextLooks.allLooks;
                nodeBreakStart _ both;
                matchesNull _ TRUE;
                };
            x: REF ParseTreeContent.skipOver => {
                xp: ParseTree;
                start: CharClass _ NEW[CharClassContent _ follow^];
                [xp, looksStart, nodeBreakStart, matchesNull] _ Optimize[forwards, x.p, start, looksFollow, nodeBreak];
                np _ NEW[ParseTreeContent.skipOver _ [skipOver[xp]]];
                CharClassUnion[follow, start];
                looksStart _ TextLooksSupport.LooksOR[looksStart, looksFollow];
                IF nodeBreakStart = none THEN
                    nodeBreakStart _ nodeBreak
                ELSE IF nodeBreak # none AND nodeBreakStart # nodeBreak THEN
                    nodeBreakStart _ both; 
                matchesNull _ TRUE;
                };
            x: REF ParseTreeContent.closure =>
                [np, looksStart, nodeBreakStart, matchesNull] _ 
                    OptimizeClosure[closure, forwards, x.p, follow, looksFollow, nodeBreak];
            x: REF ParseTreeContent.greedyClosure => 
                [np, looksStart, nodeBreakStart, matchesNull] _ 
                    OptimizeClosure[greedyClosure, forwards, x.p, follow, looksFollow, nodeBreak];
            ENDCASE => {
                SELECT p.type FROM
                    fieldEquals, fieldEqualsIC, fail => {
                        follow^ _ allCharClass;
                        looksStart _ TextLooks.allLooks;
                        nodeBreakStart _ both;
                        matchesNull _ TRUE;
                        };
                    noOp, beginAll, endAll, beginALL, endALL => {
                        <<preserve follow^>>
                        looksStart _ looksFollow;
                        nodeBreakStart _ nodeBreak;
                        matchesNull _ TRUE;
                        };
                    ENDCASE => ERROR;
                };
        };


    OptimizeConcatForwards: PROC [l: LIST OF ParseTree, follow: CharClass, looksFollow: TextLooks.Looks, nodeBreak: NodeBreakType] 
    RETURNS [lp: LIST OF ParseTree, looksStart: TextLooks.Looks, nodeBreakStart: NodeBreakType, matchesNull: BOOL] = TRUSTED {
        [lp, looksStart, nodeBreakStart, matchesNull] _ OptimizeConcat[TRUE, LOOPHOLE[List.Reverse[LOOPHOLE[l]]], follow, looksFollow, 
        nodeBreak];
        };

    OptimizeConcatBackwards: PROC [l: LIST OF ParseTree, follow: CharClass, looksFollow: TextLooks.Looks, nodeBreak: NodeBreakType] 
    RETURNS [lp: LIST OF ParseTree, looksStart: TextLooks.Looks, nodeBreakStart: NodeBreakType, matchesNull: BOOL] = TRUSTED {
        [lp, looksStart, nodeBreakStart, matchesNull] _ OptimizeConcat[FALSE, LOOPHOLE[List.Append[LOOPHOLE[l]]], follow, looksFollow, 
        nodeBreak];
        };

    OptimizeConcat: PROC [forwards: BOOL, l: LIST OF ParseTree, follow: CharClass, looksFollow: TextLooks.Looks, nodeBreak: 
    NodeBreakType] RETURNS [lp: LIST OF ParseTree, looksStart: TextLooks.Looks, nodeBreakStart: NodeBreakType, matchesNull: BOOL _ 
    TRUE] = {
        looksStart _ looksFollow;
        nodeBreakStart _ nodeBreak;
        lp _ NIL;
        WHILE l # NIL DO
            first: LIST OF ParseTree _ l;
            thisOneMatchesNull: BOOL;
            l _ l.rest;
            first.rest _ lp;
            lp _ first;
            [lp.first, looksStart, nodeBreakStart, thisOneMatchesNull] _ Optimize[forwards, lp.first, follow, looksStart, nodeBreakStart];
            matchesNull _ matchesNull AND thisOneMatchesNull;
            ENDLOOP;
        };

    OptimizeClosure: PROC [kind: ParseTypes, forwards: BOOL, p: ParseTree, follow: CharClass, looksFollow: TextLooks.Looks, nodeBreak: 
    NodeBreakType] RETURNS [np: ParseTree, looksStart: TextLooks.Looks, nodeBreakStart: NodeBreakType, matchesNull: BOOL _ TRUE] = {
        Make: PROC[p: ParseTree] RETURNS [cl: ParseTree] = {
            SELECT kind FROM
                closure => cl _ NEW[ParseTreeContent.closure _ [closure[p]]];
                greedyClosure => cl _ NEW[ParseTreeContent.greedyClosure _ [greedyClosure[p]]];
                ENDCASE => ERROR;
            };
        MakeCareful: PROC[p: ParseTree] RETURNS [cl: ParseTree] = {
            SELECT kind FROM
                closure => cl _ NEW[ParseTreeContent.carefulClosure _ [carefulClosure[p]]];
                greedyClosure => cl _ NEW[ParseTreeContent.carefulGreedyClosure _ [carefulGreedyClosure[p]]];
                ENDCASE => ERROR;
            };
        WITH p SELECT FROM
            z: REF ParseTreeContent.anyChar => {
                IF follow^ = emptyCharClass THEN {
                    IF (nodeBreak = beginning AND forwards) OR (nodeBreak = end AND ~forwards) THEN
                        np _ NEW[ParseTreeContent.noOp]
                    ELSE IF (nodeBreak = end AND forwards) OR (nodeBreak = beginning AND ~forwards)  THEN
                        np _ NEW[ParseTreeContent.skipToNodeBreak _ [skipToNodeBreak[z.looks]]]
                    ELSE
                        np _ Make[p];
                    }
                ELSE IF follow^ # allCharClass THEN {
                    start: CharClass _ NEW[CharClassContent _ allCharClass];
                    rest: CharClass _ NEW[CharClassContent _ follow^];
                    q: ParseTree;
                    CharClassDifference[start, rest];
                    q _ SkipOverClassNode[start, z.looks];
                    np _ NEW[ParseTreeContent.concat _ 
                        [concat[LIST[q, Make[NEW[ParseTreeContent.concat _ [concat[LIST[MatchClassNode[rest, z.looks], q]]]]]]]]];
                    }
                ELSE
                    np _ Make[p];
                follow^ _ allCharClass;
                looksStart _ TextLooksSupport.LooksOR[z.looks, looksFollow];
                nodeBreakStart _ nodeBreak;
                };
            z: REF ParseTreeContent.char => {
                IF ~follow[z.char] THEN
                    np _ NEW[ParseTreeContent.skipOverChar _ [skipOverChar[z.char, z.looks]]]
                ELSE
                    np _ Make[p];
                follow[z.char] _ TRUE;
                looksStart _ TextLooksSupport.LooksOR[z.looks, looksFollow];
                nodeBreakStart _ nodeBreak;
                };
            z: REF ParseTreeContent.charIC => {
                IF ~follow[z.char] AND ~follow[Ascii.Lower[z.char]] THEN
                    np _ NEW[ParseTreeContent.skipOverCharIC _ [skipOverCharIC[z.char, z.looks]]]
                ELSE
                    np _ Make[p];

                follow[z.char] _ TRUE;
                follow[Ascii.Lower[z.char]] _ TRUE;
                looksStart _ TextLooksSupport.LooksOR[z.looks, looksFollow];
                nodeBreakStart _ nodeBreak;
                };
            z: REF ParseTreeContent.class => {
                IF follow^ = emptyCharClass THEN
                    np _ NEW[ParseTreeContent.skipOverClass _ [skipOverClass[z.class, z.looks, z.classNumber]]]
                ELSE {
                    thisClass: CharClass _ NEW[CharClassContent _ z.class^];
                    rest: CharClass _ NEW[CharClassContent _ follow^];
                    CharClassIntersection[rest, thisClass];
                    CharClassDifference[thisClass, rest];
                    IF rest^ = emptyCharClass THEN
                        np _ SkipOverClassNode[thisClass, z.looks]
                    ELSE IF thisClass^ # emptyCharClass THEN {
                        q: ParseTree _ SkipOverClassNode[thisClass, z.looks];
                        np _ NEW[ParseTreeContent.concat _ 
                            [concat[LIST[q, Make[NEW[ParseTreeContent.concat _ [concat[LIST[MatchClassNode[rest, z.looks], q]]]]]]]]];
                        }
                    ELSE
                        np _ Make[p];
                    CharClassUnion[follow, thisClass];
                    };
                CharClassUnion[follow, z.class];
                looksStart _ TextLooksSupport.LooksOR[z.looks, looksFollow];
                nodeBreakStart _ nodeBreak;
                };
            ENDCASE => {
                xp: ParseTree;
                thisOneMatchesNull: BOOL;
                [xp, looksStart, nodeBreakStart, thisOneMatchesNull] _ Optimize[forwards, p, follow, looksFollow, nodeBreak];
                IF thisOneMatchesNull THEN
                    np _ MakeCareful[xp]
                ELSE
                    np _ Make[xp];
                looksStart _ TextLooksSupport.LooksOR[looksStart, looksFollow];
                IF nodeBreakStart = none THEN
                    nodeBreakStart _ nodeBreak
                ELSE IF nodeBreak # none AND nodeBreakStart # nodeBreak THEN
                    nodeBreakStart _ both; 
                };
        };

    OptimizeSkipTo: PROC [x: REF ParseTreeContent.skipTo] RETURNS [ParseTree] = {
        IF x.p.type = concat THEN {
            q: REF ParseTreeContent.concat _ NARROW[x.p];
            FOR l: LIST OF ParseTree _ q.concats, l.rest UNTIL l = NIL DO
                SELECT l.first.type FROM
                    char, charIC, class, anyChar => NULL;
                    ENDCASE => RETURN[x];
                ENDLOOP;
            RETURN[NEW[ParseTreeContent.skipToString _ [skipToString[x.p]]]];
            };
        RETURN[x];
        };


    ClassType: TYPE = {wholeSet, char, charIC, notChar, notCharIC};
    AnalyzeClass: PROC [x: CharClass] RETURNS [kind: ClassType, c: CHAR] = {
        classPopulation: NAT = LegalInputCharacters.LAST - LegalInputCharacters.FIRST + 1;
        lastC, lastNotC: CHAR;
        population: NAT _ 0;
        c _ 'a;
        FOR chars: CHAR IN LegalInputCharacters DO
            IF x[chars] THEN {
                population _ population + 1;
                lastC _ chars;
                }
            ELSE
                lastNotC _ chars;
            ENDLOOP;
        SELECT population FROM
            0 => ERROR;
            1 => {
                kind _ char;
                c _ lastC;
                };
            2 => {
                IF lastC IN ['a..'z] AND x[Ascii.Upper[lastC]] THEN {
                    kind _ charIC;
                    c _ Ascii.Upper[lastC];
                    }
                ELSE 
                    kind _ wholeSet;
                };
            classPopulation-2 => {
                IF lastNotC IN ['a..'z] AND ~x[Ascii.Upper[lastNotC]] THEN {
                    kind _ notCharIC;
                    c _ Ascii.Upper[lastNotC];
                    }
                ELSE 
                    kind _ wholeSet;
                };
            classPopulation-1 => {
                kind _ notChar;
                c _ lastNotC;
                };
            classPopulation => ERROR;
            ENDCASE => kind _ wholeSet;
        };

    SkipOverClassNode: PROC [class: CharClass, looks: TextLooks.Looks] RETURNS [p: ParseTree] = {
        classType: ClassType;
        c: CHAR;
        [classType, c] _ AnalyzeClass[class];
        SELECT classType FROM 
            wholeSet => {
                p _ NEW[ParseTreeContent.skipOverClass _ [skipOverClass[NEW[CharClassContent _ class^], looks, numCharClasses]]];
                charClassList _ CONS[p, charClassList];
                numCharClasses _ numCharClasses + 1;
                };
            char => p _ NEW[ParseTreeContent.skipOverChar _ [skipOverChar[char: c, looks: looks]]];
            charIC => p _ NEW[ParseTreeContent.skipOverCharIC _ [skipOverCharIC[char: c, looks: looks]]];
            notChar => p _ NEW[ParseTreeContent.skipToChar _ [skipToChar[char: c, looks: looks]]];
            notCharIC => p _ NEW[ParseTreeContent.skipToCharIC _ [skipToCharIC[char: c, looks: looks]]];
            ENDCASE => ERROR;
        };

    MatchClassNode: PROC [class: CharClass, looks: TextLooks.Looks] RETURNS [p: ParseTree] = {
        classType: ClassType;
        c: CHAR;
        [classType, c] _ AnalyzeClass[class];
        SELECT classType FROM 
            wholeSet, notChar, notCharIC => {
                p _ NEW[ParseTreeContent.class _ [class[NEW[CharClassContent _ class^], looks, numCharClasses]]];
                charClassList _ CONS[p, charClassList];
                numCharClasses _ numCharClasses + 1;
                };
            char => p _ NEW[ParseTreeContent.char _ [char[c, looks]]];
            charIC => p _ NEW[ParseTreeContent.charIC _ [charIC[c, looks]]];
            ENDCASE => ERROR;
        };

    PseudoSet: TYPE = REF ARRAY [0..8) OF CARDINAL;
    CharClassUnion: PROC[c1, c2: CharClass] = TRUSTED {
        ps1: PseudoSet _ LOOPHOLE[c1];
        ps2: PseudoSet _ LOOPHOLE[c2];
        FOR i: CARDINAL IN [0..8) DO
            ps1[i] _ Basics.BITOR[ps1[i], ps2[i]];
            ENDLOOP;
        };

    CharClassIntersection: PROC[c1, c2: CharClass] = TRUSTED {
        ps1: PseudoSet _ LOOPHOLE[c1];
        ps2: PseudoSet _ LOOPHOLE[c2];
        FOR i: CARDINAL IN [0..8) DO
            ps1[i] _ Basics.BITAND[ps1[i], ps2[i]];
            ENDLOOP;
        };

    CharClassDifference: PROC[c1, c2: CharClass] = TRUSTED {
        ps1: PseudoSet _ LOOPHOLE[c1];
        ps2: PseudoSet _ LOOPHOLE[c2];
        FOR i: CARDINAL IN [0..8) DO
            ps1[i] _ Basics.BITAND[ps1[i], Basics.BITNOT[ps2[i]]];
            ENDLOOP;
        c1[0C] _ FALSE;
        };
    }.