ACFindImpl.mesa
Copyright © 1985 by Xerox Corporation. All rights reserved.
Dave Rumph, June 18, 1986 9:19:34 am PDT
Implements the Aho-Corasick pattern matching algorithm.
DIRECTORY
IO USING [Backup, GetChar, EndOfStream, RIS, STREAM],
Process USING [CheckForAbort],
RefTab USING [Create, EachPairAction, Fetch, Pairs, Ref, Store],
Rope USING [ActionType, Length, Map, ROPE],
ACFind;
ACFindImpl: CEDAR PROGRAM
IMPORTS IO, Process, RefTab, Rope
EXPORTS ACFind
~ BEGIN OPEN ACFind;
ROPE: TYPE ~ Rope.ROPE;
STREAM: TYPE ~ IO.STREAM;
Building FSA's
Create: PUBLIC PROC [keys: LIST OF ROPE, caseSensitive: BOOLFALSE] RETURNS [self: Ref] ~ {
Build the FSA, creating a new instance if required.
ENABLE ABORTED => GOTO Done;
goto: RefTab.Ref ← RefTab.Create[];
SymbolEntry: TYPE ~ RECORD [
state: State,
c: CHAR
];
Goto: PROC [state: State, c: CHAR] RETURNS [next: State] ~ {
val: REF LIST OF SymbolEntry ← NARROW[goto.Fetch[state].val];
IF val = NIL THEN RETURN [NIL];
FOR list: LIST OF SymbolEntry ← val^, list.rest UNTIL list = NIL DO
IF c = list.first.c THEN RETURN [list.first.state];
ENDLOOP;
RETURN [NIL];
};
MakeGoto: PROC [state: State, c: CHAR, next: State] ~ {
Append: PROC [l: REF LIST OF SymbolEntry, item: SymbolEntry] ~ {
tail: LIST OF SymbolEntry;
IF l = NIL THEN {
l ← NEW[LIST OF SymbolEntry];
[] ← RefTab.Store[goto, state, l];
};
IF l^ = NIL THEN { l^ ← LIST[item]; RETURN };
FOR tail ← l^, tail.rest UNTIL tail.rest = NIL DO ENDLOOP;
tail.rest ← LIST[item];
};
Append[NARROW[goto.Fetch[state].val], [next, c]];
};
BuildGoto: PROC [keys: LIST OF ROPE] ~ {
Builds the Goto table of the non-deterministic FSA, to be traversed to build the Move table for the deterministic FSA.
Enter: PROC [key: ROPE] ~ {
ris: STREAMIO.RIS[key];
this: State ← self.initialState;
next: State;
{
ENABLE IO.EndOfStream => GOTO Done;
c: CHAR;
UNTIL (next ← Goto[this, c ← GetChar[self, ris]]) = NIL DO
this ← next;
ENDLOOP;
Backup[ris, c];
WHILE TRUE DO
MakeGoto[this, GetChar[self, ris], (next ← NEW[StateRep])];
next.move ← NEW[MoveRep[LAST[CHAR]-FIRST[CHAR]+1]];
this ← next;
ENDLOOP;
EXITS Done => {
IF this = NIL THEN this ← NEW[StateRep];
this.keys ← CONS[key, this.keys];
};
};
};
FOR list: LIST OF ROPE ← keys, list.rest UNTIL list = NIL DO
Enter[list.first];
ENDLOOP;
FOR c: CHAR IN CHAR DO
IF Goto[self.initialState, c] = NIL THEN MakeGoto[self.initialState, c, self.initialState];
ENDLOOP;
};
Traverse: PROC [] ~ {
q: LIST OF State ← NIL;
Append: PROC [q: LIST OF State, new: State] RETURNS [val: LIST OF State] ~ {
temp: LIST OF State ← NIL;
val ← LIST[new];
IF q = NIL THEN RETURN [val];
val ← CONS[q.first, val];
temp ← val;
UNTIL (q ← q.rest) = NIL DO
temp.rest ← CONS[q.first, temp.rest];
temp ← temp.rest;
ENDLOOP;
RETURN[val];
};
MakeMove: PROC [state: State, input: CHAR, next: State] ~ INLINE {
state.move[input-state.offset] ← next;
};
NilFailures: RefTab.EachPairAction ~ {
this: State ← NARROW[key];
this.failure ← NIL;
RETURN [FALSE];
};
CondenseMoves: RefTab.EachPairAction ~ {
this: State ← NARROW[key];
newOffset: CHAR ← this.offset;
lastChar: CHAR;
newMove: REF MoveRep;
IF this.failure # NIL THEN RETURN [FALSE];
this.failure ← Move[this, LAST[CHAR]];
WHILE Move[this, newOffset] = this.failure DO
newOffset ← newOffset + 1;
ENDLOOP;
lastChar ← LAST[CHAR];
WHILE Move[this, lastChar] = this.failure DO
lastChar ← lastChar - 1;
ENDLOOP;
newMove ← NEW[MoveRep[lastChar-newOffset+1]];
FOR c: CHAR IN [newOffset..lastChar] DO
newMove[c-newOffset] ← Move[this, c];
ENDLOOP;
this.offset ← newOffset;
this.move ← newMove;
RETURN [FALSE];
};
activeState: State;
FOR c: CHAR IN CHAR DO
MakeMove[self.initialState, c, Goto[self.initialState, c]];
IF (activeState ← Goto[self.initialState, c]) # self.initialState THEN {
q ← Append[q, activeState];
activeState.failure ← self.initialState;
};
ENDLOOP;
WHILE q # NIL DO
this: State ← q.first;
next: State;
q ← q.rest;
FOR c: CHAR IN CHAR DO
IF (next ← Goto[this, c]) # NIL THEN {
temp: State ← this.failure;
WHILE Goto[temp, c] = NIL DO
temp ← temp.failure;
ENDLOOP;
q ← Append[q, next];
next.failure ← Goto[temp, c];
FOR list: LIST OF ROPE ← next.failure.keys, list.rest UNTIL list = NIL DO
next.keys ← CONS[list.first, next.keys]
ENDLOOP;
MakeMove[this, c, next];
}
ELSE MakeMove[this, c, Move[this.failure, c]];
ENDLOOP;
ENDLOOP;
[] ← RefTab.Pairs[goto, NilFailures];
[] ← RefTab.Pairs[goto, CondenseMoves];
};
self ← NEW[Rep];
self.initialState ← NEW[StateRep];
IF keys = NIL THEN {
Return a do-nothing Finder
self.initialState.failure ← self.initialState;
self.initialState.offset ← '\377;
self.initialState.move ← NEW[MoveRep[0]];
RETURN[self];
};
self.initialState.move ← NEW[MoveRep[LAST[CHAR]-FIRST[CHAR]+1]];
self.caseSensitive ← caseSensitive;
BuildGoto[keys];
Traverse[];
EXITS Done => NULL;
};
Searching
Move: PROC [state: State, input: CHAR] RETURNS [State] ~ INLINE {
Find the next state, given the current state and the input character
IF NOT (input IN [state.offset..state.offset + state.move.l)) THEN RETURN [state.failure];
RETURN[state.move[input - state.offset]];
};
Find: PUBLIC PROC [self: Ref, text: ROPE, action: ActionProc] RETURNS [foundAny: BOOLEANFALSE] ~ {
Step through the text, calling the ActionProc for each match.
pos: INT ← 0;
MoveAction: Rope.ActionType ~ {
ENABLE ABORTED => GOTO Aborted;
Process.CheckForAbort[];
IF NOT self.caseSensitive THEN c ← toUpper[c];
state ← Move[state, c];
pos ← pos + 1;
IF state.keys # NIL THEN {
foundAny ← TRUE;
FOR list: LIST OF ROPE ← state.keys, list.rest UNTIL list = NIL DO
quit ← action[pos, list.first] OR quit;
ENDLOOP;
};
EXITS Aborted => RETURN [TRUE];
};
state: State ← self.initialState;
[] ← Rope.Map[text, 0, text.Length, MoveAction];
};
Utilities
lastChar: CHAR;
toUpper: ARRAY CHAR OF CHAR;
GetChar: PROC [self: Ref, stream: STREAM] RETURNS [CHAR] ~ {
Return the next character, converting to upper case if not self.caseSensitive
IF self.caseSensitive THEN RETURN [lastChar ← IO.GetChar[stream]];
RETURN[toUpper[lastChar ← IO.GetChar[stream]]];
};
Backup: PROC [s: STREAM, c: CHAR] ~ {
IO.Backup[s, lastChar];
};
Initialization
FOR c: CHAR IN CHAR DO
toUpper[c] ← IF c IN ['a..'z] THEN c - ('a - 'A) ELSE c;
ENDLOOP;
END.