DIRECTORY
Ascii USING [Lower, Upper],
List USING [Append, Nconc, Reverse],
LooksReader USING [Create, Get, SetPosition],
RegularExpression USING [altSepToken, anyToken, beginAllToken, beginAltToken, beginClassToken, beginFieldToken, beginNodeToken, boundSepToken, CharClass, CharClassContent, ClassArray, closureToken, Compile, endAllToken, endAltToken, endClassToken, endFieldToken, endPatternToken, fieldSepToken, FinderRecord, greedyClosureToken, greedyPlusToken, IgnoreLooks, Index, LegalInputCharacters, NameArray, nodeBreakToken, notToken, OptimizeBackwardSearch, OptimizeForwardSearch, ParseTree, ParseTreeContent, ParseTypes, PatternErrorCode, PatternStackArray, plusToken, powerToken, ReturnCodeArray, StackContent, subRangeToken, TextStackArray],
Rope USING [Concat, Equal, Fetch, FromChar, ROPE, Size],
RopeReader USING [Create, Get, GetIndex, ReadOffEnd, SetPosition],
RunReader USING [NoMoreRuns],
RuntimeError USING [BoundsFault],
TextEdit USING [GetRope, GetRuns],
TextLooks USING [Looks, noLooks, Runs],
TextNode USING [Offset, RefTextNode];
CreateFromParts:
PROC [patternRope:
ROPE, patternRuns: TextLooks.Runs, literal, word, ignoreLooks, ignoreCase, addBounds:
BOOLEAN, patternStart: Offset, patternLen: Offset]
RETURNS [finder: Finder] = {
ENABLE RuntimeError.BoundsFault => ERROR MalformedPattern[toobig];
SimpleSymbolTableEntry: TYPE = RECORD[name: ROPE, number: Index];
SimpleSymbolTable: TYPE = LIST OF REF SimpleSymbolTableEntry;
nameList: SimpleSymbolTable;
numberOfFields: Index;
parsedPatternList: LIST OF ParseTree;
forwardPattern, backwardPattern: ParseTree;
char, patternChar: CHAR ← 377C;
pEnd, pPos: Offset;
numCharClasses: Index ← 0;
charClassList: LIST OF ParseTree ← NIL;
insideNamedPat: BOOLEAN ← FALSE;
lastPhysicalCharUnread: BOOL ← FALSE;
theLastPhysicalCharUnread: CHAR;
inAbbreviation: BOOL ← FALSE;
abbreviationPos: Offset;
abbreviation: ROPE;
lastCharacterRead: CHAR;
looksRead: TextLooks.Looks ← TextLooks.noLooks;
GetNextChar:
PROC [eofOK:
BOOL]
RETURNS [c:
CHAR] = {
IF lastPhysicalCharUnread
THEN {
lastPhysicalCharUnread ← FALSE;
RETURN[theLastPhysicalCharUnread];
};
IF inAbbreviation
THEN {
abbreviationPos ← abbreviationPos + 1;
IF abbreviationPos < abbreviation.Size[]
THEN c ← abbreviation.Fetch[abbreviationPos]
ELSE inAbbreviation ← FALSE;
};
IF ~inAbbreviation
THEN {
IF pPos >= pEnd THEN GOTO gotEnd;
pPos ← pPos + 1;
c ← finder.ropeReader.Get[! RopeReader.ReadOffEnd => GOTO gotEnd];
IF finder.lksReader =
NIL
THEN looksRead ← IgnoreLooks
ELSE looksRead ← LooksReader.Get[finder.lksReader
! RunReader.NoMoreRuns => {looksRead ← TextLooks.noLooks; CONTINUE }];
};
IF ignoreCase THEN c ← Ascii.Upper[c];
EXITS
gotEnd => {
c ← endPatternToken;
looksRead ← TextLooks.noLooks;
IF ~eofOK THEN SyntaxError[unexpectedEndOfPattern];
};
};
UnReadLastPhysicalChar:
PROC [c:
CHAR] = {
IF lastPhysicalCharUnread THEN ERROR;
theLastPhysicalCharUnread ← c;
lastPhysicalCharUnread ← TRUE;
};
AbbreviationRec: TYPE = RECORD[char: CHAR, abbreviation: ROPE];
abbreviations:
LIST
OF AbbreviationRec ←
LIST[
['A, "([a..zA..Z0..9]++)"],
['B, "([ '011..'015]++)"],
['D, "([0..9]+.[0..9]**|[0..9]*.[0..9]++|[0..9]++)"],
['N, "('015)"],
['Q, "(\"[~\"]*\"|``[~'']*''''|`[~'']*'')"],
['S, "([~ '011..'015]++)"],
['W, "([a..zA..Z]++)"],
['^, "['001..'037]"]
];
SetUpAbbreviation:
PROC [c:
CHAR] = {
IF inAbbreviation THEN ERROR;
c ← Ascii.Upper[c];
FOR l:
LIST
OF AbbreviationRec ← abbreviations, l.rest
UNTIL l =
NIL
DO
IF c = l.first.char
THEN {
inAbbreviation ← TRUE;
abbreviationPos ← -1;
abbreviation ← l.first.abbreviation;
RETURN;
};
ENDLOOP;
SyntaxError[unknownAbbreviation];
};
charUnRead: BOOL ← FALSE;
GetToken:
PROC []
RETURNS [char:
CHAR] = {
IF charUnRead
THEN {
charUnRead ← FALSE;
RETURN[lastCharacterRead];
};
lastCharacterRead ← char ← GetNextChar[TRUE];
IF literal
THEN
IF char = endPatternToken OR char IN LegalInputCharacters
THEN RETURN[char] ELSE SyntaxError[illegalCharacter];
IF char IN ['A..'Z] OR char IN ['a..'z] OR char IN ['0..'9] THEN RETURN[char];
SELECT char
FROM
'[ => char ← beginClassToken;
'] => char ← endClassToken;
'~ => char ← notToken;
'# => char ← anyToken;
'$ => char ← nodeBreakToken;
'^ => char ← beginNodeToken;
'! => char ← powerToken;
'( => char ← beginAltToken;
') => char ← endAltToken;
'| => char ← altSepToken;
'< => char ← beginFieldToken;
'> => char ← endFieldToken;
': => char ← fieldSepToken;
', => char ← boundSepToken;
'{ => char ← beginAllToken;
'} => char ← endAllToken;
'' => {
char ← GetNextChar[FALSE];
IF char
IN ['0..'7]
THEN {
c2: CHAR ← GetNextChar[FALSE];
c3: CHAR ← GetNextChar[FALSE];
octalIndex: CARDINAL;
IF ~(c2
IN ['0..'7])
OR ~(c3
IN ['0..'7])
THEN
SyntaxError[illegalOctalNumber];
octalIndex ← (char-'0)*64 + (c2-'0)*8+(c3-'0);
IF octalIndex > 127 THEN SyntaxError[illegalOctalNumber];
char ← VAL[octalIndex];
};
IF ~(char IN LegalInputCharacters) THEN SyntaxError[illegalCharacter];
};
'* => {
c: CHAR ← GetNextChar[TRUE];
IF c = '*
THEN
char ← greedyClosureToken
ELSE {
UnReadLastPhysicalChar[c];
char ← closureToken;
};
};
'+ => {
c: CHAR ← GetNextChar[TRUE];
IF c = '+
THEN
char ← greedyPlusToken
ELSE {
UnReadLastPhysicalChar[c];
char ← plusToken;
};
};
'. => {
c: CHAR ← GetNextChar[TRUE];
IF c = '.
THEN
char ← subRangeToken
ELSE {
UnReadLastPhysicalChar[c];
char ← '.;
};
};
'\\ => {
c: CHAR ← GetNextChar[FALSE];
SetUpAbbreviation[c];
RETURN[GetToken[]];
};
endPatternToken => NULL;
ENDCASE => IF ~(char IN LegalInputCharacters) THEN SyntaxError[illegalCharacter];
lastCharacterRead ← char;
RETURN[char];
};
UnReadToken:
PROC[] = {
IF charUnRead THEN ERROR;
charUnRead ← TRUE;
};
SyntaxError:
PROC[kind: RegExpPatternErrorCode] = {
ERROR MalformedPattern[toobig];
MalformedPattern[kind, MAX[0, IF lastPhysicalCharUnread THEN pPos-2 ELSE pPos-1]];
};
Parses the character class notation, which in its simplest form is a sequence of characters between []'s, e.g. [0123456789], which specifies a pattern that will match any character in the sequence. Ranges of characters may be specified with .., e.g. [A..F] is the same as [ABCDEF]. Special characters can and must be quoted, e.g. ['(')''] will match either a left parenthesis, a right parenthesis, or a quote. All characters but those in the class may be matched by using a ~ as the first symbol in the class, e.g. [~A..Za..z0..9] will match all but the alphanumeric characters. If ignoreCase is true, then if the set includes a lower case 'x, it will be made to also include the upper case 'X, and vice-versa.
ParseCharClass:
PROC []
RETURNS [r: ParseTree] = {
ccr: REF ParseTreeContent.class ← NEW[ParseTreeContent.class];
complement: BOOL ← FALSE;
c, lastChar: CHAR ← beginClassToken;
charClass: CharClass ← NEW[CharClassContent ← ALL[FALSE]];
charClassList ← CONS[ccr, charClassList];
ccr.classNumber ← numCharClasses;
numCharClasses ← numCharClasses + 1;
IF GetToken[] = notToken
THEN
complement ← TRUE
ccr.looks ← looksRead;
ccr.class ← charClass;
r ← ccr;
WHILE (c ← GetToken[]) # endClassToken
DO
SELECT c
FROM
subRangeToken => {
IF ~(lastChar IN LegalInputCharacters) THEN SyntaxError[illegalCharacter];
c ← GetToken[];
IF ~(c IN LegalInputCharacters) THEN SyntaxError[illegalCharacter];
FOR x:
CHAR
IN [lastChar..c]
DO
charClass[x] ← TRUE;
ENDLOOP;
lastChar ← subRangeToken;
};
NOT
IN LegalInputCharacters =>
SyntaxError[illegalCharacter]
ENDCASE => {
lastChar ← c;
charClass[c] ← TRUE;
};
ENDLOOP;
IF ignoreCase
THEN
FOR x:
CHAR
IN ['a..'z]
DO
SELECT
TRUE
FROM
charClass[x] => charClass[x-'a+'A] ← TRUE;
charClass[x-'a+'A] => charClass[x-'a+'A] ← TRUE;
ENDCASE => charClass[x] ← TRUE;
ENDLOOP;
IF complement
THEN
FOR x:
CHAR
IN LegalInputCharacters
DO
charClass[x] ← ~charClass[x];
ENDLOOP;
};
X ::=
non-special character Not one of '[]~#$*+(){}<>\ ..
'special character One of the above. (Handled in tokenizer)
'nnn octal control characters. (Handled in tokenizer)
[character class] Character class notation, A..Z means ASCII interval A..Z
[~character class] Not the characters in this class.
# Any character.
$ Node break.
^ Beginning of node.
\x Predefined patterns; x is an alphanumeric character, or <string>.
(Handled in tokenizer)
ParseX:
PROC []
RETURNS [p: ParseTree ←
NIL] = {
c: CHAR ← GetToken[];
SELECT c
FROM
IN LegalInputCharacters =>
IF ignoreCase
AND c
IN ['A..'Z]
THEN
p ← NEW[ParseTreeContent.charIC ← [charIC[c, looksRead]]]
ELSE
p ← NEW[ParseTreeContent.char ← [char[c, looksRead]]];
beginClassToken =>
p ← ParseCharClass[];
anyToken =>
p ← NEW[ParseTreeContent.anyChar ← [anyChar[looksRead]]];
nodeBreakToken =>
p ← NEW[ParseTreeContent.nodeBreak];
beginNodeToken =>
p ← NEW[ParseTreeContent.beginNode];
ENDCASE => UnReadToken[];
};
Z ::=
X A single-character pattern.
(P|P|...|P) Alternation.
ParseZ:
PROC []
RETURNS [p: ParseTree ←
NIL] = {
c: CHAR ← GetToken[];
SELECT c
FROM
beginAltToken => {
l: LIST OF ParseTree ← NIL;
DO
q: ParseTree ← ParseP[];
IF q = NIL THEN q ← NEW[ParseTreeContent.noOp];
l ← CONS[q, l];
c ← GetToken[];
SELECT c
FROM
endAltToken => EXIT;
altSepToken => {};
ENDCASE => SyntaxError[improperAltSeparator];
ENDLOOP;
SELECT
TRUE
FROM
l = NIL => RETURN [NIL];
l.rest = NIL => RETURN [l.first];
ENDCASE =>
TRUSTED {
RETURN[NEW[ParseTreeContent.alt ← [alt[LOOPHOLE[List.Reverse[LOOPHOLE[l]]]]]]];
};
};
endAltToken, altSepToken => {
UnReadToken[];
RETURN[NIL]
}
ENDCASE => {
UnReadToken[];
p ← ParseX[];
};
};
The pattern matching fragments for full patterns.
P ::=
null The empty pattern.
Z
Z*P Min-matching closure.
Z**P Greedy closure.
Z+P PP*
Z++P PP**
~ZP Deterministically match anything up to but not including P.
ParseP:
PROC []
RETURNS [ParseTree] = {
l: LIST OF ParseTree ← NIL;
p: ParseTree;
DO
c: CHAR ← GetToken[];
IF c = notToken
THEN
p ← NEW[ParseTreeContent.skipTo ← [skipTo[ParseZ[]]]]
ELSE {
UnReadToken[];
p ← ParseZ[];
IF p = NIL THEN EXIT;
c ← GetToken[];
SELECT c
FROM
closureToken =>
p ← NEW[ParseTreeContent.closure ← [closure[p]]];
greedyClosureToken =>
p ← NEW[ParseTreeContent.greedyClosure ← [greedyClosure[p]]];
plusToken =>
p ← NEW[ParseTreeContent.concat ← [concat[LIST[p, NEW[ParseTreeContent.closure ← [closure[p]]]]]]];
greedyPlusToken =>
p ← NEW[ParseTreeContent.concat ← [concat[LIST[p, NEW[ParseTreeContent.greedyClosure ← [greedyClosure[p]]]]]]];
powerToken => {
l: LIST OF ParseTree ← NIL;
iterations: Index ← ParseNumber[FALSE];
WHILE iterations > 0
DO
l ← CONS[p, l];
iterations ← iterations - 1;
ENDLOOP;
p ← NEW[ParseTreeContent.concat ← [concat[l]]];
};
ENDCASE => UnReadToken[];
};
IF p.type = concat
THEN {
pp: REF ParseTreeContent.concat ← NARROW[p];
FOR ll:
LIST
OF ParseTree ← pp.concats, ll.rest
UNTIL ll =
NIL
DO
l ← CONS[ll.first, l];
ENDLOOP;
}
IF l.first.type = closure
THEN {
q: REF ParseTreeContent.closure ← NARROW[l.first];
IF q.p.type = closure
OR q.p.type = greedyClosure
THEN
l.first ← q.p;
}
ELSE
IF l.first.type = greedyClosure
THEN {
q: REF ParseTreeContent.greedyClosure ← NARROW[l.first];
IF q.p.type = closure
THEN {
qq: REF ParseTreeContent.closure ← NARROW[q.p];
q.p ← qq.p;
}
ELSE
IF q.p.type = greedyClosure
THEN
l.first ← q.p;
};
ENDLOOP;
SELECT
TRUE
FROM
l = NIL => RETURN[NIL];
l.rest = NIL => RETURN[l.first];
ENDCASE =>
TRUSTED {
RETURN[NEW[ParseTreeContent.concat ← [concat[LOOPHOLE[List.Reverse[LOOPHOLE[l]]]]]]];
};
};
A simple A-list symbol table that maps field names to numbers.
The top level syntax.
T ::=
<name:P>T Named portions. Valid only at top level. Reserved name ALL.
<name,number:P>T Bound the number of CR's matched by P to number.
PT Concatenation.
P Or just a pattern.
The tipity-top level syntax.
TT ::=
T
T{T}T Delimits the virtual start and end of the text matched by <ALL>.
ParseTopLevel:
PROC []
RETURNS [l:
LIST
OF ParseTree ←
NIL, nameList: SimpleSymbolTable ←
NIL, numberFields: Index ← 0] = {
seenBeginAllToken, seenEndAllToken: BOOL ← FALSE;
DO
c: CHAR ← GetToken[];
SELECT c
FROM
beginFieldToken => {
field: ParseTree ← NIL;
[field, nameList, numberFields] ← ParseField[nameList, numberFields];
l ← CONS[field, l];
};
endFieldToken => SyntaxError[notInsideField];
beginAllToken => {
IF seenBeginAllToken THEN SyntaxError[moreThanOneBeginAll];
seenBeginAllToken ← TRUE;
l ← CONS[NEW[ParseTreeContent.beginALL], l];
};
endAllToken => {
IF ~seenBeginAllToken
OR seenEndAllToken
THEN
SyntaxError[noMatchingBeginAll];
seenEndAllToken ← TRUE;
l ← CONS[NEW[ParseTreeContent.endALL], l];
};
endPatternToken => {
IF ~seenBeginAllToken
THEN
l ← CONS[NEW[ParseTreeContent.endALL], l];
IF seenBeginAllToken
AND ~seenEndAllToken
THEN
SyntaxError[noClosingEndAll];
l ← CONS[NEW[ParseTreeContent.endAll], l];
EXIT;
};
ENDCASE => {
p: ParseTree;
UnReadToken[];
p ← ParseP[];
IF p = NIL THEN SyntaxError[illegalCharacter];
WITH p
SELECT
FROM
z:
REF ParseTreeContent.concat =>
TRUSTED {
l ← LOOPHOLE[List.Nconc[List.Reverse[LOOPHOLE[z.concats]], LOOPHOLE[l]]];
};
ENDCASE => l ← CONS[p, l];
};
ENDLOOP;
TRUSTED {l ← LOOPHOLE[List.Reverse[LOOPHOLE[l]]]};
IF ~seenBeginAllToken
THEN
l ← CONS[NEW[ParseTreeContent.beginALL], l];
l ← CONS[NEW[ParseTreeContent.beginAll], l];
};
ParseField:
PROC [names: SimpleSymbolTable, number: Index]
RETURNS [field: ParseTree, newNames: SimpleSymbolTable, newNumber: Index] = {
This routine parses a field, which is of the form <name,bound:pattern>. The name is a sequence of alphabetic characters; the bound is a non-negative integer, and the pattern is a P. The bound and pattern are optional. By default, there is no bound and the pattern is #* with whatever looks the name began with.
s: ParseTree ← NIL;
name: ROPE ← NIL;
c: CHAR ← Ascii.Lower[GetToken[]];
nameLooks: TextLooks.Looks ← looksRead;
bound: INT ← -1;
newNames ← names;
DO
IF ~(c IN ['a..'z]) THEN EXIT;
name ← Rope.Concat[name, Rope.FromChar[c]];
c ← Ascii.Lower[GetToken[]];
ENDLOOP;
IF name = NIL THEN SyntaxError[nameMustBeAString];
IF Rope.Equal[name, "all",
FALSE]
THEN
SyntaxError[theAllNameIsReserved];
FOR l: SimpleSymbolTable ← names, l.rest
UNTIL l =
NIL
DO
IF Rope.Equal[name, l.first.name,
FALSE]
THEN {
IF c # endFieldToken
THEN
SyntaxError[secondOccurenceOfFieldMustNotContainPattern];
IF ignoreCase
THEN
field ← NEW[ParseTreeContent.fieldEqualsIC ← [fieldEqualsIC[l.first.number]]]
ELSE
field ← NEW[ParseTreeContent.fieldEquals ← [fieldEquals[l.first.number]]];
RETURN[field, names, number];
};
ENDLOOP;
newNumber ← number + 1;
newNames ← CONS[NEW[SimpleSymbolTableEntry ← [name, newNumber]], newNames];
IF c = boundSepToken
THEN
bound ← ParseNumber[FALSE];
SELECT c
FROM
fieldSepToken => {
s ← ParseP[];
IF GetToken[] # endFieldToken THEN SyntaxError[expectedEndOfField];
};
endFieldToken => {
s ← NEW[ParseTreeContent.closure ← [closure[NEW[ParseTreeContent.anyChar ← [anyChar[nameLooks]]]]]];
};
ENDCASE => SyntaxError[expectedEndOfField];
field ← NEW[ParseTreeContent.field ← [field[newNumber, bound, s]]];
};
ParseNumber:
PROC [octal:
BOOL]
RETURNS [number: Index ← 0] = {
c: CHAR;
IF octal
THEN
WHILE (c ← GetToken[])
IN ['0..'7]
DO
number ← 8*number + c - '0;
ENDLOOP
ELSE
WHILE (c ← GetToken[])
IN ['0..'9]
DO
number ← 10*number + c - '0;
ENDLOOP;
IF c # '. THEN UnReadToken[];
};
IF addBounds
THEN patternRope ← Rope.Concat["^", Rope.Concat[patternRope, "$"]];
pEnd ← MIN[Rope.Size[patternRope], patternStart+patternLen];
patternStart ← MIN[patternStart,pEnd];
pPos ← patternStart;
finder ← NEW[FinderRec];
IF word
THEN finder.wordSearch ←
TRUE;
so Try will know to make sure don't have adjacent alphanumerics
finder.ropeReader ← RopeReader.Create[];
RopeReader.SetPosition[finder.ropeReader, patternRope, patternStart];
IF patternRuns #
NIL
AND ~ignoreLooks
THEN {
finder.lksReader ← LooksReader.Create[];
LooksReader.SetPosition[finder.lksReader, patternRuns, patternStart]
}
ELSE
finder.lksReader ← NIL;
[parsedPatternList, nameList, numberOfFields] ← ParseTopLevel[];
finder.nameArray ← NEW[NameArray[numberOfFields+1]];
FOR l: SimpleSymbolTable ← nameList, l.rest
UNTIL l =
NIL
DO
finder.nameArray[l.first.number].name ← l.first.name;
ENDLOOP;
forwardPattern ←
NEW[ParseTreeContent.concat ←
[concat[
CONS[
NEW[ParseTreeContent.closure ←
[closure[NEW[ParseTreeContent.anyChar ← [anyChar[IgnoreLooks]]]]]], parsedPatternList]]]];
[forwardPattern, charClassList, numCharClasses] ← RegularExpression.OptimizeForwardSearch[forwardPattern, charClassList, numCharClasses];
TRUSTED {
backwardPattern ←
NEW[ParseTreeContent.concat ←
[concat[
LOOPHOLE[List.Append[
LOOPHOLE[parsedPatternList],
LOOPHOLE[
LIST[
NEW[ParseTreeContent.closure ←
[closure[NEW[ParseTreeContent.anyChar ← [anyChar[IgnoreLooks]]]]]]]]]]]]];
};
[backwardPattern, charClassList, numCharClasses] ← RegularExpression.OptimizeBackwardSearch[backwardPattern, charClassList, numCharClasses];
finder.classes ← NEW[ClassArray[numCharClasses]];
FOR l:
LIST
OF ParseTree ← charClassList, l.rest
UNTIL l =
NIL
DO
WITH l.first
SELECT
FROM
x: REF ParseTreeContent.class => finder.classes[x.classNumber] ← x.class;
x: REF ParseTreeContent.skipOverClass => finder.classes[x.classNumber] ← x.class;
ENDCASE => ERROR;
ENDLOOP;
finder.stack ← NEW[StackContent ← [0, NEW[PatternStackArray[100]], NEW[TextStackArray[100]], NEW[ReturnCodeArray[100]]]]; -- Interim hack.
finder.forwardProgram ← RegularExpression.Compile[forwardPattern];
finder.backwardProgram ← RegularExpression.Compile[backwardPattern];
finder.wordSearch ← word;
};
}.