PatternMatchImpl.mesa
Copyright Ó 1987, 1992 by Xerox Corporation. All rights reserved.
Doug Terry, July 21, 1987 1:16:06 pm PDT
Brian Oki, March 12, 1990 1:54 pm PST
Implements a collection of pattern matching routines of general utility.
Willie-s, April 27, 1992 11:41 am PDT
DIRECTORY
Ascii USING [Upper],
Atom USING [GetProp, PutProp],
BasicTime USING [GMT, Period],
Convert USING [Error, IntFromRope],
IO USING [BreakProc, EndOfStream, GetTokenRope, GetLineRope, TokenProc, GetChar, PeekChar, GetRopeLiteral, Error, STREAM, RIS],
RegularExpression USING [CreateFromRope, Finder, SearchRope],
Rope USING [Cat, Compare, Equal, Find, Index, Fetch, Length, ROPE, SkipTo, Substr],
Soundex USING [NameToCode, SoundexCode],
SymTab USING [Create, Delete, EachPairAction, Fetch, Pairs, Ref, Store],
Tempus USING [Adjust, MakeRope, Parse, Precision, Unintelligible],
PatternMatch;
PatternMatchImpl: CEDAR PROGRAM
IMPORTS Ascii, Atom, BasicTime, Convert, IO, RegularExpression, Rope, Soundex, SymTab, Tempus
EXPORTS PatternMatch
~ BEGIN
ROPE: TYPE ~ Rope.ROPE;
Equal: PUBLIC PatternMatch.MatchProc = {
[value: ROPE, pattern: ROPE, pparsed: REFNIL] RETURNS [match: BOOLEAN, nothingGreater: BOOLEAN ← FALSE, pparsedNew: REFNIL]
Compares the value and pattern for equality.
SELECT Rope.Compare[s1: value, s2: pattern, case: FALSE] FROM
less => match ¬ FALSE;
equal => match ¬ TRUE;
greater => {match ¬ FALSE; nothingGreater ¬ TRUE};
ENDCASE => ERROR;
pparsedNew ¬ $None;
};
Prefix: PUBLIC PatternMatch.MatchProc = {
[value: ROPE, pattern: ROPE, pparsed: REFNIL] RETURNS [match: BOOLEAN, nothingGreater: BOOLEAN ← FALSE, pparsedNew: REFNIL]
Checks if the pattern is a prefix of the value. Prefix[v, "p"] is the same as Wildcard[v, "p*"], though faster.
SELECT Rope.Compare[s1: Rope.Substr[value, 0, Rope.Length[pattern]], s2: pattern, case: FALSE] FROM
less => match ¬ FALSE;
equal => match ¬ TRUE;
greater => {match ¬ FALSE; nothingGreater ¬ TRUE}
ENDCASE => ERROR;
pparsedNew ¬ $None;
};
Wildcard: PUBLIC PatternMatch.MatchProc = {
[value: ROPE, pattern: ROPE, pparsed: REFNIL] RETURNS [match: BOOLEAN, nothingGreater: BOOLEAN ← FALSE, pparsedNew: REFNIL]
The pattern may contain zero or more wildcards (the character "*") that match anything.
This routine was adapted from FSMainImpl2.Match.
SubMatch: PROC [pstart: INT, plen: INT, vstart: INT, vlen: INT] RETURNS [match: BOOLEAN, nothingGreater: BOOLEAN ¬ FALSE] = {
See if there is a match between Rope.Substr[value,vstart,vlen] and Rope.Substr[pattern,pstart,plen].
pchar, vchar: CHAR;
WHILE plen > 0 DO
pchar ¬ Ascii.Upper[pattern.Fetch[pstart]];
IF pchar = '* THEN {
quick kill for * at end of pattern
IF plen = 1 THEN RETURN [match: TRUE, nothingGreater: FALSE];
else must take all combinations
{ -- first, accept the *
newpstart: INT = pstart + 1;
newplen: INT = plen - 1;
newvstart: INT ¬ vstart;
newvlen: INT ¬ vlen;
WHILE newvlen >= 0 DO
IF SubMatch[newpstart, newplen, newvstart, newvlen].match THEN RETURN [match: TRUE, nothingGreater: FALSE];
newvstart ¬ newvstart + 1;
newvlen ¬ newvlen - 1;
ENDLOOP;
};
RETURN [match: FALSE, nothingGreater: FALSE];
};
IF vlen = 0 THEN RETURN [match: FALSE, nothingGreater: FALSE];
at this point demand an exact match in both strings
vchar ¬ Ascii.Upper[value.Fetch[vstart]];
IF pchar # vchar THEN RETURN [match: FALSE, nothingGreater: vchar > pchar];
pstart ¬ pstart + 1;
plen ¬ plen - 1;
vstart ¬ vstart + 1;
vlen ¬ vlen - 1;
ENDLOOP;
IF vlen = 0 THEN RETURN [match: TRUE, nothingGreater: FALSE]
ELSE RETURN [match: FALSE, nothingGreater: TRUE];
};
[match, nothingGreater] ¬ SubMatch [0, pattern.Length[], 0, value.Length[]];
pparsedNew ¬ $None;
};
NOTE: The following routines that take advantage of pre-parsed patterns do not actually check that the passed pattern corresponds to the one that was previously parsed. The caller is expected to provide a correct pparsed argument.
PRe: TYPE ~ REF PReBody;
PReBody: TYPE ~ RECORD[
pattern: ROPE,
finder: RegularExpression.Finder,
purePrefix: ROPE ¬ NIL -- used to see if something greater might match
];
RE: PUBLIC PatternMatch.MatchProc = {
[value: ROPE, pattern: ROPE, pparsed: REFNIL] RETURNS [match: BOOLEAN, nothingGreater: BOOLEAN ← FALSE, pparsedNew: REFNIL]
The pattern is taken to be a regular expression as defined in RegularExpressionDoc.tioga.
pr: PRe;
WITH pparsed SELECT FROM
ppr: PRe => pr ¬ ppr;
ENDCASE => { -- pparsed is wrong type or NIL
pr ¬ NEW[PReBody];
pr.pattern ¬ pattern;
pr.finder ¬ RegularExpression.CreateFromRope[pattern: pattern, ignoreCase: TRUE, addBounds: TRUE];
};
pparsedNew ¬ pr;
match ¬ RegularExpression.SearchRope[pr.finder, value].found;
IF NOT match THEN { -- see if something greater might match
IF pr.purePrefix = NIL THEN {
i: INT;
i ¬ Rope.SkipTo[s: pr.pattern, skip: "\'#[^$*+(\\<{!"]; -- look for special chars
IF NOT i = Rope.Length[pr.pattern] AND Rope.Fetch[pr.pattern, i] = '* THEN -- could be zero of previous char
i ¬ i-1;
pr.purePrefix ¬ Rope.Substr[base: pr.pattern, len: i];
};
IF Rope.Compare[s1: Rope.Substr[value, 0, Rope.Length[pr.purePrefix]], s2: pr.purePrefix, case: FALSE] = greater THEN
nothingGreater ¬ TRUE;
};
};
PSound: TYPE ~ REF PSoundBody;
PSoundBody: TYPE ~ RECORD[
pattern: ROPE,
ch: CHAR,
code: Soundex.SoundexCode
];
SoundexMatch: PUBLIC PatternMatch.MatchProc = {
[value: ROPE, pattern: ROPE, pparsed: REFNIL] RETURNS [match: BOOLEAN, nothingGreater: BOOLEAN ← FALSE, pparsedNew: REFNIL]
Compares the value and pattern based on their Soundex codes. The Soundex encoding tends to group together variants of the same name; for instance, Johnson, Jansen, and Johansen have identical Soundex codes.
pr: PSound;
WITH pparsed SELECT FROM
ppr: PSound => pr ¬ ppr;
ENDCASE => { -- pparsed is wrong type or NIL
IF Rope.Equal[pattern, ""] THEN RETURN [FALSE, FALSE, NIL];
pr ¬ NEW[PSoundBody];
pr.pattern ¬ pattern;
pr.ch ¬ Ascii.Upper[Rope.Fetch[pattern,0]];
pr.code ¬ Soundex.NameToCode[pattern];
};
pparsedNew ¬ pr;
as an optimization, only compute codes if first characters match
IF Rope.Equal[value, ""] THEN RETURN [FALSE, FALSE, pparsedNew];
SELECT Ascii.Upper[Rope.Fetch[value,0]] FROM
= pr.ch => match ¬ Rope.Equal[Soundex.NameToCode[value], pr.code];
< pr.ch => match ¬ FALSE;
> pr.ch => {match ¬ FALSE; nothingGreater ¬ TRUE};
ENDCASE => ERROR;
};
SoundexPrefix: PUBLIC PatternMatch.MatchProc = {
[value: ROPE, pattern: ROPE, pparsed: REFNIL] RETURNS [match: BOOLEAN, nothingGreater: BOOLEAN ← FALSE, pparsedNew: REFNIL]
Like Soundex except the code of the pattern need only be a prefix of the value's code. For example, "John" is a soundex prefix of "Jansen".
pr: PSound;
WITH pparsed SELECT FROM
ppr: PSound => pr ¬ ppr;
ENDCASE => { -- pparsed is wrong type or NIL
IF Rope.Equal[pattern, ""] THEN RETURN [FALSE, FALSE, NIL];
pr ¬ NEW[PSoundBody];
pr.pattern ¬ pattern;
pr.ch ¬ Ascii.Upper[Rope.Fetch[pattern,0]];
pr.code ¬ Soundex.NameToCode[pattern];
pr.code ¬ Rope.Substr[base: pr.code, len: Rope.Index[s1: pr.code, s2: "0"]];
};
pparsedNew ¬ pr;
as an optimization, only compute codes if first characters match
IF Rope.Equal[value, ""] THEN RETURN [FALSE, FALSE, pparsedNew];
SELECT Ascii.Upper[Rope.Fetch[value,0]] FROM
= pr.ch => match ¬ Rope.Equal[Rope.Substr[Soundex.NameToCode[value], 0, Rope.Length[pr.code]], pr.code];
< pr.ch => match ¬ FALSE;
> pr.ch => {match ¬ FALSE; nothingGreater ¬ TRUE};
ENDCASE => ERROR;
};
ParseSubrange: PROC [r: ROPE] RETURNS [start, end: ROPE] ~ {
r should be of the form "start-end".
ENABLE IO.EndOfStream, IO.Error => GOTO Bad;
ToDash: IO.BreakProc = {
[char: CHAR] RETURNS [IO.CharClass]
RETURN[SELECT char FROM
'- => sepr,
ENDCASE => other];
};
s: IO.STREAM ¬ IO.RIS[r];
start ¬ IF IO.PeekChar[s] = '" THEN IO.GetRopeLiteral[s] ELSE IO.GetTokenRope[s, ToDash].token;
IF IO.GetChar[s] # '- THEN GOTO Bad;
end ¬ IF IO.PeekChar[s] = '" THEN IO.GetRopeLiteral[s] ELSE IO.GetLineRope[s];
EXITS
Bad => RETURN[NIL, NIL];
};
PSub: TYPE ~ REF PSubBody;
PSubBody: TYPE ~ RECORD [
pattern: ROPE,
start, end: ROPE
];
Subrange: PUBLIC PatternMatch.MatchProc ~ {
[value: ROPE, pattern: ROPE, pparsed: REFNIL] RETURNS [match: BOOLEAN, nothingGreater: BOOLEAN ← FALSE, pparsedNew: REFNIL]
Checks if the value is in the range specified by the pattern. The pattern consists of two prefixes separated by a "-".
pr: PSub;
WITH pparsed SELECT FROM
ppr: PSub => pr ¬ ppr;
ENDCASE => { -- pparsed is wrong type or NIL
pr ¬ NEW[PSubBody];
pr.pattern ¬ pattern;
[pr.start, pr.end] ¬ ParseSubrange[pattern];
IF pr.start = NIL THEN GOTO BadPattern;
IF Rope.Compare[s1: pr.start, s2: pr.end, case: FALSE] = greater THEN GOTO BadPattern;
};
pparsedNew ¬ pr;
check start <= prefix(value) <= end
SELECT Rope.Compare[s1: value, s2: pr.start, case: FALSE] FROM
less => match ¬ FALSE;
equal => match ¬ TRUE;
greater =>
SELECT Rope.Compare[s1: Rope.Substr[value, 0, Rope.Length[pr.end]], s2: pr.end, case: FALSE] FROM
less => match ¬ TRUE;
equal => match ¬ TRUE;
greater => {match ¬ FALSE; nothingGreater ¬ TRUE}
ENDCASE => ERROR;
ENDCASE => ERROR;
EXITS
BadPattern => RETURN[match: FALSE, nothingGreater: TRUE, pparsedNew: NIL];
};
PNumSub: TYPE ~ REF PNumSubBody;
PNumSubBody: TYPE ~ RECORD [
pattern: ROPE,
startInt, endInt: INT
];
NumSubrange: PUBLIC PatternMatch.MatchProc ~ {
[value: ROPE, pattern: ROPE, pparsed: REFNIL] RETURNS [match: BOOLEAN, nothingGreater: BOOLEAN ← FALSE, pparsedNew: REFNIL]
Checks if the value is in the numerical range specified by the pattern. The pattern should consist of two positive integers separated by a "-". If the value can not be parsed as an integer then match=FALSE is returned; if the pattern is bad then match=FALSE and nothingGreater=TRUE is returned.
valueInt: INT;
pr: PNumSub;
WITH pparsed SELECT FROM
ppr: PNumSub => pr ¬ ppr;
ENDCASE => { -- pparsed is wrong type or NIL
start, end: ROPE;
pr ¬ NEW[PNumSubBody];
pr.pattern ¬ pattern;
[start, end] ¬ ParseSubrange[pattern];
IF start = NIL THEN GOTO BadPattern;
pr.startInt ¬ Convert.IntFromRope[start ! Convert.Error => GOTO BadPattern];
pr.endInt ¬ Convert.IntFromRope[end ! Convert.Error => GOTO BadPattern];
IF pr.startInt > pr.endInt THEN GOTO BadPattern;
};
pparsedNew ¬ pr;
check start <= value <= end
valueInt ¬ Convert.IntFromRope[value ! Convert.Error => GOTO BadValue];
match ¬ pr.startInt <= valueInt AND valueInt <= pr.endInt;
EXITS
BadPattern => RETURN[match: FALSE, nothingGreater: TRUE, pparsedNew: NIL];
BadValue => RETURN[match: FALSE, nothingGreater: FALSE, pparsedNew: pparsedNew];
};
PDateSub: TYPE ~ REF PDateSubBody;
PDateSubBody: TYPE ~ RECORD [
pattern: ROPE,
precision: Tempus.Precision,
startDate, endDate: BasicTime.GMT
];
DateSubrange: PUBLIC PatternMatch.MatchProc ~ {
[value: ROPE, pattern: ROPE, pparsed: REFNIL] RETURNS [match: BOOLEAN, nothingGreater: BOOLEAN ← FALSE, pparsedNew: REFNIL]
Checks if the value is in the chronological range specified by the pattern. The pattern should consist of two date and times (as can be parsed by Tempus) separated by a "-". If the value can not be parsed as a date and time then match=FALSE is returned; if the pattern is bad then match=FALSE and nothingGreater=TRUE is returned.
The endDate is adjusted to be the last possible time in the given precision whereas the startDate is the earliest possible time, e.g. today-today means a full day rather than one second.
valueDate: BasicTime.GMT;
pr: PDateSub;
WITH pparsed SELECT FROM
ppr: PDateSub => pr ¬ ppr;
ENDCASE => { -- pparsed is wrong type or NIL
start, end: Rope.ROPE;
pr ¬ NEW[PDateSubBody];
pr.pattern ¬ pattern;
[start, end] ¬ ParseSubrange[pattern];
IF start = NIL THEN GOTO BadPattern;
pr.startDate ¬ Tempus.Parse[rope: start, search: TRUE ! Tempus.Unintelligible => GOTO BadPattern].time;
[pr.endDate, pr.precision] ¬ Tempus.Parse[rope: end, search: TRUE ! Tempus.Unintelligible => GOTO BadPattern];
Adjust end date
SELECT pr.precision FROM
years => pr.endDate ¬ Tempus.Adjust[baseTime: pr.endDate, years: 1, seconds: -1].time;
months => pr.endDate ¬ Tempus.Adjust[baseTime: pr.endDate, months: 1, seconds: -1].time;
days => pr.endDate ¬ Tempus.Adjust[baseTime: pr.endDate, days: 1, seconds: -1].time;
hours => pr.endDate ¬ Tempus.Adjust[baseTime: pr.endDate, hours: 1, seconds: -1].time;
minutes => pr.endDate ¬ Tempus.Adjust[baseTime: pr.endDate, minutes: 1, seconds: -1].time;
ENDCASE => NULL;
IF BasicTime.Period[from: pr.startDate, to: pr.endDate] < 0 THEN GOTO BadPattern;
};
pparsedNew ¬ pr;
check start <= value <= end
valueDate ¬ Tempus.Parse[rope: value, search: TRUE ! Tempus.Unintelligible => GOTO BadValue].time;
match ¬ BasicTime.Period[from: pr.startDate, to: valueDate] >= 0 AND BasicTime.Period[from: valueDate, to: pr.endDate] >= 0;
EXITS
BadPattern => RETURN[match: FALSE, nothingGreater: TRUE, pparsedNew: NIL];
BadValue => RETURN[match: FALSE, nothingGreater: FALSE, pparsedNew: pparsedNew];
};
PDate: TYPE ~ REF PDateBody;
PDateBody: TYPE ~ RECORD [
pattern: ROPE,
patternDate: BasicTime.GMT,
patternPrecision: Tempus.Precision
];
DateAndTime: PUBLIC PatternMatch.MatchProc ~ {
[value: ROPE, pattern: ROPE, pparsed: REFNIL] RETURNS [match: BOOLEAN, nothingGreater: BOOLEAN ← FALSE, pparsedNew: REFNIL]
Checks if the value matches the pattern within the time precision of the pattern. The pattern, as well as the value, should be a date and time (as can be parsed by Tempus). If the value can not be parsed as a date and time then match=FALSE is returned; if the pattern is bad then match=FALSE and nothingGreater=TRUE is returned. A match can not occur unless the pattern is no more precise than the value. For example, a pattern of "Wednesday" will match a value of "Wednesday at 2 pm", but not vice versa.
valueDate: BasicTime.GMT;
pr: PDate;
WITH pparsed SELECT FROM
ppr: PDate => pr ¬ ppr;
ENDCASE => { -- pparsed is wrong type or NIL
pr ¬ NEW[PDateBody];
pr.pattern ¬ pattern;
[pr.patternDate, pr.patternPrecision] ¬ Tempus.Parse[rope: pattern, search: TRUE ! Tempus.Unintelligible => GOTO BadPattern];
};
pparsedNew ¬ pr;
check value = pattern
valueDate ¬ Tempus.Parse[rope: value, search: TRUE ! Tempus.Unintelligible => GOTO BadValue].time;
valueDate ¬ Tempus.Adjust[baseTime: valueDate, precisionOfResult: pr.patternPrecision].time;
match ¬ BasicTime.Period[from: pr.patternDate, to: valueDate] = 0;
EXITS
BadPattern => RETURN[match: FALSE, nothingGreater: TRUE, pparsedNew: NIL];
BadValue => RETURN[match: FALSE, nothingGreater: FALSE, pparsedNew: pparsedNew];
};
DWIM: PUBLIC PROC [pattern: ROPE] RETURNS [ptype: ROPE] ~ {
Trys to deduce an appropriate filter type from the given pattern.
IsInt: PROC [r: ROPE] RETURNS [BOOLEAN ¬ TRUE] ~ {
[] ¬ Convert.IntFromRope[r ! Convert.Error => GOTO NotInt];
EXITS
NotInt => RETURN[FALSE];
};
IsTime: PROC [r: ROPE] RETURNS [BOOLEAN ¬ TRUE] ~ {
[] ¬ Tempus.Parse[rope: r, search: TRUE ! Tempus.Unintelligible => GOTO NotTime];
EXITS
NotTime => RETURN[FALSE];
};
start, end: ROPE;
[start, end] ¬ ParseSubrange[pattern];
IF NOT (start=NIL AND end=NIL)
THEN ptype ¬ SELECT TRUE FROM -- determine what type of subrange
IsTime[start] AND IsTime[end] => "daterange",
IsInt[start] AND IsInt[end] => "numrange",
ENDCASE => "subrange"
ELSE ptype ¬ SELECT TRUE FROM -- not a subrange
Rope.Find[pattern, "*"]#-1 => "wildcard",
Rope.Length[pattern]>2 AND IsTime[pattern] => "date",
ENDCASE => "prefix";
};
CheckPattern: PUBLIC PROC [pattern: ROPE, proc: PatternMatch.MatchProc] RETURNS [ok: BOOLEAN ¬ TRUE, info: ROPE ¬ NIL] ~ {
Checks if the given pattern is suitable for input to the given pattern matcher. For instance, proc=DateSubrange returns ok=TRUE only if the pattern is a valid date range. The returned info is a human readable analysis of the pattern.
pparsed: REF;
IF proc=NIL THEN RETURN[TRUE, NIL];
pparsed ¬ proc[value: "", pattern: pattern].pparsedNew;
ok ¬ pparsed#NIL;
WITH pparsed SELECT FROM
pdsub: PDateSub => info ¬ Rope.Cat[Tempus.MakeRope[time: pdsub.startDate, precision: pdsub.precision], "-", Tempus.MakeRope[time: pdsub.endDate, precision: seconds]];
pdate: PDate => info ¬ Tempus.MakeRope[time: pdate.patternDate, precision: pdate.patternPrecision];
ENDCASE => info ¬ NIL;
};
Pattern matching registration mechanism.
matchTable: SymTab.Ref;
Lookup: PUBLIC PROC [name: ROPE] RETURNS [proc: PatternMatch.MatchProc] ~ {
Get a pattern matcher by name.
pr: REF PatternMatch.MatchProc ¬ NARROW[SymTab.Fetch[matchTable, name].val];
RETURN[IF pr#NIL THEN pr­ ELSE NIL];
};
Register: PUBLIC PROC [name: ROPE, proc: PatternMatch.MatchProc] ~ {
Register a name for a pattern matcher.
IF proc#NIL
THEN [] ¬ SymTab.Store[matchTable, name, NEW[PatternMatch.MatchProc ¬ proc]]
ELSE [] ¬ SymTab.Delete[matchTable, name];
};
Pairs: PUBLIC PROC [action: PatternMatch.EachPairAction] RETURNS [BOOL] ~ {
... enumerates pairs currently in the pattern registry in unspecified order; applies action to each pair until action returns TRUE or no more pairs; returns TRUE if some action returns TRUE.
Action: SymTab.EachPairAction = {
[key: ROPE, val: SymTab.Val] RETURNS [quit: BOOL ← FALSE]
pr: REF PatternMatch.MatchProc ¬ NARROW[val];
RETURN[action[key, pr­]];
};
RETURN[SymTab.Pairs[matchTable, Action]];
};
There should be only one refIDTable per machine, whether or not the package is run more than once.
matchTable ¬ NARROW[Atom.GetProp[$PatternMatch, $matchTable]];
IF matchTable = NIL THEN {
matchTable ¬ SymTab.Create[case: FALSE];
Atom.PutProp[$PatternMatch, $matchTable, matchTable];
};
Registrations
Register["exact", Equal];
Register["prefix", Prefix];
Register["wildcard", Wildcard];
Register["re", RE];
Register["soundexact", SoundexMatch];
Register["soundex", SoundexPrefix];
Register["subrange", Subrange];
Register["numrange", NumSubrange];
Register["daterange", DateSubrange];
Register["date", DateAndTime];
END.
Doug Terry, April 14, 1987 1:28:07 pm PDT
Extracted from LoganQueryImpl and LoganBerryBrowserImpl; replaced single-element caches with pparsed arguments.
changes to: DIRECTORY, PatternMatchImpl, EXPORTS, ~, Equal, Prefix, Wildcard, RE, Soundex, Subrange, NumSubrange, DateSubrange, DateAndTime
Doug Terry, July 20, 1987 8:40:49 pm PDT
Added registration mechanism for pattern matchers; moved DWIM and CheckPattern from LoganQuery.
changes to: DIRECTORY, Equal, Prefix, Wildcard, ~, DateSubrange, DateAndTime, DWIM, CheckPattern, Register