Spelling Corrector Edited by Teitelman on March 24, 1983 4:24 pm
This program accepts an input string and a list of candidate
strings and returns either NIL or the corrected string
DIRECTORY
Generator USING [CreateGenerator, ReStart, Terminate, Generate, Handle],
List USING [DReverse],
Rope USING [Find, Fetch, Match, Equal, IsEmpty, Length, Cat, Compare, Flatten, FromRefText, Letter, ROPE, Text],
RopeInline USING [InlineFlatten],
SafeStorage USING [NarrowRefFault],
ShowTime USING [Microseconds, GetMark, SinceMark],
Spell USING [ROPE, Modes, ModesRecord, SpellingList, SpellingGenerator, AbortProc, InformProc, ConfirmProc, Filter, CorrectionClass, SpellingGeneratorRecord],
UserProfile USING [CallWhenProfileChanges, ProfileChangedProc, Token, Number, Boolean]
;
SpellImpl: CEDAR MONITOR
IMPORTS List, Rope, RopeInline, ShowTime, UserProfile, Generator, SafeStorage
EXPORTS Spell
SHARES Rope
-- to be able to say text.length
= BEGIN OPEN Spell;
constants
fastypeflg: BOOLEAN = TRUE;
Percentage: TYPE = REAL;
howClose: REAL = .7;
modes
defaultModes: PUBLIC Modes ← NEW[ModesRecord ←
[inform: NULL,
confirm: NULL,
disabled: NULL,
timeout: NULL,
defaultConfirm: NULL
]];
SetModes: UserProfile.ProfileChangedProc = CHECKED {
GetCorrectionClass: SAFE PROC [key: ROPE, default: ROPE, rope: ROPENIL] RETURNS[CorrectionClass] = {
r: ROPE = IF rope # NIL THEN rope ELSE UserProfile.Token[key, default];
FOR l: LIST OF RECORD[rope: ROPE, class: CorrectionClass] ← LIST[["never", never], ["someMistakes", someMistakes], ["allAccountedFor", allAccountedFor], ["caseError", caseError], ["patternMatch", patternMatch], ["always", always]], l.rest UNTIL l = NIL DO
IF Rope.Equal[r, l.first.rope, FALSE] THEN RETURN[l.first.class];
ENDLOOP;
IF rope = NIL THEN RETURN[GetCorrectionClass[key, default, default]]; -- user has somethig else in the profile, return the default. PRINT A MESSAGE?
ERROR; -- shouldnt happen
};
modes: REF ModesRecord;
i: INT;
TRUSTED {modes ← LOOPHOLE[defaultModes]}; -- defaultmodes is REF READONLY
modes.inform ← GetCorrectionClass["Spell.inform", "allAccountedFor"];
modes.confirm ← GetCorrectionClass["Spell.confirm", "allAccountedFor"];
modes.disabled ← GetCorrectionClass["Spell.disabled", "never"];
modes.timeout ← UserProfile.Number["Spell.Timeout", -1];
modes.defaultConfirm ← UserProfile.Boolean["Spell.defaultConfirm", FALSE];
i ← UserProfile.Number["Spell.giveUpAfter" , -1];
IF i > 0 THEN giveUpAfter ← (i * 1000) ELSE giveUpAfter ← 0; -- giveUpAfter is in milliseconds in profile, but need it in microseconds to compare using ShowTime.SinceMark.
};
Pattern Matching
FOR NOW, UNTIL TIOGA PATTERN MATCHER AVAILABLE
IsAPattern: PUBLIC PROC [unknown: ROPE] RETURNS[BOOL] = {
i: INT ← 0;
WHILE (i ← Rope.Find[s1: unknown, s2: "*", pos1: i]) # -1 DO
IF i = 0 OR Rope.Fetch[unknown, i - 1] # '' THEN RETURN[TRUE];
i ← i + 1;
ENDLOOP;
RETURN[FALSE];
}; -- of IsAPattern
Match: PROC [pattern, object: ROPE] RETURNS[BOOL] = {
RETURN[Rope.Match[ pattern: pattern, object: object, case: FALSE]];
};
CaseEqual: PROCEDURE [x, y: ROPE] RETURNS[BOOLEAN] = INLINE {
RETURN[Rope.Compare[x, y, FALSE] = equal];
}; -- of CaseEqual
GetTheOne, GetMatchingList
GetTheOne: PUBLIC PROCEDURE
[
unknown: ROPE,
spellingList: SpellingList ← NIL,
generator: SpellingGenerator ← NIL,
abort: AbortProc ← NIL,-- abort = NIL => never abort.
confirm: ConfirmProc ← NIL, -- confirm = NIL => not confirmed.
inform: InformProc ← NIL,-- inform = NIL => no output.
filter: Filter ← NIL,
modes: Modes ← NIL
]
RETURNS [ROPE] = {
correct, r: ROPE;
correctionClass: CorrectionClass;
IF Rope.IsEmpty[unknown] THEN RETURN[NIL];
IF modes = NIL THEN modes ← defaultModes;
IF IsAPattern[unknown] THEN
{l: LIST OF ROPE;
IF modes.disabled >= patternMatch THEN RETURN[NIL];
l ← GetMatches[unknown: unknown, spellingList: spellingList, generator: generator, abort: abort, filter: filter];
IF l # NIL AND l.rest = NIL THEN RETURN[l.first] ELSE RETURN[NIL]
};
IF Rope.Length[unknown] = 0 OR modes.disabled = always THEN RETURN[NIL];
[correct, correctionClass] ← Choose[unknown: unknown, spellingList: spellingList, generator: generator, filter: filter, abort: abort];
IF correct = NIL OR correctionClass <= modes.disabled OR Rope.Equal[correct, unknown] THEN RETURN[NIL];
r ← Rope.Cat[unknown, " -> ", correct];
IF correctionClass <= modes.confirm THEN
RETURN[IF confirm # NIL AND confirm[msg: r, timeout: modes.timeout, defaultConfirm: modes.defaultConfirm] THEN correct ELSE NIL];
IF correctionClass <= modes.inform AND inform # NIL THEN inform[r];
RETURN[correct];
};
GetMatchingList: PUBLIC PROCEDURE
[
pattern: ROPE,
spellingList: SpellingList ← NIL,
generator: SpellingGenerator ← NIL,
abort: AbortProc ← NIL,
confirm: ConfirmProc ← NIL,
inform: InformProc ← NIL,
filter: Filter ← NIL,
modes: Modes ← NIL
]
RETURNS [LIST OF ROPE] = {
values: LIST OF ROPE;
IF Rope.IsEmpty[pattern] THEN RETURN[NIL];
IF modes = NIL THEN modes ← defaultModes;
IF ~IsAPattern[pattern] THEN
{x: ROPE = GetTheOne[unknown: pattern, spellingList: spellingList, generator: generator, abort: abort, confirm: confirm, inform: inform, filter: filter, modes: modes];
IF x # NIL THEN RETURN[LIST[x]] ELSE RETURN[NIL];
};
IF modes.disabled >= patternMatch THEN RETURN[NIL];
values ← GetMatches[unknown: pattern, spellingList: spellingList, generator: generator, abort: abort, filter: filter];
IF values = NIL THEN RETURN[NIL];
IF modes.confirm >= patternMatch THEN
{r: ROPE;
FOR lst: LIST OF ROPE ← values, lst.rest UNTIL lst = NIL DO
r ← Rope.Cat[r, lst.first, " "];
ENDLOOP;
IF confirm # NIL AND confirm[msg: Rope.Cat[pattern, " -> ", r], timeout: modes.timeout, defaultConfirm: modes.defaultConfirm] THEN RETURN[values]
ELSE RETURN[NIL];
};
IF inform # NIL AND modes.inform >= patternMatch THEN
{r: ROPE;
FOR lst: LIST OF ROPE ← values, lst.rest UNTIL lst = NIL DO
r ← Rope.Cat[r, lst.first, " "];
ENDLOOP;
inform[msg: Rope.Cat[pattern, " -> ", r]] ;
};
RETURN[values];
};
Choose, Score
giveUpAfter: ShowTime.Microseconds; -- microSeconds
startedAt: ShowTime.Microseconds;
Choose: PROCEDURE[
unknown: ROPE,
spellingList: SpellingList,
generator: SpellingGenerator,
filter: Filter ← NIL,
abort: AbortProc ← NIL
] RETURNS [val: ROPE, correctionClass: CorrectionClass] = {
private: REF SpellingGeneratorPrivateRecord ← IF generator = NIL THEN NIL ELSE generator.private;
Choose0: PROC = {
lst: LIST OF ROPENIL;
candidate: REF ANY;
tword: Rope.Text;
nctword, ncxword, sc: NAT;
close: BOOLEAN;
howClose1, tem: Percentage ← howClose;
MakeRope: PROC [candidate: REF ANY] RETURNS [text: Rope.ROPE] = {
WITH candidate SELECT FROM
t: REF TEXT => RETURN[Rope.FromRefText[t]];
r: Rope.ROPE => RETURN[r];
ENDCASE => ERROR; -- should already have determined that it is either a REF TEXT or ROPE
};
IF giveUpAfter > 0 THEN startedAt ← ShowTime.GetMark[];
DO
IF abort # NIL THEN abort[];
IF giveUpAfter > 0 AND ShowTime.SinceMark[from: startedAt] > giveUpAfter THEN EXIT;
IF generator # NIL THEN
TRUSTED {
candidate ← private.generate[generator];
IF candidate = NIL THEN EXIT;
WITH candidate SELECT FROM
t: REF TEXT => tword ← LOOPHOLE[t, Rope.Text]; -- will be copied if it is going to be retained.
r: Rope.Text => tword ← r;
r: Rope.ROPE => tword ← RopeInline.InlineFlatten[r];
ENDCASE => ERROR SafeStorage.NarrowRefFault[ref: candidate, targetType: CODE[Rope.ROPE]];
}
ELSE IF spellingList = NIL THEN EXIT
ELSE
{
tword ← RopeInline.InlineFlatten[spellingList.first];
candidate ← tword;
spellingList ← spellingList.rest;
};
nctword ← tword.length;
ncxword ← xword.length;
IF (IF nctword > ncxword THEN (ncxword + 0.0) / nctword < howClose1
ELSE (nctword + 0.0) / (ncxword - ndbls) < howClose1)
THEN LOOP; -- Checks to see if test word and unknown word differ sufficiently in number of characters so as to make it unnecessary to even call score. First case is where test word is longer than xword. If number of characters in xword, divided by number of characters in twest word is less than REL, then dont bother. Second case where xword is longer than test word, must allow for possibility of dobuled characters.
[close, sc, ncxword] ← Score[xword, tword];  
SELECT TRUE FROM
~ close => LOOP;
filter # NIL AND ~filter[candidateRope: tword, unknown: unknown] => LOOP;
sc = 0 => {
val ← MakeRope[candidate];
IF CaseEqual[val, unknown] THEN correctionClass ← caseError ELSE correctionClass ← allAccountedFor;
RETURN;
};
(tem ← Choose1[xword.length, tword.length, sc]) > howClose1 => {
lst ← CONS[MakeRope[candidate], NIL];
howClose1 ← tem
};
sc = howClose1 => lst ← CONS[MakeRope[candidate], lst]; -- might be something better later on
ENDCASE;
ENDLOOP; -- end of main loop
IF lst # NIL AND lst.rest = NIL THEN {
val ← lst.first;
correctionClass ← someMistakes;
};
}; -- of Choose0
isAPattern: BOOL = IsAPattern[unknown];
xword: Rope.Text ;
c: CHARACTER;
ndbls: NAT ← 0;
xword ← RopeInline.InlineFlatten[unknown];
IF unknown = NIL OR (spellingList = NIL AND generator = NIL) THEN RETURN [NIL, never];
c ← xword[0];
FOR i: NAT IN [1 ..xword.length) DO
IF c = xword[i] THEN ndbls ← ndbls + 1;
c ← xword[i]; -- computes number of double characters in string.
ENDLOOP;
IF generator # NIL THEN
{IF private.initialize # NIL THEN private.initialize[generator]};
Choose0[
!
ABORTED =>
IF generator # NIL AND private.terminate # NIL THEN private.terminate[generator];
UNWIND => IF generator # NIL AND private.terminate # NIL THEN private.terminate[generator];
];
IF generator # NIL AND private.terminate # NIL THEN private.terminate[generator];
}; -- of Choose
Choose1: PROCEDURE[a, b, sc: NAT] RETURNS [REAL] = INLINE {
computes the relative closeness as a percentage (times 100) by dividing the difference
between the average number of characters and the number of mistakes,
over the average number of characters. This is (((a + b)/2)-sc)/ (a + b)/2
Multiplying top and bottom by two gives (a + b-2*sc)/(a + b))
RETURN[(a + b - 2 * sc + 0.0) / (a + b)];
}; -- of Choose1
Score: PROCEDURE[xword, tword: Rope.Text]
RETURNS[close: BOOLEAN, mistakes: NAT, ncxword: NAT] = {
xpos, tpos: NAT ← 0; -- current character position in xword,tword
xc, tc: CHARACTER; -- current character in xword, tword, equal to xword[xpos],tword[tpos] but used to improve readability
xclast: CHARACTER; -- last character looked at in xword. used for double character detection.
empty: NAT = LAST[NAT]; --indicates a buffer is empty
x1, x2, t1, t2: NAT ← empty; -- position of characters that did not match in xword, tword respectively. These are effectively single character buffers.
n, ntrans: NAT ← 0; -- number of mistakes, number of transpositions
CheckCharBuffer: PROCEDURE[currchar: CHARACTER, word: Rope.Text, buffer, currpos: NAT] RETURNS[BOOLEAN] = INLINE {
IF buffer # empty AND
(currchar = word[buffer] OR currchar = word[buffer] + 32 AND
currchar >= 'a OR word[buffer] = currchar + 32 AND word[buffer] >= 'a)
THEN
{IF currpos -buffer <= 3 THEN ntrans ← ntrans + 1 -- count it as a transposition
ELSE n ← n + 1; -- shifted more than two characters, count it as an error
RETURN[TRUE]
};
RETURN[FALSE];
}; -- of CheckCharBuffer
NotAlpha: PROC[word: Rope.Text, buffer: NAT] RETURNS[BOOLEAN] = INLINE
{RETURN[NOT Rope.Letter[word[buffer]]];
};
ncxword ← xword.length;
DO -- main body of score is one big loop
SELECT TRUE FROM
xpos = xword.length => IF tpos = tword.length THEN EXIT ELSE tc ← tword[tpos]; --otherwise drop thru and check single character buffers
tpos = tword.length => xc ← xword[xpos]; -- drop through to check xc against tbuffers
(xc ← xword[xpos]) = (tc ← tword[tpos])
OR (SELECT tc FROM
IN['!..'/] => xc = tc + 16, --!../ to corresponding unshifted characters
IN['1..'?] => xc = tc - 16, -- 1..? to corresponding shifted characters
ENDCASE => FALSE)
OR xc = tc + 32 AND xc >= 'a -- xc is lower case equivalent of tc
OR tc = xc + 32 AND tc >= 'a -- tc is lower case equivalent of xc
OR tc = '1 AND (xc = 'L OR xc = 'l) -- misreading of manual
=>
{xpos ← xpos + 1; tpos ← tpos + 1; xclast ← xc; LOOP};
ENDCASE;
at this point, the current characters in xword and tword do not match (or there isnt any more of tword). so now we check the character against the single character buffers for tword.
SELECT TRUE FROM
xpos = xword.length => NULL;
CheckCharBuffer[buffer: t2, currchar: xc, word: tword, currpos: tpos] => -- character encountered in tword before xword, e.g. the P in IPRNT vs PRINT. The case of RPINT vs PRINT is handled specially without ever going to the buffers
{t2 ← empty; xpos ← xpos + 1; xclast ← xc; LOOP};
CheckCharBuffer[buffer: t1, currchar: xc, word: tword, currpos: tpos] =>
{IF t2 # empty THEN {t1 ← t2; t2 ← empty}
ELSE t1 ← empty;
xpos ← xpos + 1;
xclast ← xc;
LOOP;
};
ENDCASE;
check current character in tword against xbuffers
SELECT TRUE FROM
tpos = tword.length => NULL;
CheckCharBuffer[buffer: x2, currchar: tc, word: xword, currpos: xpos] => -- character encountered in xword before tword, e.g. the I in IPRNT vs PRINT.
{x2 ← empty; tpos ← tpos + 1; LOOP};
CheckCharBuffer[buffer: x1, currchar: tc, word: xword, currpos: xpos] =>
{IF x2 # empty THEN {x1 ← x2; x2 ← empty}
ELSE x1 ← empty;
tpos ← tpos + 1;
xclast stays the same
LOOP;
};
xpos
+ 1 < xword.length AND tpos + 1 < tword.length
AND xc = tword[tpos + 1] AND tc = xword[xpos + 1]
AND (tpos + 2 = tword.length OR tc # tword[tpos + 2]) =>
special check for most common case of transposition. the last clause is an attempt to distinguish the case of a transposition from simply getting out of synch, e.g. consider MYCIN vs MICIN. The Y is discarded, and then we are comparing CIN with ICIN. Treating CI as a transposition of IC is wrong in this case, since it matches with CI if the I is discarder
{xclast ← xword[xpos + 1];
xpos ← xpos + 2;
tpos ← tpos + 2;
ntrans ← ntrans + 1;
LOOP;
};
tword.length - tpos > xword.length - xpos => -- more characters left in tword than xword, remove character from tword
SELECT TRUE FROM
t1 = empty =>
{t1 ← tpos; -- save character in t1
tpos ← tpos + 1;
LOOP};
t2 = empty =>
{t2 ← tpos;
tpos ← tpos + 1;
LOOP};
ENDCASE => GOTO Failed; -- too many errors. abort
ENDCASE;  
IF xc = xclast OR (xpos + 1 < xword.length AND xc = xword[xpos + 1]) THEN -- double characters
{xpos ← xpos + 1;
ncxword ← ncxword - 1;
} -- When computing value of score, want to divide number of mistakes by actual lenghth of word i.e. length minus number of doubled characters. Otehrwise, making a word longer by adding extra characters will make it closer, e.g. ZZZZZZ would correct to PP.
ELSE
{IF x1 = empty THEN x1 ← xpos
ELSE IF x2 = empty THEN x2 ← xpos
ELSE GOTO Failed;
xpos ← xpos + 1;
};
ENDLOOP;
finished scanning both words, decide on score
IF x2 # empty THEN {
IF NotAlpha[xword, x2] THEN GOTO Failed; -- e.g. if unknown contains a $, and thats not accounted for, then shouldnt correct.
n ← n +2;
}
ELSE IF x1 # empty THEN
{IF NotAlpha[xword, x1] THEN GOTO Failed;
n ← n +1;
};
IF t2 # empty THEN
{IF NotAlpha[tword, t2] THEN GOTO Failed;
n ← n +2;
}
ELSE IF t1 # empty THEN
{IF NotAlpha[tword, t1] THEN GOTO Failed;
n ← n +1};
IF xpos = xword.length AND tpos = tword.length AND t1 # empty AND x1 # empty
THEN
{IF t1 = x1 OR t1 = x2 THEN n ← n-1;
IF t2 # empty AND (t2 = x1 OR t2 = x2) THEN n ← n-1;
}; -- check for substitution errors. subtracts one so net effect is only counted as one.
RETURN[TRUE, (IF n <= 1 THEN n ELSE n + ntrans), ncxword] ; -- count transpositions as errors if more than one other mistake. (maybe should go to fractional errors?)
EXITS
Failed => RETURN[FALSE, 0, 0];
}; -- of Score
GetMatches
GetMatches: PROCEDURE[
unknown: ROPE,
spellingList: SpellingList ← NIL,
generator: SpellingGenerator ← NIL,
filter: Filter ← NIL,
abort: AbortProc ← NIL
] RETURNS [lst: LIST OF ROPE] = {
private: REF SpellingGeneratorPrivateRecord ← IF generator = NIL THEN NIL ELSE generator.private;
GetMatches0: PROC = {
candidate: REF ANY;
tword: Rope.Text;
MakeRope: PROC [candidate: REF ANY] RETURNS [text: Rope.ROPE] = {
WITH candidate SELECT FROM
t: REF TEXT => RETURN[Rope.FromRefText[t]];
r: Rope.ROPE => RETURN[r];
ENDCASE => ERROR; -- should already have determined that it is either a REF TEXT or ROPE
};
main body of GetMatches0
DO
IF abort # NIL THEN abort[];
IF generator # NIL THEN
{
candidate ← private.generate[generator];
IF candidate = NIL THEN EXIT;
WITH candidate SELECT FROM
t: REF TEXT => TRUSTED {tword ← LOOPHOLE[t, Rope.Text]}; -- will be copied if it is going to be retained.
r: Rope.Text => tword ← r;
r: Rope.ROPE => tword ← Rope.Flatten[r];
ENDCASE => ERROR SafeStorage.NarrowRefFault[ref: candidate, targetType: CODE[Rope.ROPE]];
}
ELSE IF spellingList = NIL THEN EXIT
ELSE {candidate ← tword ← RopeInline.InlineFlatten[spellingList.first]; spellingList ← spellingList.rest};
IF Match[xword, tword] THEN
{IF filter # NIL AND ~filter[candidateRope: tword, unknown: unknown] THEN LOOP;
lst ← CONS[MakeRope[candidate], lst];
};
ENDLOOP;
TRUSTED {
IF lst # NIL AND lst.rest # NIL THEN lst ← LOOPHOLE[List.DReverse[LOOPHOLE[lst, LIST OF REF ANY]], LIST OF ROPE];
};
}; -- of GetMatches0
xword: Rope.Text;
xword ← RopeInline.InlineFlatten[unknown];
IF unknown = NIL OR (spellingList = NIL AND generator = NIL) THEN RETURN [NIL];
IF generator # NIL THEN
{IF private.initialize # NIL THEN private.initialize[generator]};
GetMatches0[];
IF generator # NIL THEN
{IF private.terminate # NIL THEN private.terminate[generator]};
}; -- of GetMatches
generators
connecting concrete and opaque types
SpellingGeneratorPrivateRecord: PUBLIC TYPE = RECORD[
initialize: PROCEDURE[self: SpellingGenerator] ← NIL,
generate: PROCEDURE [self: SpellingGenerator] RETURNS [candidate: REF ANY],
terminate: PROCEDURE[self: SpellingGenerator] ← NIL -- to be called when finished.
];
GeneratorFromProcs: PUBLIC PROC
[
initialize: PROCEDURE[self: SpellingGenerator] ← NIL,
generate: PROCEDURE [self: SpellingGenerator] RETURNS [candidate: REF ANY],
terminate: PROCEDURE[self: SpellingGenerator] ← NIL, -- to be called when finished.
clientData: REF ANYNIL
]
RETURNS [SpellingGenerator] = {
RETURN[NEW[Spell.SpellingGeneratorRecord ←
[
clientData: clientData,
private: NEW[SpellingGeneratorPrivateRecord ←
[
initialize: initialize,
generate: generate,
terminate: terminate
]
]
]
]];
};
GeneratorFromEnumerator: PUBLIC PROC [enumerator: PROC[self: Generator.Handle],  clientData: REF ANY] RETURNS [SpellingGenerator] = {
RETURN[NEW[Spell.SpellingGeneratorRecord ←
[
clientData: NEW[EnumeratorRecord ← [enumerator: enumerator, origClientData: clientData]],
private: NEW[SpellingGeneratorPrivateRecord ←
[
initialize: Initialize,
generate: Generate,
terminate: Terminate
]
]
]
]]
};
EnumeratorRecord: TYPE = RECORD
[
enumerator: PROC [self: Generator.Handle],
origClientData: REF ANY, -- the client data that was the argument to GeneratorFromEnumerator. Gets put into the clientData field of the generator.
generator: Generator.Handle ← NIL -- not created until generator is actually used the first time.
];
Initialize: PROC [self: SpellingGenerator] = {
state: REF EnumeratorRecord ← NARROW[self.clientData];
IF state.generator = NIL THEN state.generator ← Generator.CreateGenerator[state.enumerator, state.origClientData]
ELSE Generator.ReStart[state.generator];
}; -- of Initialize
Generate: PROC [self: SpellingGenerator] RETURNS[ROPE] = {
state: REF EnumeratorRecord ← NARROW[self.clientData];
val: REF ANY = Generator.Generate[state.generator];
RETURN[IF val = state.generator THEN NIL ELSE NARROW[val, ROPE]];
};
Terminate: PROC [self: SpellingGenerator] = {
state: REF EnumeratorRecord ← NARROW[self.clientData];
Generator.Terminate[state.generator];
};
Note: file correction is implemented in FileSpellImpl because it needs a separate monitor.
UserProfile.CallWhenProfileChanges[SetModes];
END.