Parse:
PUBLIC
PROCEDURE [s:
LONG
STRING]
RETURNS [dt: System.GreenwichMeanTime, notes: DateAndTimeUnsafe.Notes, length: NAT] = {
maxReasonableTokens: NAT = 30;
TokenCount: TYPE = NAT[0..maxReasonableTokens];
TokenIndex: TYPE = NAT[FIRST[TokenCount]..LAST[TokenCount]);
CharIndex: TYPE = NAT;
CharCount: TYPE = NAT[0..37777B]; -- this upper bound saves a word in Token
Tokens:
TYPE =
RECORD [
nTokens: TokenCount ← 0,
tokens: SEQUENCE length: TokenCount OF Token ← NULL];
Token:
TYPE =
RECORD [
offset: CharIndex,
kind:
SELECT type: *
FROM
alpha => [length: CharCount],
num => [length: CharCount],
sep => [char: CHARACTER],
ENDCASE];
input: LONG POINTER TO Tokens ← Heap.systemZone.NEW[Tokens[maxReasonableTokens] ← []];
sentinel: CHARACTER = 200C;
Bogus: PROCEDURE [ti: TokenIndex] = {ERROR Unintelligible[input.tokens[ti].offset]};
AddToken:
PROCEDURE [t: Token] = {
If there are more than maxReasonableTokens, each subsequent call to this procedure will overwrite the last token stored. Eventually, the scan will complete and the final token will be a sentinel.
IF input.nTokens = input.length THEN input.tokens[input.nTokens-1] ← t
ELSE {input.tokens[input.nTokens] ← t; input.nTokens ← input.nTokens + 1};
};
Tokenize:
PROCEDURE = {
state: {initial, num, alpha} ← initial;
i: NAT ← 0;
tStart: NAT;
AddToken[[0, sep[sentinel]]];
DO
char: CHARACTER;
SELECT
TRUE
FROM
s = NIL, i = s.length => char ← sentinel;
s[i] = Ascii.TAB => char ← Ascii.SP;
s[i] IN [40C..176C] => char ← s[i];
ENDCASE => --illegal; terminate scan-- char ← sentinel;
SELECT char
FROM
IN ['0..'9] =>
SELECT state
FROM
initial => {state ← num; tStart ← i};
num => NULL;
alpha => {AddToken[[tStart, alpha[i-tStart]]]; state ← num; tStart ← i};
ENDCASE;
',, '., ':, '/, '-, '+, Ascii.
SP, sentinel => {
SELECT state
FROM
initial => NULL;
num => {AddToken[[tStart, num[i-tStart]]]; state ← initial};
alpha => {AddToken[[tStart, alpha[i-tStart]]]; state ← initial};
ENDCASE;
The effect of the following is to ignore spaces as separators unless there is nothing else, and to terminate the scan if two separator characters are adjacent.
WITH t: input.tokens[input.nTokens-1]
SELECT
FROM
sep =>
IF t.char = Ascii.SP THEN t.char ← char
ELSE IF char ~= Ascii.SP THEN --illegal-- t.char ← char ← sentinel;
ENDCASE => AddToken[[i, sep[char]]];
IF char = sentinel THEN EXIT};
ENDCASE =>
SELECT state
FROM
initial => {state ← alpha; tStart ← i};
num => {AddToken[[tStart, num[i-tStart]]]; state ← alpha; tStart ← i};
alpha => NULL;
ENDCASE;
i ← i + 1;
ENDLOOP;
year: Year;
month: Month;
day: Day;
hour: Hour;
minute: Minute;
second: Second;
zoneAdjust: DeltaMinutes;
dst: BOOLEAN;
cumDays:
ARRAY [
FIRST[Month]-1..
LAST[Month]]
OF
NAT = [
0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366];
OutOfRange: ERROR [errorPos: NAT] = CODE;
CollectValue:
PROCEDURE [t: num Token, low, high:
NAT]
RETURNS [value:
NAT ← 0] = {
FOR j: CharIndex
IN [t.offset..t.offset+t.length)
DO
We would like to catch a bounds fault here and turn it into OutOfRange. Unfortunately, the current implementation of catch phrases doesn't support this, since a catch phrase is only attached to procedure calls. Accordingly, we have to do the test by hand. The actual test is a bit too stringent, but it doesn't really matter.
IF value > LAST[NAT]/10 - 1 THEN ERROR OutOfRange[j];
value ← value * 10 + (s[j] - '0);
ENDLOOP;
IF ~(value IN [low..high]) THEN ERROR OutOfRange[t.offset+t.length];
};
ParseDate:
PROCEDURE [first: TokenIndex]
RETURNS [next: TokenIndex] = {
BogusMonth: ERROR = CODE;
ParseSymbolicMonth:
PROCEDURE [t: alpha Token]
RETURNS [month: Month] = {
MonthNames: TYPE = ARRAY Month OF STRING;
ss: String.SubStringDescriptor ← [base: s, offset: t.offset, length: t.length];
SELECT String.UpperCase[s[t.offset]]
FROM
'I, 'V, 'X => {
roman: MonthNames ← [
"I"L, "II"L, "III"L, "IV"L, "V"L, "VI"L, "VII"L, "VIII"L, "IX"L, "X"L, "XI"L, "XII"L];
FOR month
IN Month
DO
mm: String.SubStringDescriptor ←
[base: roman[month], offset: 0, length: roman[month].length];
IF String.EquivalentSubStrings[@mm, @ss] THEN EXIT;
REPEAT
FINISHED => ERROR BogusMonth;
ENDLOOP
};
ENDCASE => {
english: MonthNames ← [
"January"L, "February"L, "March"L, "April"L, "May"L, "June"L,
"July"L, "August"L, "September"L, "October"L, "November"L, "December"L];
IF t.length < 3 THEN ERROR BogusMonth;
FOR month
IN Month
DO
mm: String.SubStringDescriptor ←
[base: english[month], offset: 0, length: t.length];
IF String.EquivalentSubStrings[@mm, @ss] THEN EXIT;
REPEAT
FINISHED => ERROR BogusMonth;
ENDLOOP
};
};
CollectYear:
PROCEDURE [y: num Token]
RETURNS [Year] = {
RETURN[1900+CollectValue[y, FIRST[Year]-1900, 99 ! OutOfRange => CONTINUE]];
RETURN[CollectValue[y, FIRST[Year], LAST[Year]]]
};
CollectMonth:
PROCEDURE [m: num Token]
RETURNS [Month] =
INLINE
{RETURN[CollectValue[m, FIRST[Month], LAST[Month]]]};
CollectDay:
PROCEDURE [d: num Token, m: Month]
RETURNS [Day] =
INLINE
{RETURN[CollectValue[d, FIRST[Day], cumDays[m]-cumDays[m-1]]]};
DO
ENABLE OutOfRange => ERROR Unintelligible[errorPos];
second, third: TokenIndex;
t: num Token;
GetThird:
PROCEDURE = {
WITH token: input.tokens[third]
SELECT
FROM
num => t ← token;
ENDCASE => Bogus[third]
};
IF input.nTokens < first + 3 OR input.tokens[first].type = sep THEN Bogus[first];
Assert: 'first' is alpha or num and at least two tokens follow it.
IF input.tokens[second𡤏irst+1].type = sep THEN second ← second + 1;
Assert: 'second' is alpha or num.
Ergo, second+1 < input.nTokens since input.tokens[input.nTokens-1].type = sep
IF input.tokens[third←second+1].type = sep
THEN {
IF third = input.nTokens-1 THEN Bogus[third];
third ← third + 1};
Assert: 'third' is alpha or num. If we are really processing a date, 'third' will have to be numeric to make sense. However, we might be processing something like Wednesday, 26 May 1981, in which there is a valid date, but 'first' isn't pointing to it yet (it's still on Wednesday). So, we don't check that 'third' is numeric until we are sure that 'first' is pointing to something reasonable.
WITH f: input.tokens[first]
SELECT
FROM
alpha => {
The first token is alpha. If it isn't a recognizable month, it might be some form of the day of the week. Whatever it is, we simply skip over it (and the separator following it, if any) and try again.
month ← ParseSymbolicMonth[f ! BogusMonth => {first ← second; LOOP}];
'first' corresponds to a valid month. We now require that 'second' be the day of the month and 'third' be the year.
WITH d: input.tokens[second]
SELECT
FROM
num => day ← CollectDay[d, month];
ENDCASE => Bogus[second];
GetThird[];
year ← CollectYear[t]};
num => {
GetThird[];
WITH s: input.tokens[second]
SELECT
FROM
alpha => {
The second token is alpha, so we will require it to be the month.
month ← ParseSymbolicMonth[s ! BogusMonth => Bogus[second]];
Now we must decide if we have <day><month><year> or <year><month><day>
day ← CollectDay[f, month ! OutOfRange => GO TO yearFirst];
year ← CollectYear[t];
EXITS
yearFirst => {
year ← CollectYear[f];
day ← CollectDay[t, month]}};
num => {
All three parts are numeric.
year ← CollectYear[t ! OutOfRange => GO TO yearFirst];
month ← CollectMonth[f ! OutOfRange => GO TO dayFirst];
day ← CollectDay[s, month];
EXITS
yearFirst => {
year ← CollectYear[f];
month ← CollectMonth[s];
day ← CollectDay[t, month]};
dayFirst => {
month ← CollectMonth[s];
day ← CollectDay[f, month]}};
ENDCASE};
ENDCASE;
next ← third + 1;
EXIT
ENDLOOP;
Be sure we weren't slipped a bogus leap year...
IF month = 2 AND day = 29 AND ~LeapYear[year] THEN Bogus[next];
};
ParseTime:
PROCEDURE [first: TokenIndex]
RETURNS [next: TokenIndex] = {
ENABLE OutOfRange => ERROR Unintelligible[errorPos];
CollectHour:
PROCEDURE [h: num Token]
RETURNS [Hour] =
{RETURN[CollectValue[h, FIRST[Hour], LAST[Hour]]]};
CollectMinute:
PROCEDURE [m: num Token]
RETURNS [Minute] =
INLINE
{RETURN[CollectValue[m, FIRST[Minute], LAST[Minute]]]};
CollectSecond:
PROCEDURE [s: num Token]
RETURNS [Second] =
INLINE
{RETURN[CollectValue[s, FIRST[Second], LAST[Second]]]};
AMorPM:
PROCEDURE [t: TokenIndex]
RETURNS [
BOOLEAN] = {
WITH m: input.tokens[t]
SELECT
FROM
alpha =>
IF m.length = 2
AND String.UpperCase[s[m.offset+1]] = 'M
THEN {
offset: Hour ← 0;
SELECT String.UpperCase[s[m.offset]]
FROM
'A => offset ← 0;
'P => offset ← 12;
ENDCASE => RETURN [FALSE];
IF ~(hour IN [1..12]) THEN Bogus[t];
hour ← (hour MOD 12) + offset;
RETURN [TRUE]};
ENDCASE;
RETURN [FALSE]
};
n: num Token;
next ← first + 1;
hour ← minute ← second ← 0;
WITH f: input.tokens[first]
SELECT
FROM
num => n ← f;
sep =>
IF f.char = sentinel THEN {notes ← noTime; RETURN}
ELSE Bogus[first];
ENDCASE => Bogus[first];
SELECT n.length
FROM
1, 2, 4, 6 => NULL;
ENDCASE => Bogus[first];
hour ← CollectHour[[n.offset, num[MIN[n.length, 2]]]];
IF n.length <= 2
THEN {
A separator must follow a 1- or 2- digit hour field.
IF next + 1 >= input.nTokens OR input.tokens[next].type ~= sep THEN Bogus[next];
Assert: input.tokens[next+1] is alpha or num.
next ← next + 1;
WITH s: input.tokens[next]
SELECT
FROM
num => n ← s;
ENDCASE => Bogus[next];
SELECT n.length
FROM
2, 4 => next ← next + 1;
ENDCASE => Bogus[next]}
ELSE {n.offset ← n.offset + 2; n.length ← n.length - 2};
minute ← CollectMinute[[n.offset, num[2]]];
Assert: 'next' indexes the token following the minutes.
IF n.length > 2
THEN
The minutes and seconds are concatenated.
second ← CollectSecond[[n.offset+2, num[2]]]
ELSE
Now look for optional seconds field. We assume it is present if there is a colon separator following the minutes.
WITH sep: input.tokens[next]
SELECT
FROM
sep =>
IF sep.char = ':
THEN
Assert: input.tokens[next+1] is alpha or num.
WITH s: input.tokens[next+1]
SELECT
FROM
num => {second ← CollectSecond[s]; next ← next + 2};
ENDCASE => Bogus[next+1];
ENDCASE;
Assert: 'next' indexes the token following the last time part (minutes or seconds).
WITH s: input.tokens[next]
SELECT
FROM
sep => IF s.char ~= sentinel AND AMorPM[next+1] THEN next ← next + 2;
alpha => IF AMorPM[next] THEN next ← next + 1;
ENDCASE;
};
ParseZone:
PROCEDURE [first: TokenIndex]
RETURNS [next: TokenIndex] = {
ENABLE OutOfRange => ERROR Unintelligible[errorPos];
BadZone: ERROR = CODE;
CollectSymbolic:
PROCEDURE [z: alpha Token] = {
char: CHARACTER ← s[z.offset];
SELECT z.length
FROM
1 => {
zones:
PACKED
ARRAY ZoneIndex
OF
CHARACTER = [
'M, 'L, 'K, 'I, 'H, 'G, 'F, 'E, 'D, 'C, 'B, 'A,
'Z,
'N, 'O, 'P, 'Q, 'R, 'S, 'T, 'U, 'V, 'W, 'X, 'Y];
FOR hour: ZoneIndex
IN ZoneIndex
DO
IF char = zones[hour]
THEN
{zoneAdjust ← hour*minutesPerHour; EXIT};
REPEAT
FINISHED => GO TO badZone;
ENDLOOP};
3 => {
naZones:
PACKED
ARRAY NAZones
OF
CHARACTER = [
'B, 'H, 'Y, 'P, 'M, 'C, 'E, 'A];
IF String.UpperCase[s[z.offset+2]] ~= 'T THEN GO TO badZone;
SELECT String.UpperCase[s[z.offset+1]]
FROM
'S, 'M => NULL; -- treat "mean" same as "standard"
'D => dst ← TRUE;
ENDCASE => GO TO badZone;
SELECT char
FROM
'G => IF dst THEN GO TO badZone;
'N =>
IF dst THEN GO TO badZone
ELSE zoneAdjust ← -(3*minutesPerHour+30);
ENDCASE =>
FOR hour: NAZones
IN NAZones
DO
IF char = naZones[hour]
THEN
{zoneAdjust ← hour*minutesPerHour; EXIT};
REPEAT
FINISHED => GO TO badZone;
ENDLOOP};
ENDCASE => GO TO badZone;
EXITS
badZone => ERROR BadZone;
};
CollectAbsolute:
PROCEDURE [z: num Token]
RETURNS [DeltaMinutes] = {
hour: Hour;
hourLength: NAT;
minute: Minute ← 0;
IF ~(z.length IN [1..4]) THEN ERROR OutOfRange[z.offset+z.length];
hourLength ← 2 - (z.length MOD 2);
hour ←
CollectValue[[z.offset, num[hourLength]], FIRST[Hour], LAST[Hour]];
IF z.length > 2
THEN
minute ←
CollectValue[[z.offset+hourLength, num[2]], FIRST[Minute], LAST[Minute]];
RETURN [hour*minutesPerHour+minute]
};
zoneAdjust ← 0; dst ← FALSE;
next ← first;
IF notes = noTime THEN RETURN;
WITH z: input.tokens[next]
SELECT
FROM
sep => {
zone: Token;
SELECT z.char
FROM
Ascii.SP, '+, '- => zone ← input.tokens[next + 1];
ENDCASE => GO TO noZone; -- includes sentinel
WITH zone: zone
SELECT
FROM
num =>
SELECT z.char
FROM
Ascii.SP => GO TO noZone;
'+ => zoneAdjust ← CollectAbsolute[zone];
'- => zoneAdjust ← -CollectAbsolute[zone];
ENDCASE; -- can't happen
alpha =>
IF z.char = '+ THEN Bogus[next]
ELSE
CollectSymbolic[zone ! BadZone =>
IF z.char = Ascii.SP THEN GO TO noZone ELSE Bogus[next+1]];
ENDCASE; -- can't happen
next ← next + 1};
alpha => CollectSymbolic[z ! BadZone => GO TO noZone];
ENDCASE => GO TO noZone;
EXITS
noZone => notes ← noZone;
};
LeapYear:
PROCEDURE [year: Year]
RETURNS [
BOOLEAN] = {
RETURN[year MOD 4 = 0 AND (year MOD 100 ~= 0 OR year MOD 400 = 0)]};
ConsumeSpace:
PROCEDURE [t: TokenIndex]
RETURNS [TokenIndex] = {
WITH s: input.tokens[t]
SELECT
FROM
sep => SELECT s.char FROM Ascii.SP, ', => RETURN[t+1]; ENDCASE;
ENDCASE;
RETURN[t]
};
AssembleGMT:
PROCEDURE
RETURNS [System.GreenwichMeanTime] = {
DetermineZoneCorrection:
PROCEDURE = {
We need to compute the local time parameters in effect at this zone on the date in question. Note that this is an imperfect simulation, since it assumes uniform dates and times for start and end of daylight savings time within the local zone.
FirstSundayAfter:
PROCEDURE [ed: EpochDays]
RETURNS [EpochDays] =
{RETURN [ed+7-((ed+baseDay) MOD 7)]};
ltp: System.LocalTimeParameters ←
[direction: west, zone: 8, zoneMinutes: 0, beginDST: , endDST: ];
ltp ← System.GetLocalTimeParameters[
! System.LocalTimeParametersUnknown => {
notes ← IF notes = noZone THEN zoneGuessed ELSE timeAndZoneGuessed;
CONTINUE}];
ltp.beginDST ← FirstSundayAfter[
epochToJan1 + --Apr23-- (112+(IF LeapYear[year] THEN 1 ELSE 0))] - epochToJan1;
ltp.endDST ← FirstSundayAfter[
epochToJan1 + --Oct24-- (296+(IF LeapYear[year] THEN 1 ELSE 0))] - epochToJan1;
zoneAdjust ← ltp.zone * minutesPerHour + ltp.zoneMinutes;
IF ltp.direction = west THEN zoneAdjust ← - zoneAdjust;
dst ←
SELECT jan1ToDate
FROM
IN (ltp.beginDST..ltp.endDST) => TRUE,
= ltp.beginDST => (hour >= 2),
= ltp.endDST => (hour < 2),
ENDCASE => FALSE;
};
YearBoundaries:
PROCEDURE [year: Year, span:
NAT]
RETURNS [
NAT] =
INLINE
The following expression computes the number of years y in [FIRST[Year]..year) for which y MOD span is 0.
{RETURN[(year+span-1)/span - (FIRST[Year]+span-1)/span]};
epochToJan1: EpochDays ← (year -
FIRST[Year])*365 +
YearBoundaries[year, 4] - YearBoundaries[year, 100] + YearBoundaries[year, 400];
jan1ToDate: YearDays ←
cumDays[month-1] + (day-1) - (IF ~LeapYear[year] AND month > 2 THEN 1 ELSE 0);
IF notes ~= normal THEN DetermineZoneCorrection[];
The following gross arithmetic is required to avoid overflows.
RETURN [System.GreenwichMeanTime[
((epochToJan1+jan1ToDate)+(System.gmtEpoch/secondsPerDay))*secondsPerDay +
((
INT[hour]-
(IF dst THEN 1 ELSE 0))*minutesPerHour+minute-zoneAdjust)*secondsPerMinute+
second]];
};
*** Main Body of Parse ***
nextToken: TokenIndex ← SUCC[FIRST[TokenIndex]];
notes ← normal;
BEGIN
ENABLE Unintelligible => Heap.systemZone.FREE[@input];
Tokenize[];
Assert: The input has been tokenized such that:
input.tokens[0].kind = input.tokens[input.nTokens-1].kind = sep[sentinel]]
for all i IN (0..input.nTokens-1):
if input.tokens[i].type = sep, then
input.tokens[i-1].type ~= sep and input.tokens[i+1].type ~= sep
nextToken ← ParseDate[nextToken ! Unintelligible => GO TO tryTimeFirst];
nextToken ← ConsumeSpace[nextToken];
nextToken ← ParseTime[nextToken];
nextToken ← ParseZone[nextToken];
EXITS
tryTimeFirst => {
nextToken ← ParseTime[nextToken];
nextToken ← ParseZone[nextToken];
nextToken ← ConsumeSpace[nextToken];
nextToken ← ParseDate[nextToken];
};
END;
dt ← AssembleGMT[];
length ← input.tokens[nextToken].offset;
Heap.systemZone.FREE[@input];
};