File [phylum]<JLisp>CHARDEFS.MESA!1

-- CharDefs 

-- Revised by Tse: 	  17-Mar-83 14:13:51 
-- Owner: GCurry

--Public definitions for simple (16-bit) characters.

--A simple character is one which has a 16-bit representation within the Star framework.

--Character Set - The 16-bit character space is partitioned into uniform-sized blocks of 400B, called character sets.  The character set space is partitioned into groups corresponding to anticipated usage.  For example, character sets 0 through 57B, representing the first 60B x 400B characters, are dedicated to non-Kanji (i.e., non-ideographic) characters; character sets 60B through 117B are devoted to Japanese Kanji, and 120B through 337B are reserved for Chinese Kanji.  Each of these major partitions is further partitioned by sub-function (Discussion of these is deferred until a final specification becomes available - Joe Becker is considering these assignments).  Character sets 340B through 377B are not currently used.  

DIRECTORY
  CharADefs USING [Codes0, Codes1, Codes2, Codes3, Codes4],
  CharBDefs USING [Codes20, Codes21, Codes40],
  StandardDefs USING [Cv];

CharDefs: DEFINITIONS =
  BEGIN OPEN StandardDefs;

  --====================
  -- ON THE REPRESENTATION OF CHARACTERS :  
  --Some preliminary experiments have shown that the code generated to deal with text may be rather sensitive to the (machine) representation of characters.  For this reason we would like to defer the question of the "official" character representation, while admitting the possibility of a two word representation.  If you need to know the SIZE of the type "Char" (as opposed to knowing only that it has at least two fields called chset and code), please tell Gael Curry.  
  --====================
  -- Class: a coarse classification of a character.
  -- NOTE(review): no procedure in this interface takes or returns a Class; presumably it is consumed by clients or implementations elsewhere.
  Class: TYPE = {
    ClassBreak, ClassRoman, ClassHiragana, ClassKatakana, ClassKanji};

  -- Char: a simple character packed into a single 16-bit word.
  -- Bits 0..7 of word 0 hold the character set (Chset); bits 8..15 hold the code within that set (Code).
  Char: TYPE = MACHINE DEPENDENT RECORD [  --Character 
    chset(0:0..7): Chset,  --Public field
    code(0:8..15): Code];  --Public field
    
  --WARNING !!!!!!!
  -- Chset: a character set number.  Only 0..337B are representable; 340B and above are excluded by the subrange.
  Chset: TYPE = CARDINAL [0..340B);  --!!!!!! FILED IN DU AS PART OF ASPECT TYPE (see WSFontDefs).  Check with appropriate group before changing
  --WARNING !!!!!!!
  
  -- Sub-ranges of Chset: 0B..57B are the non-kanji sets, 60B..337B are the kanji sets.
  Chsetsnonkanji: TYPE = Chset [0B..60B);
  Chsetskanji: TYPE = Chset [60B..340B);
  -- Named character sets.  Note that Greek, Russian and Cyrillic are three names for the same set (1).
  chsetRoman: Chset = 0;
  chsetGreek: Chset = 1;
  chsetRussian: Chset = 1;
  chsetCyrillic: Chset = 1;
  chsetKana: Chset = 2;
  chsetHebrew: Chset = 3;
  chsetBopomofo: Chset = 4;
  chsetSymbol: Chset = 20B;
  -- chsetJSymbol: Chset = 21B; ++ This is on a later page.
  chsetRendering: Chset = 40B;
  -- chsetTiles is the last representable character set, i.e. 337B given the current bounds of Chset.
  chsetTiles: CharDefs.Chset = LAST[CharDefs.Chset];


  -- Code: the position of a character within its character set (0..377B).
  Code: TYPE = CARDINAL [0..400B);

  --====================
  -- ACCENTED CHARACTERS :  
  --This is included here for future compatibility.  We hope to represent accented characters in the workstation the same as on the net.  See also WSCharDefs.AqPlainCharacter.  
  --====================

  -- AccentedChar: a two-word record.
  -- Word 0 is a packed array of BOOLEAN flags indexed by the Codes0 values unused300B..accentHacek; word 1 is the base character.
  -- NOTE(review): presumably a TRUE flag means the corresponding accent applies to char; this interface does not state that explicitly.
  AccentedChar: TYPE = MACHINE DEPENDENT RECORD [  --Accented character 
    accents(0:0..15): PACKED ARRAY Codes0 [unused300B..accentHacek] OF BOOLEAN,
    char(1:0..15): Char];

  --FUNCTIONS ON CHARACTERS

  --====================
  -- ON THE "MEANINGS" OF CHARACTERS :
  --Different character designations may mean the same thing.  There are at least two ways to deal with this possibility - (1) invent a space of meanings and map designations to it, and (2) choose a canonical designation for each meaning.  We will try to use the second approach.  To be precise, the space of designations is partitioned into equivalence classes based on equality of meaning, and a representative from each class is chosen as the canonical designation.
  --====================

  Meaning: PRIVATE PROCEDURE [char: Char] RETURNS [Char] = INLINE
    -- Maps char to the canonical designation of its meaning class.
    -- At present every character is its own canonical designation, so this is the identity.
    -- (Aliasing between designations used to be handled here.)
    BEGIN
    canonical: Char = char;
    RETURN[canonical];
    END;

  MeaningEqual: PROCEDURE [char1, char2: Char] RETURNS [BOOLEAN] = INLINE
    -- TRUE iff char1 and char2 have the same meaning, i.e. the same canonical designation.
    BEGIN
    canon1: Char = Meaning[char1];
    canon2: Char = Meaning[char2];
    RETURN[canon1 = canon2];  -- faster formulations exist
    END;

  TextMeaningEqual: PROCEDURE [char1, char2: Char] RETURNS [BOOLEAN] = INLINE
    -- Case-insensitive meaning comparison, as required by "Match on Text" in the Star Functional Specification.
    -- Both characters are folded to upper case and then compared with MeaningEqual.
    BEGIN
    folded1: Char = UpperCase[char1];
    folded2: Char = UpperCase[char2];
    RETURN[MeaningEqual[folded1, folded2]];  -- faster formulations exist
    END;

  --====================
  -- ON THE "ORDERINGS" OF CHARACTERS :
  -- Characters also are comparable under (at least) one notion of order.  Note, however, that this is not the ordering that Mesa provides on the representation of "Char".   
  --====================

  -- Ordering: the result of comparing two characters with Order.  The numeric encodings are fixed by the MACHINE DEPENDENT declaration.
  -- NOTE(review): this interface does not say when "none" is returned.
  Ordering: TYPE = MACHINE DEPENDENT{none(0), less(1), equal(2), greater(3)};

  Order: PROCEDURE [char1, char2: Char] RETURNS [Ordering];
  --Order determines the ordering between characters for the purpose of sorting.  Note that 
  --"Order[ char1, char2 ] = equal" isn't necessarily equivalent to 
  --"MeaningEqual [ char1, char2 ]", or to
  --"TextMeaningEqual [ char1, char2 ]".

  --====================
  -- MISCELLANEOUS FUNCTIONS ON CHARACTERS :
  -- Some of the functions below are tied to the Star Functional Specification more tightly than those above.  In order to protect themselves against changes to the FS, clients should use functions on simple characters provided by this interface whenever possible (Please tell us about oversights).   
  --====================

  UpperCase: PROCEDURE [char: Char] RETURNS [Char] = INLINE
    -- Returns the upper-case counterpart of char, as Star defines it.
    -- What "upper case" means for an arbitrary character is not settled; whatever it comes to mean for Star, this operation implements it.
    -- Characters in sets with no case mapping here are returned unchanged.
    BEGIN
    SELECT char.chset FROM
      chsetRendering =>
        -- Final sigma is special: its capital is upperSigma in chsetGreek, so the character set changes too.
        IF LOOPHOLE[char.code, Codes40] = Codes40[sigmaFinal] THEN
          char ← [chsetGreek, LOOPHOLE[Codes1[upperSigma], Code]]
        ELSE char.code ← UpperCase40[char.code];
      chsetRoman => char.code ← UpperCase0[char.code];
      chsetGreek, chsetCyrillic => char.code ← UpperCase1[char.code];
      -- Add arms as necessary for other alphabets.
      ENDCASE => NULL;  -- no case mapping for this character set
    RETURN[char];
    END;

  LowerCase: PROCEDURE [char: Char] RETURNS [Char] = INLINE
    -- Returns the lower-case counterpart of char, as Star defines it.
    -- What "lower case" means for an arbitrary character is not settled; whatever it comes to mean for Star, this operation implements it.
    -- Only the code is ever changed; the character set of the result is that of char.
    BEGIN
    SELECT char.chset FROM
      chsetRoman => char.code ← LowerCase0[char.code];
      chsetGreek, chsetCyrillic => char.code ← LowerCase1[char.code];
      chsetRendering => char.code ← LowerCase40[char.code];
      -- Add arms as necessary for other alphabets.
      ENDCASE => NULL;  -- no case mapping for this character set
    RETURN[char];
    END;

  Break: PROCEDURE [char: Char] RETURNS [BOOLEAN];
  --Determines whether char has the "break" property.
  --Declared without a body in this DEFINITIONS module, so the test itself is implemented elsewhere.

  Word: PROCEDURE [char: Char] RETURNS [BOOLEAN] = INLINE
    -- TRUE iff char has the "word" property, which is defined as not having the "break" property.
    BEGIN
    isBreak: BOOLEAN = Break[char];
    RETURN[NOT isBreak];
    END;

  IsAlphanumeric: PROCEDURE [char: Char] RETURNS[BOOLEAN];
  -- Determines whether char is an alphanumeric character in its charset.
  -- I.e. IsAlphanumeric will return TRUE if char is a digit, 0-9, or if the charset in char is a language charset and the code represents a character in the alphabet of that language.  Otherwise it will return FALSE.
  -- Declared without a body in this DEFINITIONS module, so it is implemented elsewhere.

  -- Char Sets:
  chsetJSymbol: Chset = 21B;  -- General & Japanese Symbols.
  -- Kanji character sets.  chsetKanjiN is FIRST[Chsetskanji] + N, i.e. 60B + N (the offsets below are decimal).
  -- JIS-1: chsetKanji0..chsetKanji11, i.e. character sets 60B..73B.
  chsetKanji0: Chsetskanji = FIRST[Chsetskanji];
  chsetKanji1: Chsetskanji = chsetKanji0 + 1;
  chsetKanji2: Chsetskanji = chsetKanji0 + 2;
  chsetKanji3: Chsetskanji = chsetKanji0 + 3;
  chsetKanji4: Chsetskanji = chsetKanji0 + 4;
  chsetKanji5: Chsetskanji = chsetKanji0 + 5;
  chsetKanji6: Chsetskanji = chsetKanji0 + 6;
  chsetKanji7: Chsetskanji = chsetKanji0 + 7;
  chsetKanji8: Chsetskanji = chsetKanji0 + 8;
  chsetKanji9: Chsetskanji = chsetKanji0 + 9;
  chsetKanji10: Chsetskanji = chsetKanji0 + 10;
  chsetKanji11: Chsetskanji = chsetKanji0 + 11;
  -- JIS-2: chsetKanji12..chsetKanji25, i.e. character sets 74B..111B.
  chsetKanji12: Chsetskanji = chsetKanji0 + 12;
  chsetKanji13: Chsetskanji = chsetKanji0 + 13;
  chsetKanji14: Chsetskanji = chsetKanji0 + 14;
  chsetKanji15: Chsetskanji = chsetKanji0 + 15;
  chsetKanji16: Chsetskanji = chsetKanji0 + 16;
  chsetKanji17: Chsetskanji = chsetKanji0 + 17;
  chsetKanji18: Chsetskanji = chsetKanji0 + 18;
  chsetKanji19: Chsetskanji = chsetKanji0 + 19;
  chsetKanji20: Chsetskanji = chsetKanji0 + 20;
  chsetKanji21: Chsetskanji = chsetKanji0 + 21;
  chsetKanji22: Chsetskanji = chsetKanji0 + 22;
  chsetKanji23: Chsetskanji = chsetKanji0 + 23;
  chsetKanji24: Chsetskanji = chsetKanji0 + 24;
  chsetKanji25: Chsetskanji = chsetKanji0 + 25;


  -- TYPES AND CONSTANTS FOR JAPANESE LOOKUP SYSTEM:

  -- Types:
  CharLookup: TYPE = Char;  -- Lookup char.
  -- Constants:
  -- The kanji lookup character is the space of character set 0, built with the Roman constructor.
  charKanjiLookup: Char = Roman[space];  -- Kanji lookup char.


  --CHARACTER SET 0  

  -- Codes which are not to be used in a particular character set are named "unused#B"; codes which are available, but unassigned are named "available#B"  The actual code assignments may change up to a point, so clients should reference characters symbolically whenever possible. 

  -- Codes0 is re-exported from CharADefs.  The constants below are selected Codes0 values converted (by LOOPHOLE) to the numeric type Code.
  Codes0: TYPE = CharADefs.Codes0;
  romanFL: Code = LOOPHOLE[Codes0[lowerA]];  -- First lowercase Roman char
  romanLL: Code = LOOPHOLE[Codes0[lowerZ]];  -- Last lowercase Roman char
  romanFU: Code = LOOPHOLE[Codes0[upperA]];  -- First uppercase Roman char
  romanLU: Code = LOOPHOLE[Codes0[upperZ]];  -- Last uppercase Roman char
  digitF: Code = LOOPHOLE[Codes0[digit0]];  -- First digit
  digitL: Code = LOOPHOLE[Codes0[digit9]];  -- Last digit
  space: Code = LOOPHOLE[Codes0[space]];  -- space

  newLine: Code = LOOPHOLE[Codes0[newLine]];
  newPar: Code = LOOPHOLE[Codes0[newParagraph]];  -- new paragraph


  quote: Code = LOOPHOLE[Codes0[rightQuote]];  -- note: "quote" is the RIGHT quote
  doubleQuote: Code = LOOPHOLE[Codes0[doubleQuote]];
  rightParen: Code = LOOPHOLE[Codes0[rightParenthesis]];
  leftParen: Code = LOOPHOLE[Codes0[leftParenthesis]];
  period: Code = LOOPHOLE[Codes0[period]];
  exclamMark: Code = LOOPHOLE[Codes0[exclamationMark]];
  questionMark: Code = LOOPHOLE[Codes0[questionMark]];

  --CHARACTER SET 1: Greek and Cyrillic alphabets

  Codes1: TYPE = CharADefs.Codes1;  -- re-exported from CharADefs

  --CHARACTER SET 2: Japanese "Hiragana & Katakana"  

  Codes2: TYPE = CharADefs.Codes2;  -- re-exported from CharADefs

  --CHARACTER SET 3: Hebrew

  Codes3: TYPE = CharADefs.Codes3;  -- re-exported from CharADefs

  -- Other Type:
  -- Code ranges of character set 2, bounded by the Code constants declared further down.
  RgHiragana: TYPE = [hirSmallA..hirN];
  RgKatakana: TYPE = [katSmallA..katSmallKe];

  -- Constants:
  -- chars:
  charChouon: Char = Kana[chouon];  -- chouon(long vowel)
  -- codes:
  hiraganaF: Code = LOOPHOLE[Codes2[hirSmallA]];  -- First Hiragana
  hiraganaL: Code = LOOPHOLE[Codes2[hirN]];  -- Last Hiragana
  katakanaF: Code = LOOPHOLE[Codes2[katSmallA]];  -- First Katakana
  katakanaL: Code = LOOPHOLE[Codes2[katSmallKe]];  -- Last Katakana

  -- The two "Kurikaeshi" constants have the same values as hirKurikaesi / hirKurikaesiDakuon below; only the spelling differs.
  hirKurikaeshi: Code = LOOPHOLE[Codes2[hirKurikaesi]];  -- Hiragana repeat
  hirKurikaeshiDakuon: Code = LOOPHOLE[Codes2[hirKurikaesiDakuon]];  -- Hiragana dakuon repeat
  katChouon: Code = charChouon.code;  -- Katakana long vowel

  hirSmallA: Code = LOOPHOLE[Codes2[hirSmallA]];
  hirN: Code = LOOPHOLE[Codes2[hirN]];
  hirKurikaesi: Code = LOOPHOLE[Codes2[hirKurikaesi]];  -- hiragana repeat
  hirKurikaesiDakuon: Code = LOOPHOLE[Codes2[hirKurikaesiDakuon]];  -- hiragana dakuon repeat
  katSmallA: Code = LOOPHOLE[Codes2[katSmallA]];
  katN: Code = LOOPHOLE[Codes2[katN]];
  katSmallKe: Code = LOOPHOLE[Codes2[katSmallKe]];
  katKurikaesi: Code = LOOPHOLE[Codes2[katKurikaesi]];  -- katakana repeat
  katKurikaesiDakuon: Code = LOOPHOLE[Codes2[katKurikaesiDakuon]];  -- katakana dakuon repeat
  kanjiKuriKaeshi: Code = 0B;  -- Kanji repeat
  touten: Code = LOOPHOLE[Codes2[touten]];  -- Japanese comma
  kuten: Code = LOOPHOLE[Codes2[kuten]];  -- Japanese period
  hajimeKagiKakko: Code = LOOPHOLE[Codes2[hajimeKagiKakko]];  -- Japanese open quote
  owariKagiKakko: Code = LOOPHOLE[Codes2[owariKagiKakko]];  -- Japanese closed quote
  hajimeNijuKagiKakko: Code = LOOPHOLE[Codes2[hajimeNijuKagiKakko]];  -- Japanese open double quote
  owariNijuKagiKakko: Code = LOOPHOLE[Codes2[owariNijuKagiKakko]];  -- Japanese closed double quote
  -- offsets:
  -- offsetKatHir is the distance from a hiragana code to its katakana counterpart; the Hiragana and Katakana procedures subtract and add it respectively.
  offsetHiragana: Cv = hirSmallA;
  offsetKatakana: Cv = katSmallA;
  offsetKatHir: Cv = offsetKatakana - offsetHiragana;


  --CHARACTER SET 4: Chinese Bopomofo Phonic Characters

  Codes4: TYPE = CharADefs.Codes4;  -- re-exported from CharADefs


  --CHARACTER SET 20B: General and Technical symbols

  Codes20: TYPE = CharBDefs.Codes20;  -- re-exported from CharBDefs

  --CHARACTER SET 21B: General & Japanese symbols  

  Codes21: TYPE = CharBDefs.Codes21;  -- re-exported from CharBDefs


  --CHARACTER SET 40B: Ligatures etc.  

  Codes40: TYPE = CharBDefs.Codes40;  -- re-exported from CharBDefs


  --CHARACTER SET 60B through 73B: Japanese Kanji JIS-1  
  -- Types:
  RgChsetKanjiJIS1: TYPE = [chsetKanji0..chsetKanji11];  -- JIS-1 Kanji chsets

  -- Constants:
  -- chars:
  -- Note: charKanjiKurikaesi and charFirstKanjiJIS1 are the same character (chsetKanji0, code 0B).
  charKanjiKurikaesi: Char = [chsetKanji0, 0B];
  charFirstKanjiJIS1: Char = [chsetKanji0, 0B];
  charLastKanjiJIS1: Char = [chsetKanji11, 241B];
  -- chars: numbers
  -- NOTE(review): the names suggest kanji numerals 0..9 followed by ten, hundred, thousand, ten thousand and larger units; the code assignments themselves come from the character set definition and cannot be checked from this file.
  charKansuji0: Char = [chsetKanji0, 1B];
  charKansuji1: Char = [chsetKanji0, 3B];
  charKansuji2: Char = [chsetKanji0, 4B];
  charKansuji3: Char = [chsetKanji0, 5B];
  charKansuji4: Char = [chsetKanji0, 26B];
  charKansuji5: Char = [chsetKanji0, 13B];
  charKansuji6: Char = [chsetKanji0, 50B];
  charKansuji7: Char = [chsetKanji0, 70B];
  charKansuji8: Char = [chsetKanji0, 47B];
  charKansuji9: Char = [chsetKanji0, 104B];
  charKansujiJuu: Char = [chsetKanji0, 10B];
  charKansujiHyaku: Char = [chsetKanji0, 146B];
  charKansujiSen: Char = [chsetKanji0, 55B];
  charKansujiMan: Char = [chsetKanji0, 17B];
  charKansujiOku: Char = [chsetKanji1, 362B];
  charKansujiCho: Char = [chsetKanji5, 275B];

  -- CHARACTER SET 74B through 111B: Japanese Kanji JIS-2  

  -- Types:
  RgChsetKanjiJIS2: TYPE = [chsetKanji12..chsetKanji25];  -- JIS-2 Kanji chsets

  -- Constants:
  -- chars:
  charFirstKanjiJIS2: Char = [chsetKanji12, 0B];
  charLastKanjiJIS2: Char = [chsetKanji25, 104B];

    UpperCase0: PROCEDURE [code: Code] RETURNS [Code] = INLINE
    -- Upper-cases a code that the caller asserts belongs to character set 0.
    -- Codes outside the lower-case ranges below are returned unchanged.
    BEGIN
    romanCode: Codes0 = LOOPHOLE[code];
    SELECT romanCode FROM
      IN [lowerA..lowerZ] => code ← code - 40B;  -- basic letters: the capital is 40B below
      IN [lowerAEdipthong..lowerDstroke],
      IN [lowerIJligature..lowerOEligature],
      IN [lowerThorn..lowerEng] => code ← code - 10B;  -- special letters: the capital is 10B below
      ENDCASE => NULL;
    RETURN[code];
    END;

  UpperCase1: PROCEDURE [code: Code] RETURNS [Code] = INLINE
    -- Upper-cases a code that the caller asserts belongs to character set 1 (Greek and Cyrillic).
    -- Codes outside the lower-case ranges below are returned unchanged.
    BEGIN
    set1Code: Codes1 = LOOPHOLE[code];
    SELECT set1Code FROM
      IN [lowerAlpha..lowerOmega] => code ← code - 40B;  -- lower case Greek
      IN [lowerA..lowerYa] => code ← code - 60B;  -- lower case Cyrillic
      ENDCASE => NULL;
    RETURN[code];
    END;

  UpperCase40: PROCEDURE [code: Code] RETURNS [Code] = INLINE
    -- Upper-cases a code that the caller asserts belongs to character set 40B.
    -- Codes outside upperAring..lowerCcedilla, and codes that are already upper case, are returned unchanged.
    BEGIN
    renderingCode: Codes40 = LOOPHOLE[code];
    SELECT renderingCode FROM
      IN [upperAring..lowerCcedilla] =>
        -- In this range even codes are lower case and the capital is the code just below.
        IF code MOD 2 = 0 THEN code ← code - 1;
      ENDCASE => NULL;
    RETURN[code];
    END;

  LowerCase0: PROCEDURE [code: Code] RETURNS [Code] = INLINE
    -- Lower-cases a code that the caller asserts belongs to character set 0 (Roman).
    -- Codes outside the upper-case ranges below are returned unchanged.
    BEGIN
    romanCode: Codes0 = LOOPHOLE[code];
    SELECT romanCode FROM
      IN [upperA..upperZ] => code ← code + 40B;  -- basic letters: the small letter is 40B above
      IN [upperAEdipthong..upperDstroke],
      IN [upperIJligature..upperOEligature],
      IN [upperThorn..upperEng] => code ← code + 10B;  -- special letters: the small letter is 10B above
      ENDCASE => NULL;
    RETURN[code];
    END;

  LowerCase1: PROCEDURE [code: Code] RETURNS [Code] = INLINE
    -- Lower-cases a code that the caller asserts belongs to character set 1 (Greek and Cyrillic).
    -- Codes outside the upper-case ranges below are returned unchanged.
    BEGIN
    set1Code: Codes1 = LOOPHOLE[code];
    SELECT set1Code FROM
      IN [upperAlpha..upperOmega] => code ← code + 40B;  -- upper case Greek
      IN [upperA..upperYa] => code ← code + 60B;  -- upper case Cyrillic
      ENDCASE => NULL;
    RETURN[code];
    END;

  LowerCase40: PROCEDURE [code: Code] RETURNS [Code] = INLINE
    -- Lower-cases a code that the caller asserts belongs to character set 40B.
    -- Codes outside upperAring..lowerCcedilla, and codes that are already lower case, are returned unchanged.
    BEGIN
    renderingCode: Codes40 = LOOPHOLE[code];
    SELECT renderingCode FROM
      IN [upperAring..lowerCcedilla] =>
        -- In this range odd codes are upper case and the small letter is the code just above.
        IF code MOD 2 = 1 THEN code ← code + 1;
      ENDCASE => NULL;
    RETURN[code];
    END;

  Number0: PROCEDURE [code: Code] RETURNS [CARDINAL] = INLINE
    -- Returns the numeric value of a code that the caller asserts is a character set 0 digit.
    -- The value is the distance of code from digit0; no check is made that code really is a digit.
    BEGIN
    zeroCode: CARDINAL = LOOPHOLE[Codes0[digit0], CARDINAL];
    RETURN[code - zeroCode];
    END;

  Roman: PROCEDURE [codes0: Codes0] RETURNS [Char] = INLINE
    -- Builds the Char in character set chsetRoman whose code is codes0.
    BEGIN
    RETURN[[chset: chsetRoman, code: LOOPHOLE[codes0, Code]]];
    END;


  Greek, Russian, Cyrillic: PROCEDURE [codes1: Codes1] RETURNS [Char] = INLINE
    -- Three names for one constructor: builds the Char in character set 1 whose code is codes1.
    -- chsetCyrillic is used for all three because chsetGreek, chsetRussian and chsetCyrillic are the same set.
    BEGIN
    RETURN[[chset: chsetCyrillic, code: LOOPHOLE[codes1, Code]]];
    END;


  Kana: PROCEDURE [codes2: Codes2] RETURNS [Char] = INLINE
    -- Builds the Char in character set chsetKana whose code is codes2.
    BEGIN
    RETURN[[chset: chsetKana, code: LOOPHOLE[codes2, Code]]];
    END;

  Hebrew: PROCEDURE [codes3: Codes3] RETURNS [Char] = INLINE
    -- Builds the Char in character set chsetHebrew whose code is codes3.
    BEGIN
    RETURN[[chset: chsetHebrew, code: LOOPHOLE[codes3, Code]]];
    END;

  Bopomofo: PROCEDURE [codes4: Codes4] RETURNS [Char] = INLINE
    -- Builds the Char in character set chsetBopomofo whose code is codes4.
    BEGIN
    RETURN[[chset: chsetBopomofo, code: LOOPHOLE[codes4, Code]]];
    END;


    -- Get Hiragana: Convert katakana to hiragana. Don't convert if code is not katakana code.
  Hiragana: PROCEDURE [code: Code] RETURNS [Code] = INLINE
    -- Maps a katakana code to the corresponding hiragana code by subtracting offsetKatHir.
    -- Only katSmallA..katN and the katakana repeat marks are converted; any other code is returned unchanged.
    BEGIN
    isKatakanaLetter: BOOLEAN = code IN [katSmallA..katN];
    isKatakanaRepeat: BOOLEAN = code IN [katKurikaesi..katKurikaesiDakuon];
    RETURN[
      IF isKatakanaLetter OR isKatakanaRepeat THEN code - offsetKatHir ELSE code];
    END;


    -- Get Katakana: Convert hiragana to katakana. Don't convert if code is not hiragana code.
  Katakana: PROCEDURE [code: Code] RETURNS [Code] = INLINE
    -- Maps a hiragana code to the corresponding katakana code by adding offsetKatHir.
    -- Only codes in RgHiragana and the hiragana repeat marks are converted; any other code is returned unchanged.
    BEGIN
    isHiraganaLetter: BOOLEAN = code IN RgHiragana;
    isHiraganaRepeat: BOOLEAN = code IN [hirKurikaesi..hirKurikaesiDakuon];
    RETURN[
      IF isHiraganaLetter OR isHiraganaRepeat THEN code + offsetKatHir ELSE code];
    END;


  Symbol: PROCEDURE [codes20: Codes20] RETURNS [Char] = INLINE
    -- Builds the Char in character set chsetSymbol (20B) whose code is codes20.
    BEGIN
    RETURN[[chset: chsetSymbol, code: LOOPHOLE[codes20, Code]]];
    END;


  JSymbol: PROCEDURE [codes21: Codes21] RETURNS [Char] = INLINE
    -- Builds the Char in character set chsetJSymbol (21B) whose code is codes21.
    BEGIN
    RETURN[[chset: chsetJSymbol, code: LOOPHOLE[codes21, Code]]];
    END;


  Rendering: PROCEDURE [codes40: Codes40] RETURNS [Char] = INLINE
    -- Builds the Char in character set chsetRendering (40B) whose code is codes40.
    BEGIN
    RETURN[[chset: chsetRendering, code: LOOPHOLE[codes40, Code]]];
    END;

  END. -- of CharDefs

LOG
September 8, 1980  7:53 AM	GCurry	created
December 27, 1980  11:02 AM	GCurry	renamed, consolidated various files
January 7, 1981  5:01 PM	S. Finkel	Fixed MeaningEqual
January 10, 1981  2:57 PM	GCurry	Make Chset PUBLIC
January 25, 1981  7:21 PM	GCurry	Reflect new Cosmopolitan ranges, new character set 0 definition.
February 9, 1981  9:11 AM	GCurry	Update to OIS Character Set.
March 9, 1981  10:47 AM	Finkel	Add JStar char definitions.
April 4, 1981  3:39 PM	Mader	Add Character set 1, and 20 definitions, changed character set 21 to JSymbol.
June 16, 1981 12:36 PM	Gittins	Add accented chars (Character set 40B)
June 19, 1981  11:55 AM	Gittins	Add chsetRendering  & Upper/Lowercase40
				  Add names>340B to charset 0, andextend upper/lower case. 
July 2, 1981  11:18 AM	Buelow	Move Codes0, Codes1, Codes2 to CharADefs; Codes20, Codes21, Codes40 to CharBDefs for Star 19.1. 
July 31, 1981  10:50 AM	Morrison	Added chsetHebrew, Hebrew, Codes3. 
December 7, 1981  11:27 AM	Otto	Added chsetTile, formerly in CharForgotDefs
22-Jan-82 13:46:02 Tripp Added Class definition and several character constants.
 7-Feb-82 18:40:09	Laaser	Added IsAlphanumeric interface principally for Cusp in the international product.
11-Mar-82	K.Akada	- Fixed Proc Hiragana and Katakana .
16-Jun-82  8:48:25	Tripp	Extend character set range for Chinese kanji.
14-Jul-82 18:45:04	Finkel	Added warning comment about changing Chset TYPE
 8-Sep-82 21:34:12	Becker/Tripp	Added Character Set 4:  Chinese "Bopomofo" Phonetic Alphabet
17-Mar-83 14:15:56	Tse	 Return the original char if no uppercase/lowercase in UpperCase40 & LowerCase40.  Special case for lower case sigma char.