File [phylum]<3-LISP>course>PROBLEM-SETS>LEXICAL-ANALYSER.3-LISP!14


;;; Code stored as:  {turing}<3-lisp.problem-sets>lexical-analyser.3-lisp
;;;                  {phylum}<3-lisp>course>problem-sets>lexical-analyser.3-lisp


;;;            ======================== 
;;;            3-LISP Lexical Analyser:
;;;            ========================

;;; ===============================================================
;;;          PUBLIC ROUTINES
;;; ===============================================================

;;; (INIT-LEXAN)     Initializes all tables.
;;; ============
;;;     
;;; (LEXAN-OBJECT string)  A type of object that embodies the tokenization of a 
;;; =====================  given string.  Supports the following messages:
;;;     
;;;    (CURRENT-TOKEN lexan-object)  Returns the token currently being scanned (i.e.,.
;;;    ============================  the one made current by the last use of GET-TOKEN)
;;;
;;;    (GET-TOKEN lexan-object)      Also returns the current token being scanned but 
;;;    ========================      moves past the token in the input.  This is the
;;;                                  normal method to be used in accessing tokens.
;;;
;;;    (NEXT-TOKEN lexan-object)     Returns the 'look-ahead' token, the token just
;;;    =========================     about to be scanned.  Useful for dispatching without
;;;                                  actually reading the token.
;;; 
;;; (LEXICAL-ANALYSIS string)        Designates the sequence of tokens into which
;;; =========================        string is lexically analyzed.  For example
;;;                                  (lexical-analysis "a (b)") => ["a" "(" "b" ")"].
;;; 
;;; (IDENTIFIER-EXPRESSION token)  True just in case token is a sequence of alphanumeric
;;; =============================  characters, but is not a numeral-expression.
;;; 
;;; (NUMERAL-EXPRESSION token)     True just in case token is a sequence of digits, 
;;; ==========================     possibly including a leading "+" or "-".


(define INIT-LEXAN
   (lambda []
      (set *special-chars* [ #( #) #[ #] #$ #' #, #. #↑ #\ #{ #} ])
      (set *white-space-chars* [ #  cr ])
      (set *delimiters* (append *white-space-chars* *special-chars*))
      (set *digits* [ #0 #1 #2 #3 #4 #5 #6 #7 #8 #9 ])
      (set *comment-begin* #;)
      (set *comment-end* cr)
      "ok"))

(define LEXAN-OBJECT
   (let [[create-lexan-object
            (object [string]
                    [[current-tok ""]
                     [next-tok ""]]
               [GET-TOKEN
                  (lambda []
                     (let [[token-info (global-get-token string)]]
                        (set string (second token-info))
                        (set current-tok next-tok)
                        (set next-tok (first token-info))
                        current-tok))]
               [CURRENT-TOKEN (lambda [] current-tok)]
               [NEXT-TOKEN (lambda [] next-tok)])]]
       (lambda [string]
          (let [[lex-obj (create-lexan-object string)]]
              (get-token lex-obj)                        ; initialize
              lex-obj))))

;;; Not dealt with by this code:
;;; 
;;;  -- strings
;;;  -- charats
;;;  -- character quotation (%)

;;; LEXICAL-ANALYSIS:
;;; =========

(define LEXICAL-ANALYSIS
   (lambda [string]
      (let [[a (lexan-object string)]]
         (letrec [[helper (lambda []
                              (let [[tok (get-token a)]]
                                  (if (= tok 'eot)
                                      []
                                      (cons tok (helper)))))]]
            (helper)))))


;;; ===============================================================
;;;          INTERNAL ROUTINES
;;; ===============================================================

;;; GLOBAL-GET-TOKEN  takes a STRING and returns a rail of:
;;; ================  a) the prefix of the string corresponding to the
;;;                      first token in the string
;;;                   b) the rest of the string after the prefix
;;; In case there are no tokens left in the string, returns the
;;; handle 'eot (end of tokens).
;;; 
;;; Used by GET-TOKEN (within INIT-LEXAN)

(define GLOBAL-GET-TOKEN   
   (lambda [string]
      (add-to-token "" (strip-leading-whitespace string))))

;;; ADD-TO-TOKEN  auxiliary function for GLOBAL-GET-TOKEN which 
;;; ============  adds characters to a token found so far (SO-FAR)
;;;               taking characters from a string STRING.
;;; 
;;; Uses information from global-variables:   *special-chars*
;;; defined by INIT-LEXAN:                    *delimiters*

(define ADD-TO-TOKEN
   (lambda [so-far string]
      (cond [(string-null string)
             [(if (string-null so-far) 'eot so-far) string]]
            [(and (string-null so-far)
                  (member (string-first string) *special-chars*))
             [(substring 1 1 string) (string-rest string)]]
            [(member (string-first string) *delimiters*)
             [so-far string]]
            [$true
             (add-to-token (string-append so-far
                                          (substring 1 1 string))
                           (string-rest string))])))

;;; STRIP-LEADING-WHITESPACE  removes all leading white-space and comments
;;; ========================  from a string.  White-space is defined as
;;;                           any character in the global variable
;;;                           *white-space-chars* (defined by INIT-LEXAN).
;;; Comments are any string of characters starting with the character
;;; bound to *comment-begin*, up through the character *comment-end*

(define STRIP-LEADING-WHITESPACE
   (lambda [string]
      (cond [(string-null string) string]
            [(member (string-first string) *white-space-chars*)
             (strip-leading-whitespace (string-rest string))]
            [(= (string-first string) *comment-begin*)
             (strip-leading-whitespace (strip-comment string))]
            [$true
             string])))

;;; STRIP-COMMENT  Removes one full comment prefix from a string.  A comment
;;; =============  is defined as any string of characters starting with the
;;;                globally bound *comment-begin* character, up through and
;;;                including *comment-end* (both defined by INIT-LEXAN).

(define STRIP-COMMENT
   (lambda [string]
      (cond [(string-null string) string]
            [(= (string-first string) *comment-end*)
             (string-rest string)]
            [$true
             (strip-comment (string-rest string))])))

;;; ===============================================================
;;;          SPECIAL TOKEN IDENTIFICATION PREDICATES
;;; ===============================================================

(define IDENTIFIER-EXPRESSION
   (lambda [token]
      (and (every-character
               (lambda [ch] (not (member ch *special-chars*)))
               token)
           (not (every-character
                   (lambda [ch] (member ch *digits*))
                   token)))))

(define NUMERAL-EXPRESSION
   (lambda [token]
      (every-character (lambda [ch] (member ch *digits*))
                       (if (member (nth-char 1 token) [#+ #-])
                           (string-rest token)
                           token))))

;;; ===============================================================
;;;          STRING MANIPULATION ROUTINES
;;; ===============================================================

(define STRING-NULL
   (lambda [string] (= string "")))

(define STRING-FIRST
   (lambda [string] (nth-char 1 string)))

(define STRING-REST
   (lambda [string] (substring 2 (string-length string) string)))

;;; (EVERY-CHARACTER predicate string)   True just in case predicate
;;; ==================================   is true of each character in
;;;                                      string.

(define EVERY-CHARACTER
   (lambda [predicate string]
      (cond [(string-null string) $true]
            [(predicate (string-first string))
             (every-character predicate (string-rest string))]
            [$true $false])))


(init-lexan)