File [phylum]<3-LISP>course>UTILITIES>LEXICAL-ANALYSER.3-LISP!9


;;;            ======================== 
;;;            3-LISP Lexical Analyser:
;;;            ========================

;;; INIT-LEX  Initializes all tables.
;;; ========
;;;     
;;; SETUP-LEX    Sets up functions for retrieving tokens from a given string
;;; =========    STRING.  It makes available the following ancillary functions:
;;;     
;;; CURRENT-TOKEN  Returns the token currently being scanned.
;;; =============
;;;
;;; GET-TOKEN  Also returns the current token being scanned but 
;;; =========  moves past the token in the input.  This is the normal
;;;            function to be used in accessing tokens.
;;;
;;; NEXT-TOKEN  Returns the 'look-ahead' token, the token just about to
;;; ==========  be scanned.  Useful for dispatching without actually
;;;             reading the token.

(define INIT-LEX
   (lambda []
      (set *special-chars* [ #( #) #[ #] #$ #' #, #. #↑ #\ #{ #} ])
      (set *white-space-chars* [ #  cr ])
      (set *delimiters* (append *white-space-chars* *special-chars*))
      (set *digits* [ #0 #1 #2 #3 #4 #5 #6 #7 #8 #9 ])
      (set *comment-begin* #;)
      (set *comment-end* cr)
      "ok"))


(define SETUP-LEX
   (lambda [string]
      (let [[current-tok ""]
            [next-tok ""]]
         (define GET-TOKEN
            (lambda []
               (let [[token-info (global-get-token string)]]
                   (set string (second token-info))
                   (set current-tok next-tok)
                   (set next-tok (first token-info))
                   current-tok)))
         (define CURRENT-TOKEN
            (lambda [] current-tok))
         (define NEXT-TOKEN
            (lambda [] next-tok))  
         (get-token))))                 

;;; Still to be deal with:
;;; 
;;;  -- strings
;;;  -- charats
;;;  -- character quotation (%)

;;; GLOBAL-GET-TOKEN  takes a STRING and returns a rail of:
;;; ================  a) the prefix of the string corresponding to the
;;;                      first token in the string
;;;                   b) the rest of the string after the prefix
;;; In case there are no tokens left in the string, returns the
;;; handle 'eot (end of tokens).
;;; Used by GET-TOKEN (within INIT-LEX)

(define GLOBAL-GET-TOKEN   
   (lambda [string]
      (add-to-token "" (strip-leading-whitespace string))))

;;; ADD-TO-TOKEN  auxiliary function for GLOBAL-GET-TOKEN which 
;;; ============  adds characters to a token found so far (SO-FAR)
;;;               taking characters from a string STRING.
;;; The function uses information from global-variables:
;;;    *special-chars* and *delimiters* defined by INIT-LEX

(define ADD-TO-TOKEN
   (lambda [so-far string]
      (cond [(string-null string)
             [(if (string-null so-far) 'eot so-far) string]]
            [(= (string-first string) *comment-begin*)
             (add-to-token so-far (strip-comment string))]
            [(and (string-null so-far)
                  (member (string-first string) *special-chars*))
             [(substring 1 1 string) (string-rest string)]]
            [(member (string-first string) *delimiters*)
             [so-far string]]
            [$true
             (add-to-token (string-append so-far
                                          (substring 1 1 string))
                           (string-rest string))])))

;;; STRIP-LEADING-WHITESPACE  removes all leading white-space characters
;;; ========================  from a string.  White-space is defined as
;;;                           any character in the global variable
;;;                           *white-space-chars* (defined by INIT-LEX).

(define STRIP-LEADING-WHITESPACE
   (lambda [string]
      (cond [(string-null string) string]
            [(member (string-first string) *white-space-chars*)
             (strip-leading-whitespace (string-rest string))]
            [(= (string-first string) *comment-begin*)
             (strip-leading-whitespace (strip-comment string))]
            [$true
             string])))

;;; STRIP-COMMENT  Removes one full comment prefix from a string.  A comment
;;; =============  is defined as any string of characters starting with the
;;;                globally bound *comment-begin* character, up through and
;;;                including *comment-end* (both defined by INIT-LEX).

(define STRIP-COMMENT
   (lambda [string]
      (cond [(string-null string) string]
            [(= (string-first string) *comment-end*)
             (string-rest string)]
            [$true
             (strip-comment (string-rest string))])))


;;; ===============================================================
;;;          AUXILIARY STRING MANIPULATION ROUTINES
;;; ===============================================================

(define STRING-NULL
   (lambda [string] (= string "")))

(define STRING-FIRST
   (lambda [string] (nth-char 1 string)))

(define STRING-REST
   (lambda [string] (substring 2 (string-length string) string)))


;;; ===============================================================
;;;               TESTING and DEBUGGING ROUTINES
;;; ===============================================================


(define tokenize
   (lambda [string]
      (init-lex)
      (setup-lex string)
      (letrec [[collect (lambda []
                           (let [[tok (get-token)]]
                               (if (= tok 'eot)
                                   []
                                   (cons tok (collect)))))]]
         (collect))))