File [phylum]<3-LISP>course>UTILITIES>NEW-LEXICAL-ANALYSER.3-LISP!7


;;;            ======================== 
;;;            3-LISP Lexical Analyser:
;;;            ========================

;;; INIT-LEX  Initializes all tables.
;;; ========
;;;     
;;; TOKEN-LIST   A type of object that embodies the tokenization of a 
;;; ==========   given string.  Allows the following messages:
;;;     
;;; CURRENT-TOKEN  Returns the token currently being scanned.
;;; =============
;;;
;;; GET-TOKEN  Advances past one token: the look-ahead token becomes the
;;; =========  current token and is returned, and a new look-ahead token
;;;            is scanned from the input.  This is the normal function to
;;;            be used in accessing tokens.  It should be called once upon
;;;            setting up a token-list object to initialize the look-ahead
;;;            token (next-tok); that first call returns the empty string.
;;;
;;; NEXT-TOKEN  Returns the 'look-ahead' token, the token just about to
;;; ==========  be scanned.  Useful for dispatching without actually
;;;             reading the token.

(define INIT-LEX
   (lambda []
      ;; Character classes consulted by the scanner.
      (set *digits* [ #0 #1 #2 #3 #4 #5 #6 #7 #8 #9 ])
      (set *white-space-chars* [ #  cr ])
      (set *special-chars* [ #( #) #[ #] #$ #' #, #. #↑ #\ #{ #} ])
      ;; The delimiters are the white-space characters followed by the
      ;; special characters, so this must come after both are set.
      (set *delimiters* (append *white-space-chars* *special-chars*))
      ;; A comment runs from *comment-begin* through *comment-end*.
      (set *comment-begin* #;)
      (set *comment-end* cr)
      "ok"))

;;; Assumes <3-lisp>course>utilities>objects.3-lisp

(define TOKEN-LIST
   (object [string]
           [[current-tok ""]
            [next-tok ""]]
      ;; GET-TOKEN: advance one token.  The old look-ahead becomes the
      ;; current token (and the result); a fresh look-ahead is scanned off
      ;; the front of STRING, and STRING is replaced by the unscanned rest.
      [GET-TOKEN
         (lambda []
            (let [[scanned (global-get-token string)]]
               (set current-tok next-tok)
               (set next-tok (first scanned))
               (set string (second scanned))
               current-tok))]
      ;; CURRENT-TOKEN: the token last returned by GET-TOKEN (initially "").
      [CURRENT-TOKEN (lambda [] current-tok)]
      ;; NEXT-TOKEN: the look-ahead token, without consuming it.
      [NEXT-TOKEN (lambda [] next-tok)]))

;;; Still to be dealt with:
;;; 
;;;  -- strings
;;;  -- charats
;;;  -- character quotation (%)

;;; GLOBAL-GET-TOKEN  Scans the first token off the front of STRING, after
;;; ================  skipping leading white space and comments.
;;;                   Returns a two-element rail of:
;;;                   a) the first token in the string, or the handle
;;;                      'eot (end of tokens) if no token is left
;;;                   b) the rest of the string after that token
;;; Used by the GET-TOKEN message of TOKEN-LIST.

(define GLOBAL-GET-TOKEN
   (lambda [string]
      (let [[unscanned (strip-leading-whitespace string)]]
         (add-to-token "" unscanned))))

;;; ADD-TO-TOKEN  auxiliary function for GLOBAL-GET-TOKEN which
;;; ============  extends the token found so far (SO-FAR) with characters
;;;               taken from the front of STRING.  Returns a rail of the
;;;               token and the remaining string:
;;;    -- STRING exhausted: SO-FAR, or the handle 'eot if SO-FAR is empty
;;;    -- a special character at the very start is a one-character token
;;;    -- a delimiter or the comment-begin character ends the token and is
;;;       left at the front of the remaining string
;;;    -- any other character is appended to SO-FAR
;;; The function uses information from global variables:
;;;    *special-chars*, *delimiters* and *comment-begin* defined by INIT-LEX

(define ADD-TO-TOKEN
   (lambda [so-far string]
      (cond [(string-null string)
             [(if (string-null so-far) 'eot so-far) string]]
            [(and (string-null so-far)
                  (member (string-first string) *special-chars*))
             [(substring 1 1 string) (string-rest string)]]
            [(member (string-first string) *delimiters*)
             [so-far string]]
            ;; A comment may follow a token with no intervening white space
            ;; (e.g. "foo;note").  *comment-begin* is not in *delimiters*,
            ;; so end the token here; the comment stays in the remaining
            ;; string and is stripped by the next scan.
            [(= (string-first string) *comment-begin*)
             [so-far string]]
            [$true
             (add-to-token (string-append so-far
                                          (substring 1 1 string))
                           (string-rest string))])))

;;; STRIP-LEADING-WHITESPACE  Returns STRING with everything ignorable
;;; ========================  removed from its front: characters in the
;;;                           global variable *white-space-chars*, and
;;;                           whole comments introduced by *comment-begin*
;;;                           (both defined by INIT-LEX).  Stops at the
;;;                           first character that is neither.

(define STRIP-LEADING-WHITESPACE
   (lambda [string]
      (if (string-null string)
          string
          (let [[ch (string-first string)]]
             (cond [(member ch *white-space-chars*)
                    (strip-leading-whitespace (string-rest string))]
                   [(= ch *comment-begin*)
                    (strip-leading-whitespace (strip-comment string))]
                   [$true string])))))

;;; STRIP-COMMENT  Drops characters from the front of STRING up through and
;;; =============  including the first *comment-end* character (defined by
;;;                INIT-LEX) and returns what follows.  If STRING contains
;;;                no *comment-end*, the result is the empty string.
;;;                Called by STRIP-LEADING-WHITESPACE on a string that
;;;                starts with *comment-begin*.

(define STRIP-COMMENT
   (lambda [string]
      (if (string-null string)
          string
          (let [[remainder (string-rest string)]]
             (if (= (string-first string) *comment-end*)
                 remainder
                 (strip-comment remainder))))))


;;; ===============================================================
;;;          AUXILIARY STRING MANIPULATION ROUTINES
;;; ===============================================================

;;; STRING-NULL  True when STRING is equal to the empty string "".

(define STRING-NULL
   (lambda [string] (= string "")))

;;; STRING-FIRST  The first character of STRING (character positions are
;;;               counted from 1).  Callers in this file test STRING-NULL
;;;               first, so it is only applied to non-empty strings.

(define STRING-FIRST
   (lambda [string] (nth-char 1 string)))

;;; STRING-REST  Everything in STRING after its first character, i.e. the
;;;              substring from position 2 through the last position.

(define STRING-REST
   (lambda [string]
      (let [[last-position (string-length string)]]
         (substring 2 last-position string))))


;;; TOKENIZE  Collects all the tokens of STRING, in order of appearance.
;;; ========  Calls INIT-LEX on every invocation, so the lexer tables are
;;;           always (re)initialized before scanning.

(define tokenize
   (lambda [string]
      (init-lex)
      (let [[tokens (token-list string)]]
         ;; Prime the look-ahead token.  The value returned by this first
         ;; call is the initial "" and is deliberately discarded.
         (get-token tokens)
         ;; NOTE(review): CONS is not defined in this file; presumably it
         ;; comes from the course utilities -- confirm it builds the
         ;; intended structure on top of [].
         (letrec [[gather (lambda []
                             (let [[tok (get-token tokens)]]
                                (cond [(= tok 'eot) []]
                                      [$true (cons tok (gather))])))]]
            (gather)))))