;;; Code stored as: {turing}<3-lisp.problem-sets>lexical-analyser.3-lisp
;;; {phylum}<3-lisp>course>problem-sets>lexical-analyser.3-lisp
;;; ========================
;;; 3-LISP Lexical Analyser:
;;; ========================
;;; ===============================================================
;;; PUBLIC ROUTINES
;;; ===============================================================
;;; (INIT-LEXAN) Initializes all tables.
;;; ============
;;;
;;; (LEXAN-OBJECT string) A type of object that embodies the tokenization of a
;;; ===================== given string. Supports the following messages:
;;;
;;; (CURRENT-TOKEN lexan-object) Returns the token currently being scanned (i.e.,.
;;; ============================ the one made current by the last use of GET-TOKEN)
;;;
;;; (GET-TOKEN lexan-object) Also returns the current token being scanned but
;;; ======================== moves past the token in the input. This is the
;;; normal method to be used in accessing tokens.
;;;
;;; (NEXT-TOKEN lexan-object) Returns the 'look-ahead' token, the token just
;;; ========================= about to be scanned. Useful for dispatching without
;;; actually reading the token.
;;;
;;; (LEXICAL-ANALYSIS string) Designates the sequence of tokens into which
;;; ========================= string is lexically analyzed. For example
;;; (lexical-analysis "a (b)") => ["a" "(" "b" ")"].
;;;
;;; (IDENTIFIER-EXPRESSION token) True just in case token is a sequence of alphanumeric
;;; ============================= characters, but is not a numeral-expression.
;;;
;;; (NUMERAL-EXPRESSION token) True just in case token is a sequence of digits,
;;; ========================== possibly including a leading "+" or "-".
(define INIT-LEXAN
(lambda []
(set *special-chars* [ #( #) #[ #] #$ #' #, #. #↑ #\ #{ #} ])
(set *white-space-chars* [ # cr ])
(set *delimiters* (append *white-space-chars* *special-chars*))
(set *digits* [ #0 #1 #2 #3 #4 #5 #6 #7 #8 #9 ])
(set *comment-begin* #;)
(set *comment-end* cr)
"ok"))
(define LEXAN-OBJECT
(let [[create-lexan-object
(object [string]
[[current-tok ""]
[next-tok ""]]
[GET-TOKEN
(lambda []
(let [[token-info (global-get-token string)]]
(set string (second token-info))
(set current-tok next-tok)
(set next-tok (first token-info))
current-tok))]
[CURRENT-TOKEN (lambda [] current-tok)]
[NEXT-TOKEN (lambda [] next-tok)])]]
(lambda [string]
(let [[lex-obj (create-lexan-object string)]]
(get-token lex-obj) ; initialize
lex-obj))))
;;; Not dealt with by this code:
;;;
;;; -- strings
;;; -- charats
;;; -- character quotation (%)
;;; LEXICAL-ANALYSIS:
;;; =========
(define LEXICAL-ANALYSIS
(lambda [string]
(let [[a (lexan-object string)]]
(letrec [[helper (lambda []
(let [[tok (get-token a)]]
(if (= tok 'eot)
[]
(cons tok (helper)))))]]
(helper)))))
;;; ===============================================================
;;; INTERNAL ROUTINES
;;; ===============================================================
;;; GLOBAL-GET-TOKEN takes a STRING and returns a rail of:
;;; ================ a) the prefix of the string corresponding to the
;;; first token in the string
;;; b) the rest of the string after the prefix
;;; In case there are no tokens left in the string, returns the
;;; handle 'eot (end of tokens).
;;;
;;; Used by GET-TOKEN (within INIT-LEXAN)
(define GLOBAL-GET-TOKEN
(lambda [string]
(add-to-token "" (strip-leading-whitespace string))))
;;; ADD-TO-TOKEN auxiliary function for GLOBAL-GET-TOKEN which
;;; ============ adds characters to a token found so far (SO-FAR)
;;; taking characters from a string STRING.
;;;
;;; Uses information from global-variables: *special-chars*
;;; defined by INIT-LEXAN: *delimiters*
(define ADD-TO-TOKEN
(lambda [so-far string]
(cond [(string-null string)
[(if (string-null so-far) 'eot so-far) string]]
[(and (string-null so-far)
(member (string-first string) *special-chars*))
[(substring 1 1 string) (string-rest string)]]
[(member (string-first string) *delimiters*)
[so-far string]]
[$true
(add-to-token (string-append so-far
(substring 1 1 string))
(string-rest string))])))
;;; STRIP-LEADING-WHITESPACE removes all leading white-space and comments
;;; ======================== from a string. White-space is defined as
;;; any character in the global variable
;;; *white-space-chars* (defined by INIT-LEXAN).
;;; Comments are any string of characters starting with the character
;;; bound to *comment-begin*, up through the character *comment-end*
(define STRIP-LEADING-WHITESPACE
(lambda [string]
(cond [(string-null string) string]
[(member (string-first string) *white-space-chars*)
(strip-leading-whitespace (string-rest string))]
[(= (string-first string) *comment-begin*)
(strip-leading-whitespace (strip-comment string))]
[$true
string])))
;;; STRIP-COMMENT Removes one full comment prefix from a string. A comment
;;; ============= is defined as any string of characters starting with the
;;; globally bound *comment-begin* character, up through and
;;; including *comment-end* (both defined by INIT-LEXAN).
(define STRIP-COMMENT
(lambda [string]
(cond [(string-null string) string]
[(= (string-first string) *comment-end*)
(string-rest string)]
[$true
(strip-comment (string-rest string))])))
;;; ===============================================================
;;; SPECIAL TOKEN IDENTIFICATION PREDICATES
;;; ===============================================================
(define IDENTIFIER-EXPRESSION
(lambda [token]
(and (every-character
(lambda [ch] (not (member ch *special-chars*)))
token)
(not (every-character
(lambda [ch] (member ch *digits*))
token)))))
(define NUMERAL-EXPRESSION
(lambda [token]
(every-character (lambda [ch] (member ch *digits*))
(if (member (nth-char 1 token) [#+ #-])
(string-rest token)
token))))
;;; ===============================================================
;;; STRING MANIPULATION ROUTINES
;;; ===============================================================
(define STRING-NULL
(lambda [string] (= string "")))
(define STRING-FIRST
(lambda [string] (nth-char 1 string)))
(define STRING-REST
(lambda [string] (substring 2 (string-length string) string)))
;;; (EVERY-CHARACTER predicate string) True just in case predicate
;;; ================================== is true of each character in
;;; string.
(define EVERY-CHARACTER
(lambda [predicate string]
(cond [(string-null string) $true]
[(predicate (string-first string))
(every-character predicate (string-rest string))]
[$true $false])))
(init-lexan)