;;; Code stored as: {turing}<3-lisp.problem-sets>lexical-analyser.3-lisp ;;; {phylum}<3-lisp>course>problem-sets>lexical-analyser.3-lisp ;;; ======================== ;;; 3-LISP Lexical Analyser: ;;; ======================== ;;; =============================================================== ;;; PUBLIC ROUTINES ;;; =============================================================== ;;; (INIT-LEXAN) Initializes all tables. ;;; ============ ;;; ;;; (LEXAN-OBJECT string) A type of object that embodies the tokenization of a ;;; ===================== given string. Supports the following messages: ;;; ;;; (CURRENT-TOKEN lexan-object) Returns the token currently being scanned (i.e.,. ;;; ============================ the one made current by the last use of GET-TOKEN) ;;; ;;; (GET-TOKEN lexan-object) Also returns the current token being scanned but ;;; ======================== moves past the token in the input. This is the ;;; normal method to be used in accessing tokens. ;;; ;;; (NEXT-TOKEN lexan-object) Returns the 'look-ahead' token, the token just ;;; ========================= about to be scanned. Useful for dispatching without ;;; actually reading the token. ;;; ;;; (LEXICAL-ANALYSIS string) Designates the sequence of tokens into which ;;; ========================= string is lexically analyzed. For example ;;; (lexical-analysis "a (b)") => ["a" "(" "b" ")"]. ;;; ;;; (IDENTIFIER-EXPRESSION token) True just in case token is a sequence of alphanumeric ;;; ============================= characters, but is not a numeral-expression. ;;; ;;; (NUMERAL-EXPRESSION token) True just in case token is a sequence of digits, ;;; ========================== possibly including a leading "+" or "-". (define INIT-LEXAN (lambda [] (set *special-chars* [ #( #) #[ #] #$ #' #, #. #^ #\ #{ #} ]) (set *white-space-chars* [ # cr ]) (set *delimiters* (append *white-space-chars* *special-chars*)) (set *digits* [ #0 #1 #2 #3 #4 #5 #6 #7 #8 #9 ]) (set *comment-begin* #;) (set *comment-end* cr) "ok")) (define LEXAN-OBJECT (let [[create-lexan-object (object [string] [[current-tok ""] [next-tok ""]] [GET-TOKEN (lambda [] (let [[token-info (global-get-token string)]] (set string (second token-info)) (set current-tok next-tok) (set next-tok (first token-info)) current-tok))] [CURRENT-TOKEN (lambda [] current-tok)] [NEXT-TOKEN (lambda [] next-tok)])]] (lambda [string] (let [[lex-obj (create-lexan-object string)]] (get-token lex-obj) ; initialize lex-obj)))) ;;; Not dealt with by this code: ;;; ;;; -- strings ;;; -- charats ;;; -- character quotation (%) ;;; LEXICAL-ANALYSIS: ;;; ========= (define LEXICAL-ANALYSIS (lambda [string] (let [[a (lexan-object string)]] (letrec [[helper (lambda [] (let [[tok (get-token a)]] (if (= tok 'eot) [] (cons tok (helper)))))]] (helper))))) ;;; =============================================================== ;;; INTERNAL ROUTINES ;;; =============================================================== ;;; GLOBAL-GET-TOKEN takes a STRING and returns a rail of: ;;; ================ a) the prefix of the string corresponding to the ;;; first token in the string ;;; b) the rest of the string after the prefix ;;; In case there are no tokens left in the string, returns the ;;; handle 'eot (end of tokens). ;;; ;;; Used by GET-TOKEN (within INIT-LEXAN) (define GLOBAL-GET-TOKEN (lambda [string] (add-to-token "" (strip-leading-whitespace string)))) ;;; ADD-TO-TOKEN auxiliary function for GLOBAL-GET-TOKEN which ;;; ============ adds characters to a token found so far (SO-FAR) ;;; taking characters from a string STRING. ;;; ;;; Uses information from global-variables: *special-chars* ;;; defined by INIT-LEXAN: *delimiters* (define ADD-TO-TOKEN (lambda [so-far string] (cond [(string-null string) [(if (string-null so-far) 'eot so-far) string]] [(and (string-null so-far) (member (string-first string) *special-chars*)) [(substring 1 1 string) (string-rest string)]] [(member (string-first string) *delimiters*) [so-far string]] [$true (add-to-token (string-append so-far (substring 1 1 string)) (string-rest string))]))) ;;; STRIP-LEADING-WHITESPACE removes all leading white-space and comments ;;; ======================== from a string. White-space is defined as ;;; any character in the global variable ;;; *white-space-chars* (defined by INIT-LEXAN). ;;; Comments are any string of characters starting with the character ;;; bound to *comment-begin*, up through the character *comment-end* (define STRIP-LEADING-WHITESPACE (lambda [string] (cond [(string-null string) string] [(member (string-first string) *white-space-chars*) (strip-leading-whitespace (string-rest string))] [(= (string-first string) *comment-begin*) (strip-leading-whitespace (strip-comment string))] [$true string]))) ;;; STRIP-COMMENT Removes one full comment prefix from a string. A comment ;;; ============= is defined as any string of characters starting with the ;;; globally bound *comment-begin* character, up through and ;;; including *comment-end* (both defined by INIT-LEXAN). (define STRIP-COMMENT (lambda [string] (cond [(string-null string) string] [(= (string-first string) *comment-end*) (string-rest string)] [$true (strip-comment (string-rest string))]))) ;;; =============================================================== ;;; SPECIAL TOKEN IDENTIFICATION PREDICATES ;;; =============================================================== (define IDENTIFIER-EXPRESSION (lambda [token] (and (every-character (lambda [ch] (not (member ch *special-chars*))) token) (not (every-character (lambda [ch] (member ch *digits*)) token))))) (define NUMERAL-EXPRESSION (lambda [token] (every-character (lambda [ch] (member ch *digits*)) (if (member (nth-char 1 token) [#+ #-]) (string-rest token) token)))) ;;; =============================================================== ;;; STRING MANIPULATION ROUTINES ;;; =============================================================== (define STRING-NULL (lambda [string] (= string ""))) (define STRING-FIRST (lambda [string] (nth-char 1 string))) (define STRING-REST (lambda [string] (substring 2 (string-length string) string))) ;;; (EVERY-CHARACTER predicate string) True just in case predicate ;;; ================================== is true of each character in ;;; string. (define EVERY-CHARACTER (lambda [predicate string] (cond [(string-null string) $true] [(predicate (string-first string)) (every-character predicate (string-rest string))] [$true $false]))) (init-lexan)