A mostly-complete scanner!

This commit is contained in:
Scott Richmond 2022-02-06 18:20:55 -05:00
parent bbe20d4af1
commit c3426624c9

View File

@@ -1,11 +1,15 @@
(ns ludus.scanner (ns ludus.scanner
(:require [ludus.token :as token] (:require
[clojure.pprint :as pp])) [ludus.token :as token]
[clojure.pprint :as pp]
[clojure.edn :as edn]
[clojure.string :as s]))
(def reserved-words (def reserved-words
"List of Ludus reserved words." "List of Ludus reserved words."
;; see ludus-spec repo for more info ;; see ludus-spec repo for more info
#{ #{
"as"
"cond" "cond"
"else" "else"
"false" "false"
@@ -18,13 +22,17 @@
"then" "then"
"true" "true"
"var" "var"
"with" ;;maybe "with"
;; below here, probable ;; below here, probable
"defer" "defer"
"gen"
"loop" "loop"
"ns" "ns"
"recur" "recur"
"repeat" "repeat"
"test"
"wait"
"yield"
;; below here, possible ;; below here, possible
"when" "when"
}) })
@@ -77,10 +85,11 @@
(char-in-range? \1 \9 c)) (char-in-range? \1 \9 c))
;; for now, use very basic ASCII charset in words ;; for now, use very basic ASCII charset in words
;; TODO: research the implications of using the whole Unicode letter class (\p{L}) instead, e.g.:
;; (defn- alpha? [c] (boolean (re-find #"\p{L}" (str c))))
(defn- alpha? [c] (defn- alpha? [c]
(or (char-in-range? \a \z c) (char-in-range? \A \Z c))) (or (char-in-range? \a \z c) (char-in-range? \A \Z c)))
;; (defn- alpha? [c] (boolean (re-find #"\p{L}" (str c))))
;; legal characters in words ;; legal characters in words
(def word-chars #{\_ \? \! \* \/}) (def word-chars #{\_ \? \! \* \/})
@@ -91,7 +100,7 @@
(defn- whitespace? [c] (defn- whitespace? [c]
(or (= c \space) (= c \tab))) (or (= c \space) (= c \tab)))
(def terminators #{\: \; \newline \{ \} \( \) \[ \] \$ \# \- \< \& \, \|}) (def terminators #{\: \; \newline \{ \} \( \) \[ \] \$ \# \- \< \& \, \| nil \\})
(defn- terminates? [c] (defn- terminates? [c]
(or (whitespace? c) (contains? terminators c))) (or (whitespace? c) (contains? terminators c)))
@@ -109,29 +118,39 @@
(::start scanner))))) (::start scanner)))))
;; TODO: errors should also be in the vector of tokens ;; TODO: errors should also be in the vector of tokens
;; The goal is to be able to be able to hand this to an LSP ;; The goal is to be able to hand this to an LSP?
;; Do we need a different structure?
(defn- add-error [scanner msg] (defn- add-error [scanner msg]
(update scanner ::errors conj {:msg msg :line (::line scanner) :start (::start scanner)})) (update scanner ::errors conj {:msg msg :line (::line scanner) :start (::start scanner)}))
;; TODO: finish this (defn- add-keyword
(defn- scan-keyword [scanner]
([scanner] (scan-keyword scanner scanner)) (loop [scanner scanner
([start current])) key ""]
(let [char (current-char scanner)]
(cond
(terminates? char) (add-token scanner ::token/keyword (keyword key))
(word-char? char) (recur (advance scanner) (str key char))
:else (add-error scanner "Unexpected " char "after keyword :" key)))))
(defn- add-keyword [scanner] ;; TODO: improve number parsing?
(let [advanced (advance scanner) ;; This will currently parse 000001 as 1. Is that correct behaviour?
char (current-char advanced)] (defn- add-number [char scanner]
(if (not (alpha? char)) (loop [scanner scanner
(add-error scanner (str "Keywords must start with a letter, e.g. :foo. Got " \: char)) num (str char)
(scan-keyword advanced)))) float? false]
(let [curr (current-char scanner)]
(defn- add-zero-start [scanner]) (cond
(= curr \_) (recur (advance scanner) num float?) ;; consume underscores unharmed
(defn- add-number [scanner] (= curr \.) (if float?
(let [current (current-char scanner)] (add-error scanner (str "Unexpected second decimal point after " num "."))
(if (nonzero-digit? current) (recur (advance scanner) (str num curr) true))
(loop [current current])))) (terminates? curr) (add-token scanner ::token/number (edn/read-string num))
(digit? curr) (recur (advance scanner) (str num curr) float?)
:else (add-error scanner (str "Unexpected " curr " after number " num "."))))))
;; TODO: add string interpolation
;; This still has to be devised
(defn- add-string (defn- add-string
[scanner] [scanner]
(loop [scanner scanner (loop [scanner scanner
@@ -140,27 +159,51 @@
(case char (case char
\newline (add-error scanner "Unterminated string.") \newline (add-error scanner "Unterminated string.")
\" (add-token (advance scanner) ::token/string string) \" (add-token (advance scanner) ::token/string string)
\\ (recur (advance (advance scanner)) (str string (next-char scanner))) \\ (let [next (next-char scanner)
scanner (if (= next \newline)
(update scanner ::line inc)
scanner)]
(recur (advance (advance scanner)) (str string next)))
(if (at-end? scanner) (if (at-end? scanner)
(add-error scanner "Unterminated string.") (add-error scanner "Unterminated string.")
(recur (advance scanner) (str string char))))))) (recur (advance scanner) (str string char)))))))
(defn- add-word (defn- add-word
[char scanner]
(loop [scanner scanner
word (str char)]
(let [curr (current-char scanner)]
(cond
(terminates? curr) (if (contains? reserved-words word)
(add-token scanner ::token/reserved)
(add-token scanner ::token/word))
(word-char? curr) (recur (advance scanner) (str word curr))
:else (add-error scanner (str "Unexpected " curr " after word " word "."))))))
(defn- add-ignored
[scanner] [scanner]
(loop [scanner scanner (loop [scanner scanner
word ""]) ignored "_"]
(let [char (current-char scanner)])) (let [char (current-char scanner)]
(cond
(terminates? char) (add-token scanner ::token/ignored)
(word-char? char) (recur (advance scanner) (str ignored char))
:else (add-error scanner (str "Unexpected " char " after word " ignored "."))))))
(defn- skip-comment [scanner] (defn- add-comment [char scanner]
(if (= \newline (current-char scanner)) (loop [scanner scanner
(advance scanner) comm (str char)]
(recur (advance scanner)))) (let [char (current-char scanner)]
(if (= \newline char)
(if (s/starts-with? comm "&&&")
(add-token (update scanner ::line inc) ::token/docstring)
(add-token (update scanner ::line inc) ::token/comment))
(recur (advance scanner) (str comm char))))))
(defn- scan-token [scanner] (defn- scan-token [scanner]
(let [char (current-char scanner) (let [char (current-char scanner)
scanner (advance scanner) scanner (advance scanner)
next (current-char scanner) next (current-char scanner)]
]
(case char (case char
;; one-character tokens ;; one-character tokens
\( (add-token scanner ::token/lparen) \( (add-token scanner ::token/lparen)
@@ -171,15 +214,15 @@
\] (add-token scanner ::token/rbracket) \] (add-token scanner ::token/rbracket)
\; (add-token scanner ::token/semicolon) \; (add-token scanner ::token/semicolon)
\, (add-token scanner ::token/comma) \, (add-token scanner ::token/comma)
\newline (add-token scanner ::token/newline) \newline (add-token (update scanner ::line inc) ::token/newline)
\\ (add-token scanner ::token/backslash) \\ (add-token scanner ::token/backslash)
;; two-character tokens ;; two-character tokens
;; -> ;; ->
\- (cond \- (cond
(= next \>) (add-token (advance scanner) ::token/rarrow) (= next \>) (add-token (advance scanner) ::token/rarrow)
(digit? next) (add-number scanner) (digit? next) (add-number char scanner)
(add-error scanner ("Expected -> or negative number. Got " char next))) :else (add-error scanner (str "Expected -> or negative number. Got " char next)))
;; <- ;; <-
\< (if (= next \-) \< (if (= next \-)
@@ -206,34 +249,45 @@
(add-token (advance scanner) ::token/startset) (add-token (advance scanner) ::token/startset)
(add-error scanner (str "Expected beginning of set: ${. Got " char next))) (add-error scanner (str "Expected beginning of set: ${. Got " char next)))
;; placeholder ;; placeholders
;; TODO: add named placeholder ;; there's a flat _, and then ignored words
\_ (if (terminates? next) \_ (cond
(add-token scanner ::token/placeholder) (terminates? next) (add-token scanner ::token/placeholder)
(add-word scanner)) (alpha? next) (add-ignored scanner)
:else (add-error scanner (str "Expected placeholder: _. Got " char next)))
;; comments ;; comments
;; & starts an inline comment ;; & starts an inline comment
;; TODO: include comments in scanned file ;; TODO: include comments in scanned file
;; TODO: add doc comments: &&& ;; TODO: add doc comments: &&&
\& (skip-comment scanner) \& (add-comment char scanner)
;; keywords ;; keywords
;; TODO: instead of a separate token, scan a whole type keyword ;; TODO: instead of a separate token, scan a whole type keyword
;; e.g. ::string, ::number ;; e.g. ::string, ::number
\: (cond \: (cond
;;(= \: next) (add-token (advance scanner) ::token/doublecolon)) ;;(= \: next) (add-token (advance scanner) ::token/doublecolon))
(alpha? next) (add-word scanner) (alpha? next) (add-keyword scanner)
:else (add-error scanner (str "Expected keyword. Got " char next)) :else (add-error scanner (str "Expected keyword. Got " char next)))
;; splats
\. (let [after_next (current-char (advance scanner))]
(if (= ".." (str next after_next))
(add-token (advance (advance scanner)) ::token/splat)
(add-error scanner (str "Expected splat: ... . Got " (str "." next after_next)))))
;; strings ;; strings
\" (add-string scanner) \" (add-string scanner)
;; word matches ;; word matches
(cond (cond
(whitespace? char) scanner ;; TODO: include whitespace in scan (whitespace? char) (loop [scanner scanner ws (str char)]
;; (digit? char) (add-number scanner) (let [curr (current-char scanner)]
;; (alpha? char) (add-word scanner) (if (whitespace? curr)
(recur (advance scanner) (str ws curr))
(add-token scanner ::token/ws))))
(digit? char) (add-number char scanner)
(alpha? char) (add-word char scanner)
:else (add-error scanner (str "Unexpected character: " char)))))) :else (add-error scanner (str "Unexpected character: " char))))))
(defn- next-token [scanner] (defn- next-token [scanner]
@@ -242,18 +296,24 @@
(defn scan [source] (defn scan [source]
(loop [scanner (new-scanner source)] (loop [scanner (new-scanner source)]
(if (at-end? scanner) (if (at-end? scanner)
(let [scanner (add-token scanner ::eof)] (let [scanner (add-token scanner ::token/eof)]
{:tokens (::tokens scanner) {:tokens (::tokens scanner)
:errors (::errors scanner)}) :errors (::errors scanner)})
(recur (-> scanner (scan-token) (next-token)))))) (recur (-> scanner (scan-token) (next-token))))))
(let [source "\"foo\\\nbar\"\n)"] (let [source "&&1 abc qde\n\n"]
(scan source)) (scan source))
;; string scanning is (I think) working
;; line counting is not working
;; do I just save a location and then calculate line numbers if an error happens?
;; next up: numbers!