ludus/src/ludus/scanner.clj
2023-06-01 17:27:55 -06:00

327 lines
10 KiB
Clojure

(ns ludus.scanner
(:require
[ludus.token :as token]
;; [clojure.pprint :as pp]
[clojure.edn :as edn]))
(def reserved-words
"List of Ludus reserved words."
;; see ludus-spec repo for more info
{"as" :as ;; impl for `import`; not yet for patterns
;"cond" :cond ;; impl
"do" :do ;; impl
"else" :else ;; impl
"false" :false ;; impl -> literal word
"fn" :fn ;; impl
"if" :if ;; impl
"import" :import ;; impl
"let" :let ;; impl
"loop" :loop ;; impl
; "match" :match ;; impl
"nil" :nil ;; impl -> literal word
"ns" :ns ;; impl
;; "panic!" :panic ;; impl (should be a function)
"recur" :recur ;; impl
"ref" :ref ;; impl
"then" :then ;; impl
"true" :true ;; impl -> literal word
"with" :with ;; impl
;; actor model/concurrency
"receive" :receive
;;"self" :self ;; maybe not necessary?: self() is a function
;;"send" :send ;; not necessary: send(pid, message) is a function
"spawn" :spawn
;;"to" :to ;; not necessary if send is a function
;; type system
;; "data" :data ;; we are going to tear out datatypes for now: see if dynamism works for us
;; others
;;"repeat" :repeat ;; syntax sugar over "loop": still unclear what this syntax could be
"test" :test
"when" :when
;; "module" :module ;; not necessary if we don't have datatypes
"is" :is
})
(def literal-words {
"true" true
"false" false
"nil" nil
})
(defn- new-scanner
"Creates a new scanner."
[source]
{:source source
:length (count source)
:errors []
:start 0
:current 0
:line 1
:tokens []})
(defn- at-end?
"Tests if a scanner is at end of input."
[scanner]
(>= (:current scanner) (:length scanner)))
(defn- current-char
"Gets the current character of the scanner."
[scanner]
(nth (:source scanner) (:current scanner) nil))
(defn- advance
"Advances the scanner by a single character."
[scanner]
(update scanner :current inc))
(defn- next-char
"Gets the next character from the scanner."
[scanner]
(current-char (advance scanner)))
(defn- current-lexeme
[scanner]
(subs (:source scanner) (:start scanner) (:current scanner)))
(defn- char-in-range? [start end char]
(and char
(>= (int char) (int start))
(<= (int char) (int end))))
(defn- digit? [c]
(char-in-range? \0 \9 c))
(defn- nonzero-digit? [c]
(char-in-range? \1 \9 c))
;; for now, use very basic ASCII charset in words
;; TODO: research the implications of using the whole
;; (defn- alpha? [c] (boolean (re-find #"\p{L}" (str c))))
(defn- alpha? [c]
(or (char-in-range? \a \z c) (char-in-range? \A \Z c)))
(defn- lower? [c] (char-in-range? \a \z c))
(defn- upper? [c] (char-in-range? \A \Z c))
;; legal characters in words
(def word-chars #{\_ \? \! \* \/})
(defn- word-char? [c]
(or (alpha? c) (digit? c) (contains? word-chars c)))
(defn- whitespace? [c]
(or (= c \space) (= c \tab)))
(def terminators #{\: \; \newline \{ \} \( \) \[ \] \$ \# \- \= \& \, \> nil \\})
(defn- terminates? [c]
(or (whitespace? c) (contains? terminators c)))
(defn- add-token
([scanner token-type]
(add-token scanner token-type nil))
([scanner token-type literal]
(update scanner :tokens conj
(token/token
token-type
(current-lexeme scanner)
literal
(:line scanner)
(:start scanner)))))
;; TODO: errors should also be in the vector of tokens
;; The goal is to be able to be able to hand this to an LSP?
;; Do we need a different structure
(defn- add-error [scanner msg]
(let [token (token/token
:error
(current-lexeme scanner)
nil
(:line scanner)
(:start scanner))
err-token (assoc token :message msg)]
(-> scanner
(update :errors conj err-token)
(update :tokens conj err-token))))
(defn- add-keyword
[scanner]
(loop [scanner scanner
key ""]
(let [char (current-char scanner)]
(cond
(terminates? char) (add-token scanner :keyword (keyword key))
(word-char? char) (recur (advance scanner) (str key char))
:else (add-error scanner (str "Unexpected " char "after keyword :" key))))))
;; TODO: improve number parsing?
;; Currently this uses Clojure's number formatting rules (since we use the EDN reader)
;; These rules are here: https://cljs.github.io/api/syntax/number
(defn- add-number [char scanner]
(loop [scanner scanner
num (str char)
float? false]
(let [curr (current-char scanner)]
(cond
(= curr \_) (recur (advance scanner) num float?) ;; consume underscores unharmed
(= curr \.) (if float?
(add-error scanner (str "Unexpected second decimal point after " num "."))
(recur (advance scanner) (str num curr) true))
(terminates? curr) (add-token scanner :number (edn/read-string num))
(digit? curr) (recur (advance scanner) (str num curr) float?)
:else (add-error scanner (str "Unexpected " curr " after number " num "."))))))
;; TODO: activate string interpolation
(defn- add-string
[scanner]
(loop [scanner scanner
string ""
interpolate? false]
(let [char (current-char scanner)]
(case char
\{ (recur (update (advance scanner)) (str string char) true)
; allow multiline strings
\newline (recur (update (advance scanner) :line inc) (str string char) interpolate?)
\" (if interpolate?
;(add-token (advance scanner) :interpolated string)
(add-token (advance scanner) :string string)
(add-token (advance scanner) :string string))
\\ (let [next (next-char scanner)
scanner (if (= next \newline)
(update scanner :line inc)
scanner)]
(recur (advance (advance scanner)) (str string next) interpolate?))
(if (at-end? scanner)
(add-error scanner "Unterminated string.")
(recur (advance scanner) (str string char) interpolate?))))))
(defn- add-word
[char scanner]
(loop [scanner scanner
word (str char)]
(let [curr (current-char scanner)]
(cond
(terminates? curr) (add-token scanner
(get reserved-words word :word)
(get literal-words word :none))
(word-char? curr) (recur (advance scanner) (str word curr))
:else (add-error scanner (str "Unexpected " curr " after word " word "."))))))
(defn- add-data
[char scanner]
(loop [scanner scanner
word (str char)]
(let [curr (current-char scanner)]
(cond
(terminates? curr) (add-token scanner :datatype)
(word-char? curr) (recur (advance scanner) (str word curr))
:else (add-error scanner (str "Unexpected " curr " after datatype " word "."))))))
(defn- add-ignored
[scanner]
(loop [scanner scanner
ignored "_"]
(let [char (current-char scanner)]
(cond
(terminates? char) (add-token scanner :ignored)
(word-char? char) (recur (advance scanner) (str ignored char))
:else (add-error scanner (str "Unexpected " char " after word " ignored "."))))))
(defn- add-comment [char scanner]
(loop [scanner scanner
comm (str char)]
(let [char (current-char scanner)]
(if (= \newline char)
(update scanner :line inc)
(recur (advance scanner) (str comm char))))))
(defn- scan-token [scanner]
(let [char (current-char scanner)
scanner (advance scanner)
next (current-char scanner)]
(case char
;; one-character tokens
\( (add-token scanner :lparen)
;; :break is a special zero-char token before closing braces
;; it makes parsing much simpler
\) (add-token (add-token scanner :break) :rparen)
\{ (add-token scanner :lbrace)
\} (add-token (add-token scanner :break) :rbrace)
\[ (add-token scanner :lbracket)
\] (add-token (add-token scanner :break) :rbracket)
\; (add-token scanner :semicolon)
\, (add-token scanner :comma)
\newline (add-token (update scanner :line inc) :newline)
\\ (add-token scanner :backslash)
\= (add-token scanner :equals)
\> (add-token scanner :pipeline)
;; two-character tokens
;; ->
\- (cond
(= next \>) (add-token (advance scanner) :rarrow)
(digit? next) (add-number char scanner)
:else (add-error scanner (str "Expected -> or negative number after `-`. Got `" char next "`")))
;; dict #{
\# (if (= next \{)
(add-token (advance scanner) :startdict)
(add-error scanner (str "Expected beginning of dict: #{. Got " char next)))
;; set ${
\$ (if (= next \{)
(add-token (advance scanner) :startset)
(add-error scanner (str "Expected beginning of set: ${. Got " char next)))
;; struct @{
\@ (if (= next \{)
(add-token (advance scanner) :startstruct)
(add-error scanner (str "Expected beginning of struct: @{. Got " char next)))
;; placeholders
;; there's a flat _, and then ignored words
\_ (cond
(terminates? next) (add-token scanner :placeholder)
(alpha? next) (add-ignored scanner)
:else (add-error scanner (str "Expected placeholder: _. Got " char next)))
;; comments
;; & starts an inline comment
\& (add-comment char scanner)
;; keywords
\: (cond
(alpha? next) (add-keyword scanner)
:else (add-error scanner (str "Expected keyword. Got " char next)))
;; splats
\. (let [after_next (current-char (advance scanner))]
(if (= ".." (str next after_next))
(add-token (advance (advance scanner)) :splat)
(add-error scanner (str "Expected splat: ... . Got " (str "." next after_next)))))
;; strings
\" (add-string scanner)
;; word matches
(cond
(whitespace? char) scanner ;; for now just skip whitespace characters
(digit? char) (add-number char scanner)
(upper? char) (add-word char scanner) ;; no datatypes for now
(lower? char) (add-word char scanner)
:else (add-error scanner (str "Unexpected character: " char))))))
(defn- next-token [scanner]
(assoc scanner :start (:current scanner)))
(defn scan [source]
(loop [scanner (new-scanner source)]
(if (at-end? scanner)
(let [scanner (add-token (add-token scanner :break) :eof)]
{:tokens (:tokens scanner)
:errors (:errors scanner)})
(recur (-> scanner (scan-token) (next-token))))))