A mostly-complete scanner!

This commit is contained in:
Scott Richmond 2022-02-06 18:20:55 -05:00
parent bbe20d4af1
commit c3426624c9

View File

@ -1,11 +1,15 @@
(ns ludus.scanner
(:require [ludus.token :as token]
[clojure.pprint :as pp]))
(:require
[ludus.token :as token]
[clojure.pprint :as pp]
[clojure.edn :as edn]
[clojure.string :as s]))
(def reserved-words
"List of Ludus reserved words."
;; see ludus-spec repo for more info
#{
"as"
"cond"
"else"
"false"
@ -18,13 +22,17 @@
"then"
"true"
"var"
"with" ;;maybe
"with"
;; below here, probable
"defer"
"gen"
"loop"
"ns"
"recur"
"repeat"
"test"
"wait"
"yield"
;; below here, possible
"when"
})
@ -77,10 +85,11 @@
(char-in-range? \1 \9 c))
;; for now, use very basic ASCII charset in words
;; TODO: research the implications of using the whole
;; Unicode letter class (\p{L}) in words
;; (defn- alpha? [c] (boolean (re-find #"\p{L}" (str c))))
(defn- alpha?
  "Truthy when `c` falls in the ASCII range a-z or A-Z, as decided by
  `char-in-range?`."
  [c]
  (let [lower (char-in-range? \a \z c)]
    ;; Mirror `or`: hand back the first truthy result, otherwise the
    ;; result of the upper-case check.
    (if lower
      lower
      (char-in-range? \A \Z c))))
;; (defn- alpha? [c] (boolean (re-find #"\p{L}" (str c))))
;; legal characters in words
(def word-chars
  "Set of the punctuation characters _ ? ! * / (built from a string)."
  (set "_?!*/"))
@ -91,7 +100,7 @@
(defn- whitespace?
  "Returns true when `c` is a space or a tab character, false otherwise.
  Newlines are not counted as whitespace here."
  [c]
  (contains? #{\space \tab} c))
(def terminators
  "Characters that end a word, number or keyword. `nil` is a member so that
  a missing character is treated as a terminator as well; the backslash is
  included too."
  #{\: \; \newline \{ \} \( \) \[ \] \$ \# \- \< \& \, \| nil \\})
(defn- terminates?
  "Truthy when `c` is whitespace (per `whitespace?`) or a member of
  `terminators`."
  [c]
  (if-let [ws (whitespace? c)]
    ws
    (contains? terminators c)))
@ -109,29 +118,39 @@
(::start scanner)))))
;; TODO: errors should also be in the vector of tokens
;; The goal is to be able to hand this to an LSP.
;; Do we need a different structure?
(defn- add-error
  "Returns `scanner` with one more entry conj'd onto its ::errors value.
  The entry is a map holding `msg` together with the scanner's current
  ::line and ::start values."
  [scanner msg]
  (let [report {:msg msg
                :line (::line scanner)
                :start (::start scanner)}
        so-far (::errors scanner)]
    (assoc scanner ::errors (conj so-far report))))
;; TODO: finish this
(defn- scan-keyword
  "Unfinished keyword scanner. The one-argument arity delegates to the
  two-argument arity with the same scanner in both positions; the
  two-argument arity has no body yet, so both return nil."
  ([scanner]
   (scan-keyword scanner scanner))
  ([start current]
   nil))
(defn- add-keyword
  "Accumulates word characters starting at the scanner's current character
  and, once a terminating character is reached, adds a ::token/keyword
  token whose literal is the accumulated text as a keyword. Any character
  that is neither a terminator nor a word character records an error
  instead."
  [scanner]
  (loop [scanner scanner
         key ""]
    (let [char (current-char scanner)]
      (cond
        (terminates? char) (add-token scanner ::token/keyword (keyword key))
        (word-char? char) (recur (advance scanner) (str key char))
        ;; add-error takes exactly [scanner msg], so the message pieces
        ;; have to be joined into one string before the call.
        :else (add-error scanner (str "Unexpected " char " after keyword :" key))))))
(defn- add-keyword
  "Advances the scanner once and looks at the character found there. When
  it is a letter (per `alpha?`) the advanced scanner is handed to
  `scan-keyword`; otherwise an error is recorded on the original scanner."
  [scanner]
  (let [stepped (advance scanner)
        head (current-char stepped)]
    (if (alpha? head)
      (scan-keyword stepped)
      (add-error scanner
                 (str "Keywords must start with a letter, e.g. :foo. Got " \: head)))))
(defn- add-zero-start
  "Stub with no implementation yet; always returns nil."
  [scanner]
  nil)
(defn- add-number
  "Unfinished one-argument number scanner. It reads the current character
  and, when that is a non-zero digit, enters a loop that has no body yet,
  so every path returns nil."
  [scanner]
  (let [head (current-char scanner)]
    (when (nonzero-digit? head)
      (loop [head head]))))
;; TODO: improve number parsing?
;; This will currently parse 000001 as 1. Is that correct behaviour?
(defn- add-number
  "Scans the remainder of a number literal whose first character is `char`
  and adds a ::token/number token carrying the parsed value.

  Underscores are consumed without becoming part of the number. At most one
  decimal point is accepted; a second one records an error, as does any
  character that is not a digit, underscore, decimal point or terminator."
  [char scanner]
  (loop [scanner scanner
         num (str char)
         float? false]
    (let [curr (current-char scanner)]
      (cond
        ;; consume underscores unharmed
        (= curr \_) (recur (advance scanner) num float?)
        (= curr \.) (if float?
                      (add-error scanner (str "Unexpected second decimal point after " num "."))
                      (recur (advance scanner) (str num curr) true))
        (terminates? curr)
        ;; The EDN reader treats an integer with a leading zero as octal
        ;; ("010" reads as 8) and rejects "08"/"09" outright. Drop leading
        ;; zeros that are followed by another digit, keeping any sign, so
        ;; the literal is always read as decimal.
        (let [decimal (s/replace-first num #"^(-?)0+(?=\d)" "$1")]
          (add-token scanner ::token/number (edn/read-string decimal)))
        (digit? curr) (recur (advance scanner) (str num curr) float?)
        :else (add-error scanner (str "Unexpected " curr " after number " num "."))))))
;; TODO: add string interpolation
;; This still has to be devised
(defn- add-string
[scanner]
(loop [scanner scanner
@ -140,27 +159,51 @@
(case char
\newline (add-error scanner "Unterminated string.")
\" (add-token (advance scanner) ::token/string string)
\\ (recur (advance (advance scanner)) (str string (next-char scanner)))
\\ (let [next (next-char scanner)
scanner (if (= next \newline)
(update scanner ::line inc)
scanner)]
(recur (advance (advance scanner)) (str string next)))
(if (at-end? scanner)
(add-error scanner "Unterminated string.")
(recur (advance scanner) (str string char)))))))
(defn- add-word
  "Scans the remainder of a word whose first character is `char`. When a
  terminating character is reached, adds a ::token/reserved token if the
  accumulated text is in `reserved-words`, and a ::token/word token
  otherwise. A character that is neither a terminator nor a word character
  records an error."
  [char scanner]
  (loop [state scanner
         text (str char)]
    (let [c (current-char state)]
      (cond
        (terminates? c)
        (let [token-type (if (contains? reserved-words text)
                           ::token/reserved
                           ::token/word)]
          (add-token state token-type))

        (word-char? c)
        (recur (advance state) (str text c))

        :else
        (add-error state (str "Unexpected " c " after word " text "."))))))
(defn- add-ignored
  "Scans an ignored word (one that begins with an underscore). Word
  characters are accumulated onto an initial \"_\"; when a terminating
  character is reached a ::token/ignored token is added. A character that
  is neither a terminator nor a word character records an error."
  [scanner]
  (loop [scanner scanner
         ignored "_"]
    (let [char (current-char scanner)]
      (cond
        (terminates? char) (add-token scanner ::token/ignored)
        (word-char? char) (recur (advance scanner) (str ignored char))
        :else (add-error scanner (str "Unexpected " char " after word " ignored "."))))))
(defn- skip-comment
  "Advances the scanner past the rest of the current line, including the
  newline that ends it, and returns the advanced scanner. If the input
  ends before a newline is found, the scanner is returned as it stands."
  [scanner]
  (cond
    ;; Without this guard a comment on the last line of a source with no
    ;; trailing newline would never meet the stopping condition.
    (at-end? scanner) scanner
    (= \newline (current-char scanner)) (advance scanner)
    :else (recur (advance scanner))))
(defn- add-comment
  "Scans a comment whose first character is `char`, up to but not including
  the newline that ends it, and adds a token for it: ::token/docstring when
  the comment text starts with \"&&&\", ::token/comment otherwise.

  When the comment is ended by a newline the scanner's ::line is
  incremented before the token is added. When the input ends first, the
  token is added without touching ::line."
  [char scanner]
  (loop [scanner scanner
         comm (str char)]
    (let [token-type (if (s/starts-with? comm "&&&")
                       ::token/docstring
                       ::token/comment)
          curr (current-char scanner)]
      (cond
        ;; A comment on the final line may have no trailing newline; stop
        ;; here rather than looping on a character that never arrives.
        (at-end? scanner)
        (add-token scanner token-type)

        ;; NOTE(review): the newline itself is not consumed here, so it
        ;; looks like it will be scanned again as its own token -- confirm
        ;; that ::line is not being counted twice for this line break.
        (= \newline curr)
        (add-token (update scanner ::line inc) token-type)

        :else
        (recur (advance scanner) (str comm curr))))))
(defn- scan-token [scanner]
(let [char (current-char scanner)
scanner (advance scanner)
next (current-char scanner)
]
next (current-char scanner)]
(case char
;; one-character tokens
\( (add-token scanner ::token/lparen)
@ -171,15 +214,15 @@
\] (add-token scanner ::token/rbracket)
\; (add-token scanner ::token/semicolon)
\, (add-token scanner ::token/comma)
\newline (add-token scanner ::token/newline)
\newline (add-token (update scanner ::line inc) ::token/newline)
\\ (add-token scanner ::token/backslash)
;; two-character tokens
;; ->
\- (cond
(= next \>) (add-token (advance scanner) ::token/rarrow)
(digit? next) (add-number scanner)
(add-error scanner ("Expected -> or negative number. Got " char next)))
(digit? next) (add-number char scanner)
:else (add-error scanner (str "Expected -> or negative number. Got " char next)))
;; <-
\< (if (= next \-)
@ -206,34 +249,45 @@
(add-token (advance scanner) ::token/startset)
(add-error scanner (str "Expected beginning of set: ${. Got " char next)))
;; placeholder
;; TODO: add named placeholder
\_ (if (terminates? next)
(add-token scanner ::token/placeholder)
(add-word scanner))
;; placeholders
;; there's a flat _, and then ignored words
\_ (cond
(terminates? next) (add-token scanner ::token/placeholder)
(alpha? next) (add-ignored scanner)
:else (add-error scanner (str "Expected placeholder: _. Got " char next)))
;; comments
;; & starts an inline comment
;; TODO: include comments in scanned file
;; TODO: add doc comments: &&&
\& (skip-comment scanner)
\& (add-comment char scanner)
;; keywords
;; TODO: instead of a separate token, scan a whole type keyword
;; e.g. ::string, ::number
\: (cond
;;(= \: next) (add-token (advance scanner) ::token/doublecolon))
(alpha? next) (add-word scanner)
:else (add-error scanner (str "Expected keyword. Got " char next))
(alpha? next) (add-keyword scanner)
:else (add-error scanner (str "Expected keyword. Got " char next)))
;; splats
\. (let [after_next (current-char (advance scanner))]
(if (= ".." (str next after_next))
(add-token (advance (advance scanner)) ::token/splat)
(add-error scanner (str "Expected splat: ... . Got " (str "." next after_next)))))
;; strings
\" (add-string scanner)
;; word matches
(cond
(whitespace? char) scanner ;; TODO: include whitespace in scan
;; (digit? char) (add-number scanner)
;; (alpha? char) (add-word scanner)
(whitespace? char) (loop [scanner scanner ws (str char)]
(let [curr (current-char scanner)]
(if (whitespace? curr)
(recur (advance scanner) (str ws curr))
(add-token scanner ::token/ws))))
(digit? char) (add-number char scanner)
(alpha? char) (add-word char scanner)
:else (add-error scanner (str "Unexpected character: " char))))))
(defn- next-token [scanner]
@ -242,18 +296,24 @@
(defn scan
  "Scans `source` and returns a map with two entries: :tokens, the
  scanner's accumulated ::tokens, and :errors, its accumulated ::errors.

  A fresh scanner is created from `source`, then `scan-token` followed by
  `next-token` is applied repeatedly until `at-end?` reports the end of the
  input, at which point a final ::token/eof token is added."
  [source]
  (loop [scanner (new-scanner source)]
    (if (at-end? scanner)
      ;; The end-of-file token uses the ::token/ namespace like every
      ;; other token type.
      (let [scanner (add-token scanner ::token/eof)]
        {:tokens (::tokens scanner)
         :errors (::errors scanner)})
      (recur (-> scanner (scan-token) (next-token))))))
;; Scratch check: scan a short source consisting of one comment line
;; followed by an empty line.
(let [source "&&1 abc qde\n\n"]
  (scan source))
;; string scanning is (I think) working
;; line counting is not working
;; do I just save a location and then calculate line numbers if an error happens?
;; next up: numbers!