ludus/janet/scanner.janet

356 lines
11 KiB
Plaintext
Raw Permalink Normal View History

2024-01-08 01:10:16 +00:00
(def reserved-words
  "Maps every Ludus reserved word to the token type the scanner emits for it."
  ## see ludus-spec repo for more info
  ## entries are listed alphabetically; the trailing notes record implementation status
  {"as" :as ## impl
   "box" :ref ## note the token type is :ref, not :box
   "do" :do ## impl
   "else" :else ## impl
   "false" :false ## impl -> literal word
   "fn" :fn ## impl
   "if" :if ## impl
   "import" :import ## impl
   "let" :let ## impl
   "loop" :loop ## impl
   "match" :match ## impl
   "nil" :nil ## impl -> literal word
   "ns" :ns ## impl
   "panic!" :panic ## impl (should _not_ be a function)
   "pkg" :pkg
   "recur" :recur ## impl
   "repeat" :repeat ## syntax sugar over "loop": still unclear what this syntax could be
   "test" :test
   "then" :then ## impl
   "true" :true ## impl -> literal word
   "use" :use ## wip
   "when" :when ## impl, replaces cond
   "with" :with ## impl
   })
## Words whose token carries a literal value, keyed by their source text.
## NOTE(review): Janet structs appear to drop entries whose value is nil, so
## the "nil" entry is presumably absent at runtime -- confirm before relying on it.
(def literal-words
  (struct
    "true" true
    "false" false
    "nil" nil))
(defn- new-scanner
  "Builds a fresh scanner table for `source`, tagged with `input`.
  The cursor (:start and :current) begins at 0, :line at 1, and the
  :tokens and :errors arrays start out empty."
  [source input]
  (table
    :source source
    :input input
    :length (length source)
    :start 0
    :current 0
    :line 1
    :tokens (array)
    :errors (array)))
(defn- at-end?
  "True once the scanner's :current index has reached or passed its :length."
  [scanner]
  (let [{:current pos :length total} scanner]
    (>= pos total)))
(defn- current-char
  "Returns the byte at the scanner's :current index as a one-byte string,
  or nil when :current is at or past the end of :source."
  [scanner]
  (def src (get scanner :source))
  (def pos (get scanner :current))
  (unless (>= pos (length src))
    (string/from-bytes (get src pos))))
(defn- advance
  "Moves the scanner's :current index forward by one, mutating the scanner
  table in place, and returns that same table."
  [scanner]
  (put scanner :current (inc (get scanner :current))))
(defn- next-char
  "Returns the byte one past the scanner's :current index as a one-byte
  string, without moving the scanner; nil when that index is past the end
  of :source."
  [scanner]
  (def src (get scanner :source))
  (def peek-at (inc (get scanner :current)))
  (unless (>= peek-at (length src))
    (string/from-bytes (get src peek-at))))
2024-01-08 01:10:16 +00:00
(defn- current-lexeme
  "Returns the slice of :source between the scanner's :start (inclusive)
  and :current (exclusive)."
  [scanner]
  (let [{:source src :start from :current to} scanner]
    (slice src from to)))
(defn- char-code
  "Returns the first byte of `char` as an integer, or nil when `char` is empty."
  [char]
  (first char))
(defn- char-in-range?
  "Tests whether the first byte of `char` lies between the first bytes of
  `start` and `end`, inclusive. A nil `char` yields nil."
  [start end char]
  (and char
       (<= (char-code start) (char-code char) (char-code end))))
(defn- digit?
  "Tests whether `c` is one of the ASCII digits 0 through 9."
  [c]
  (char-in-range? "0" "9" c))
(defn- nonzero-digit?
  "Tests whether `c` is one of the ASCII digits 1 through 9."
  [c]
  (char-in-range? "1" "9" c))
## for now, use very basic ASCII charset in words
## TODO: research the implications of using the whole Unicode letter class (\p{L}) in words
## (defn- alpha? [c] (boolean (re-find #"\p{L}" (string c))))
(defn- alpha?
  "Tests whether `c` is an ASCII letter; lowercase is checked first, then uppercase."
  [c]
  (or
    (char-in-range? "a" "z" c)
    (char-in-range? "A" "Z" c)))
(defn- lower?
  "Tests whether `c` is an ASCII lowercase letter (a through z)."
  [c]
  (char-in-range? "a" "z" c))
(defn- upper?
  "Tests whether `c` is an ASCII uppercase letter (A through Z)."
  [c]
  (char-in-range? "A" "Z" c))
## legal characters in words
## Non-alphanumeric characters that may appear inside a word.
(def word-chars
  {"!" true
   "*" true
   "/" true
   "?" true
   "_" true})
(defn- word-char?
  "Tests whether `c` may appear inside a word: an ASCII letter, a digit,
  or one of the characters listed in `word-chars`."
  [c]
  (or
    (alpha? c)
    (digit? c)
    (get word-chars c)))
(defn- whitespace?
  "Tests whether `c` is a space or a tab. Newlines are not whitespace here."
  [c]
  (case c
    " " true
    "\t" true
    false))
## Characters that end a word, number, or keyword (looked up by `terminates?`).
(def terminators
  {## separators and line ends
   ":" true
   ";" true
   "," true
   "\n" true
   ## brackets
   "(" true
   ")" true
   "[" true
   "]" true
   "{" true
   "}" true
   ## characters that begin other tokens
   "$" true
   "#" true
   "-" true
   "=" true
   "&" true
   ">" true
   "\"" true})
(defn- terminates?
  "Tests whether `c` ends the token being scanned: nil (as returned at end
  of input), a space or tab, or any character in `terminators`."
  [c]
  (cond
    (nil? c) true
    (whitespace? c) true
    (get terminators c)))
(defn- add-token
  "Pushes a token of `token-type` onto the scanner's :tokens array and
  returns the scanner. The token records the current lexeme, the optional
  `literal`, and the scanner's :line, :start, :source and :input."
  [scanner token-type &opt literal]
  (def token
    {:type token-type
     :lexeme (current-lexeme scanner)
     :literal literal
     :line (get scanner :line)
     :start (get scanner :start)
     :source (get scanner :source)
     :input (get scanner :input)})
  (array/push (get scanner :tokens) token)
  scanner)
2024-01-08 01:10:16 +00:00
## TODO: errors should also be in the vector of tokens
## The goal is to be able to hand this to an LSP?
## Do we need a different structure?
(defn- add-error
  "Builds an :error token carrying `msg` and the scanner's current position
  info, pushes it onto both :errors and :tokens, and returns the scanner."
  [scanner msg]
  (def token
    {:type :error
     :lexeme (current-lexeme scanner)
     :literal nil
     :line (get scanner :line)
     :start (get scanner :start)
     :source (get scanner :source)
     :input (get scanner :input)
     :msg msg})
  ## the same token goes into both arrays: errors first, then tokens
  (array/push (get scanner :errors) token)
  (array/push (get scanner :tokens) token)
  scanner)
(defn- add-keyword
  "Scans the name of a keyword, starting at the scanner's current character.
  Word characters are accumulated until a terminator is reached, at which
  point a :keyword token is added whose literal is the keyword itself.
  Any other character adds an error token instead. Returns the scanner."
  [scanner]
  (defn recur [scanner key]
    (let [char (current-char scanner)]
      (cond
        (terminates? char) (add-token scanner :keyword (keyword key))
        (word-char? char) (recur (advance scanner) (string key char))
        ## the space before "after" was missing, which glued the offending
        ## character onto the rest of the message
        :else (add-error scanner (string "Unexpected " char " after keyword :" key)))))
  (recur scanner ""))
2024-05-20 22:04:24 +00:00
(defn- add-pkg-kw
  "Scans the name of a pkg keyword, starting at the scanner's current
  character. Word characters are accumulated until a terminator is reached,
  then a :pkg-kw token is added with the keyword as its literal. Any other
  character adds an error token instead. Returns the scanner."
  [scanner]
  (var key "")
  (var char (current-char scanner))
  ## consume word characters; `advance` mutates the scanner in place
  (while (and (not (terminates? char)) (word-char? char))
    (set key (string key char))
    (advance scanner)
    (set char (current-char scanner)))
  (if (terminates? char)
    (add-token scanner :pkg-kw (keyword key))
    (add-error scanner (string "Unexpected " char " after pkg keyword :" key))))
2024-01-08 01:10:16 +00:00
(defn- read-literal
  "Parses `lit` as Janet source and returns the first parsed value."
  [lit]
  (first (parse-all lit)))
### TODO: consider whether Janet's number rules are right for Ludus
2024-01-08 01:10:16 +00:00
(defn- add-number
  "Scans the rest of a number whose first character `char` has already been
  consumed. Underscores are skipped, at most one decimal point is accepted,
  and a terminator ends the number, adding a :number token whose literal is
  the parsed value. Anything else adds an error token. Returns the scanner."
  [char scanner]
  (def num (buffer char))
  (var float? false)
  (var result nil)
  ## loop until one branch produces the scanner to return
  (while (nil? result)
    (def curr (current-char scanner))
    (cond
      ## consume underscores unharmed
      (= curr "_") (advance scanner)
      (= curr ".") (if float?
                     (set result (add-error scanner (string "Unexpected second decimal point after " num ".")))
                     (do
                       (advance scanner)
                       (buffer/push num curr)
                       (set float? true)))
      (terminates? curr) (set result (add-token scanner :number (read-literal num)))
      (digit? curr) (do
                      (advance scanner)
                      (buffer/push num curr))
      (set result (add-error scanner (string "Unexpected " curr " after number " num ".")))))
  result)
2024-06-14 19:25:05 +00:00
## Maps the character following a backslash in a string to the text it stands for.
(def escape
  {"n" "\n"
   "t" "\t"
   "r" "\r"
   "\"" "\""
   "\\" "\\"
   "{" "{"})
2024-01-08 01:10:16 +00:00
(defn- add-string
  "Scans a string whose opening quote has already been consumed.
  Adds a :string token, or an :interpolated token if an unescaped `{` was
  seen, whose literal is the string's contents with escapes resolved.
  Running out of input before the closing quote adds an
  \"Unterminated string.\" error token. Returns the scanner."
  [scanner]
  (defn recur [scanner buff interpolate?]
    (let [char (current-char scanner)]
      (case char
        "{" (recur (advance scanner) (buffer/push buff char) true)
        ## allow multiline strings
        "\n" (recur (update (advance scanner) :line inc) (buffer/push buff char) interpolate?)
        "\"" (add-token (advance scanner) (if interpolate? :interpolated :string) (string buff))
        "\\" (let [next (next-char scanner)]
               (if (nil? next)
                 ## a backslash as the last byte of input has nothing to
                 ## escape; pushing nil onto the buffer would raise, so step
                 ## past the backslash and report the string as unterminated
                 (add-error (advance scanner) "Unterminated string.")
                 (do
                   ## an escaped newline still moves to a new source line,
                   ## just like the raw-newline branch above
                   (when (= next "\n") (update scanner :line inc))
                   (recur
                     (advance (advance scanner))
                     ## unknown escapes fall back to the escaped character itself
                     (buffer/push buff (get escape next next))
                     interpolate?))))
        (if (at-end? scanner)
          (add-error scanner "Unterminated string.")
          (recur (advance scanner) (buffer/push buff char) interpolate?)))))
  (recur scanner @"" false))
(defn- add-word
  "Scans the rest of a word whose first character `char` has already been
  consumed. At a terminator, adds a token whose type comes from
  `reserved-words` (defaulting to :word) and whose literal comes from
  `literal-words` (defaulting to :none). Any character that is neither a
  terminator nor a word character adds an error token. Returns the scanner."
  [char scanner]
  (def word (buffer char))
  (var curr (current-char scanner))
  ## consume word characters; `advance` mutates the scanner in place
  (while (and (not (terminates? curr)) (word-char? curr))
    (buffer/push word curr)
    (advance scanner)
    (set curr (current-char scanner)))
  (if (terminates? curr)
    (let [text (string word)]
      (add-token scanner
                 (get reserved-words text :word)
                 (get literal-words text :none)))
    (add-error scanner (string "Unexpected " curr " after word " word "."))))
2024-05-10 18:29:12 +00:00
(defn- add-pkg
  "Scans the rest of a pkg name whose first character `char` has already
  been consumed. At a terminator, adds a :pkg-name token with literal :none.
  Any character that is neither a terminator nor a word character adds an
  error token. Returns the scanner."
  [char scanner]
  ## the accumulated name is only used in the error message
  (def pkg (buffer char))
  (var curr (current-char scanner))
  (while (and (not (terminates? curr)) (word-char? curr))
    (buffer/push pkg curr)
    (advance scanner)
    (set curr (current-char scanner)))
  (if (terminates? curr)
    (add-token scanner :pkg-name :none)
    (add-error scanner (string "unexpected " curr " after pkg name " pkg))))
2024-01-08 01:10:16 +00:00
(defn- add-ignored
  "Scans the rest of an ignored word (one that begins with an underscore,
  already consumed). At a terminator, adds an :ignored token. Any character
  that is neither a terminator nor a word character adds an error token.
  Returns the scanner."
  [scanner]
  ## the accumulated text is only used in the error message
  (def ignored (buffer "_"))
  (var char (current-char scanner))
  (while (and (not (terminates? char)) (word-char? char))
    (buffer/push ignored char)
    (advance scanner)
    (set char (current-char scanner)))
  (if (terminates? char)
    (add-token scanner :ignored)
    (add-error scanner (string "Unexpected " char " after word " ignored "."))))
(defn- add-comment
  "Skips a comment: advances the scanner until the current character is a
  newline or input is exhausted, leaving any newline unconsumed. No token
  is added. Returns the scanner."
  [char scanner]
  ## the comment text is collected but not used yet
  (def comm (buffer char))
  (while (not (or (= "\n" (current-char scanner)) (at-end? scanner)))
    (buffer/push comm (current-char scanner))
    (advance scanner))
  ## for now, we don't do anything with comments; can be added later
  scanner)
(defn- scan-token
  "Scans one token. Consumes the scanner's current character and dispatches
  on it, looking at the following character where a token needs two or more
  characters. Adds a token or an error token; whitespace and comments add
  nothing. Returns the scanner."
  [scanner]
  (def head (current-char scanner))
  (def sc (advance scanner))
  (def peek (current-char sc))
  ## :break is a special zero-char token before closing braces
  ## it makes parsing much simpler
  (defn closing [token-type]
    (add-token (add-token sc :break) token-type))
  (case head
    ## one-character tokens
    "(" (add-token sc :lparen)
    ")" (closing :rparen)
    "{" (add-token sc :lbrace)
    "}" (closing :rbrace)
    "[" (add-token sc :lbracket)
    "]" (closing :rbracket)
    ";" (add-token sc :semicolon)
    "," (add-token sc :comma)
    ## the line counter is bumped before the :newline token is recorded
    "\n" (add-token (update sc :line inc) :newline)
    "\\" (add-token sc :backslash)
    "=" (add-token sc :equals)
    ">" (add-token sc :pipeline)

    ## two-character tokens
    ## -> (or the sign of a negative number)
    "-" (cond
          (= peek ">") (add-token (advance sc) :arrow)
          (digit? peek) (add-number head sc)
          (add-error sc (string "Expected > or negative number after `-`. Got `" head peek "`")))

    ## dict #{
    "#" (if (= peek "{")
          (add-token (advance sc) :startdict)
          (add-error sc (string "Expected beginning of dict: #{. Got " head peek)))

    ## set ${
    "$" (if (= peek "{")
          (add-token (advance sc) :startset)
          (add-error sc (string "Expected beginning of set: ${. Got " head peek)))

    ## placeholders
    ## there's a flat _, and then ignored words
    "_" (cond
          (terminates? peek) (add-token sc :placeholder)
          (alpha? peek) (add-ignored sc)
          (add-error sc (string "Expected placeholder: _. Got " head peek)))

    ## comments
    ## & starts an inline comment
    "&" (add-comment head sc)

    ## keywords
    ## XXX: make sure we want only lower-only keywords
    ":" (cond
          (lower? peek) (add-keyword sc)
          (upper? peek) (add-pkg-kw sc)
          (add-error sc (string "Expected keyword or pkg keyword. Got " head peek)))

    ## splats
    ## `advance` mutates the scanner, so reading the third character moves
    ## the cursor whether or not the splat is well formed
    "." (let [third (current-char (advance sc))]
          (if (and (= peek ".") (= third "."))
            (add-token (advance sc) :splat)
            (add-error sc (string "Expected splat: ... . Got " "." peek third))))

    ## strings
    "\"" (add-string sc)

    ## word matches
    (cond
      ## for now just skip whitespace characters
      (whitespace? head) sc
      (digit? head) (add-number head sc)
      (upper? head) (add-pkg head sc)
      (lower? head) (add-word head sc)
      (add-error sc (string "Unexpected character: " head)))))
(defn- next-token
  "Marks the beginning of the next token by setting the scanner's :start
  to its :current index. Returns the scanner."
  [scanner]
  (let [{:current pos} scanner]
    (put scanner :start pos)))
2024-01-19 21:50:01 +00:00
(defn scan
  "Scans `source` into tokens. `input` tags every token and defaults to
  :input. Returns a struct with :tokens (ending in a :break token followed
  by an :eof token) and :errors."
  [source &opt input]
  (default input :input)
  (var scanner (new-scanner source input))
  ## scan one token at a time, resetting :start after each
  (while (not (at-end? scanner))
    (set scanner (next-token (scan-token scanner))))
  (def finished (add-token (add-token scanner :break) :eof))
  {:tokens (get finished :tokens)
   :errors (get finished :errors [])})
2024-01-19 21:50:01 +00:00
2024-06-14 19:25:05 +00:00
## REPL scratch area: swap `comment` for `do` to run the scanner on a sample.
(comment
  # (do
  (def source "add 1 2 () four")
  (scan source)
  )