diff --git a/src/ludus/scanner.janet b/src/ludus/scanner.janet new file mode 100644 index 0000000..5bb6e58 --- /dev/null +++ b/src/ludus/scanner.janet @@ -0,0 +1,325 @@ +(def reserved-words + "List of Ludus reserved words." + ## see ludus-spec repo for more info + {"as" :as ## impl for `import`# not yet for patterns + "do" :do ## impl + "else" :else ## impl + "false" :false ## impl -> literal word + "fn" :fn ## impl + "if" :if ## impl + "import" :import ## impl + "let" :let ## impl + "loop" :loop ## impl + "match" :match ## impl + "nil" :nil ## impl -> literal word + "ns" :ns ## impl + "panic!" :panic ## impl (should _not_ be a function) + "recur" :recur ## impl + "ref" :ref ## impl + "then" :then ## impl + "true" :true ## impl -> literal word + "use" :use ## wip + "with" :with ## impl + "when" :when ## impl, replaces cond + "repeat" :repeat ## syntax sugar over "loop": still unclear what this syntax could be + "test" :test + }) + +(def literal-words {"true" true + "false" false + "nil" nil + }) + +(defn- new-scanner + "Creates a new scanner." + [source input] + @{:source source + :input input + :length (length source) + :errors @[] + :start 0 + :current 0 + :line 1 + :tokens @[]}) + +(defn- at-end? + "Tests if a scanner is at end of input." + [scanner] + (>= (get scanner :current) (get scanner :length))) + +(defn- current-char + "Gets the current character of the scanner." + [scanner] + (let [source (get scanner :source) + current (get scanner :current) + length (length source)] + (if (>= current length) + nil + (string/from-bytes (get source current))))) + +(defn- advance + "Advances the scanner by a single character." + [scanner] + (update scanner :current inc)) + +(defn- next-char + "Gets the next character from the scanner." + [scanner] + (let [source (get scanner :source) + current (get scanner :current) + next (inc current) + length (length source)] + (if (>= next length) + nil + (string/from-bytes (get source current))))) + +(defn- current-lexeme + [scanner] + (slice (get scanner :source) (get scanner :start) (get scanner :current))) + +(defn- char-code [char] (get char 0)) + +(defn- char-in-range? [start end char] + (and char + (>= (char-code char) (char-code start)) + (<= (char-code char) (char-code end)))) + +(defn- digit? [c] + (char-in-range? "0" "9" c)) + +(defn- nonzero-digit? [c] + (char-in-range? "1" "9" c)) + +## for now, use very basic ASCII charset in words +## TODO: research the implications of using the whole +## (defn- alpha? [c] (boolean (re-find #"\p{L}" (string c)))) +(defn- alpha? [c] + (or (char-in-range? "a" "z" c) (char-in-range? "A" "Z" c))) + +(defn- lower? [c] (char-in-range? "a" "z" c)) + +(defn- upper? [c] (char-in-range? "A" "Z" c)) + +## legal characters in words +(def word-chars {"_" true "?" true "!" true "*" true "/" true}) + +(defn- word-char? [c] + (or (alpha? c) (digit? c) (get word-chars c))) + +(defn- whitespace? [c] + (or (= c " ") (= c "\t"))) + +(def terminators { +":" true +"#" true +"\n" true +"{" true +"}" true +"(" true +")" true +"[" true +"]" true +"$" true +"#" true +"-" true +"=" true +"&" true +"," true +">" true +# nil was in here; I don't want to be passing in literal nil to this function anyway, and "nil" is also not it... +"\"" true}) + +(defn- terminates? [c] + (or (nil? c) (whitespace? c) (get terminators c))) + +(defn- add-token + [scanner token-type &opt literal] + (update scanner :tokens array/push + {:type token-type + :lexeme (current-lexeme scanner) + :literal literal + :line (get scanner :line) + :start (get scanner :start) + :source (get scanner :source) + :start (get scanner :input)})) + +## TODO: errors should also be in the vector of tokens +## The goal is to be able to be able to hand this to an LSP? +## Do we need a different structure +(defn- add-error [scanner msg] + (let [token {:type :error + :lexeme (current-lexeme scanner) + :literal nil + :line (get scanner :line) + :start (get scanner :start) + :source (get scanner :source) + :input (get scanner :input) + :message msg}] + (-> scanner + (update :errors array/push token) + (update :tokens array/push token)))) + +(defn- add-keyword + [scanner] + (defn recur [scanner key] + (let [char (current-char scanner)] + (cond + (terminates? char) (add-token scanner :keyword (keyword key)) + (word-char? char) (recur (advance scanner) (string key char)) + :else (add-error scanner (string "Unexpected " char "after keyword :" key))))) + (recur scanner "")) + +(defn- read-literal [lit] (-> lit parse-all first)) + +## TODO: improve number parsing? +## Currently this uses Clojure's number formatting rules (since we use the EDN reader) +## These rules are here: https://cljs.github.io/api/syntax/number +(defn- add-number [char scanner] + (defn recur [scanner num float?] + (let [curr (current-char scanner)] + (cond + (= curr "_") (recur (advance scanner) num float?) ## consume underscores unharmed + (= curr ".") (if float? + (add-error scanner (string "Unexpected second decimal point after " num ".")) + (recur (advance scanner) (buffer/push num curr) true)) + (terminates? curr) (add-token scanner :number (read-literal num)) + (digit? curr) (recur (advance scanner) (buffer/push num curr) float?) + :else (add-error scanner (string "Unexpected " curr " after number " num "."))))) + (recur scanner (buffer char) false)) + +## TODO: activate string interpolation +(defn- add-string + [scanner] + (defn recur [scanner buff interpolate?] + (let [char (current-char scanner)] + (case char + "{" (recur (advance scanner) (buffer/push buff char) true) + # allow multiline strings + "\n" (recur (update (advance scanner) :line inc) (buffer/push buff char) interpolate?) + "\"" (if interpolate? + #(add-token (advance scanner) :interpolated string) + #no interpolation yet + (add-token (advance scanner) :string (string buff)) + (add-token (advance scanner) :string (string buff))) + "\\" (let [next (next-char scanner) + scanner (if (= next "\n") + (update scanner :line inc) + scanner)] + (recur (advance (advance scanner)) (buffer/push buff next) interpolate?)) + (if (at-end? scanner) + (add-error scanner "Unterminated string.") + (recur (advance scanner) (buffer/push buff char) interpolate?))))) + (recur scanner @"" false)) + +(defn- add-word + [char scanner] + (defn recur [scanner word] + (let [curr (current-char scanner)] + (cond + (terminates? curr) (add-token scanner + (get reserved-words (string word) :word) + (get literal-words (string word) :none)) + (word-char? curr) (recur (advance scanner) (buffer/push word curr)) + :else (add-error scanner (string "Unexpected " curr " after word " word "."))))) + (recur scanner (buffer char))) + +(defn- add-ignored + [scanner] + (defn recur [scanner ignored] + (let [char (current-char scanner)] + (cond + (terminates? char) (add-token scanner :ignored) + (word-char? char) (recur (advance scanner) (buffer/push ignored char)) + :else (add-error scanner (string "Unexpected " char " after word " ignored "."))))) + (recur scanner @"_")) + +(defn- add-comment [char scanner] + (defn recur [scanner comm] + (let [char (current-char scanner)] + (if (= "\n" char) + scanner # for now, we don't do anything with comments; can be added later + (recur (advance scanner) (buffer/push comm char))))) + (recur scanner (buffer char))) + +(defn- scan-token [scanner] + (let [char (current-char scanner) + scanner (advance scanner) + next (current-char scanner)] + (case char + ## one-character tokens + ## :break is a special zero-char token before closing braces + ## it makes parsing much simpler + "(" (add-token scanner :lparen) + ")" (add-token (add-token scanner :break) :rparen) + "{" (add-token scanner :lbrace) + "}" (add-token (add-token scanner :break) :rbrace) + "[" (add-token scanner :lbracket) + "]" (add-token (add-token scanner :break) :rbracket) + "#" (add-token scanner :semicolon) + "," (add-token scanner :comma) + "\n" (add-token (update scanner :line inc) :newline) + "\\" (add-token scanner :backslash) + "=" (add-token scanner :equals) + ">" (add-token scanner :pipeline) + + ## two-character tokens + ## -> + "-" (cond + (= next ">") (add-token (advance scanner) :rarrow) + (digit? next) (add-number char scanner) + :else (add-error scanner (string "Expected -> or negative number after `-`. Got `" char next "`"))) + + ## dict #{ + "#" (if (= next "{") + (add-token (advance scanner) :startdict) + (add-error scanner (string "Expected beginning of dict: #{. Got " char next))) + + ## set ${ + "$" (if (= next "{") + (add-token (advance scanner) :startset) + (add-error scanner (string "Expected beginning of set: ${. Got " char next))) + + ## placeholders + ## there's a flat _, and then ignored words + "_" (cond + (terminates? next) (add-token scanner :placeholder) + (alpha? next) (add-ignored scanner) + :else (add-error scanner (string "Expected placeholder: _. Got " char next))) + + ## comments + ## & starts an inline comment + "&" (add-comment char scanner) + + ## keywords + ":" (cond + (alpha? next) (add-keyword scanner) + :else (add-error scanner (string "Expected keyword. Got " char next))) + + ## splats + "." (let [after_next (current-char (advance scanner))] + (if (= ".." (string next after_next)) + (add-token (advance (advance scanner)) :splat) + (add-error scanner (string "Expected splat: ... . Got " (string "." next after_next))))) + + ## strings + "\"" (add-string scanner) + + ## word matches + (cond + (whitespace? char) scanner ## for now just skip whitespace characters + (digit? char) (add-number char scanner) + (upper? char) (add-word char scanner) ## no datatypes for now + (lower? char) (add-word char scanner) + :else (add-error scanner (string "Unexpected character: " char)))))) + +(defn- next-token [scanner] + (put scanner :start (get scanner :current))) + +(defn scan [source input] + (defn recur [scanner] + (if (at-end? scanner) + (let [scanner (add-token (add-token scanner :break) :eof)] + {:tokens (get scanner :tokens) + :errors (get scanner :errors)}) + (recur (-> scanner (scan-token) (next-token))))) + (recur (new-scanner source input)))