(def reserved-words "List of Ludus reserved words." ## see ludus-spec repo for more info { "as" :as ## impl "box" :box "do" :do ## impl "else" :else ## impl "false" :false ## impl -> literal word "fn" :fn ## impl "if" :if ## impl "import" :import ## impl "let" :let ## impl "loop" :loop ## impl "match" :match ## impl "nil" :nil ## impl -> literal word "ns" :ns ## impl "panic!" :panic ## impl (should _not_ be a function) "pkg" :pkg "recur" :recur ## impl "repeat" :repeat ## impl "test" :test "then" :then ## impl "true" :true ## impl -> literal word "use" :use ## wip "when" :when ## impl, replaces cond "with" :with ## impl }) (def literal-words {"true" true "false" false "nil" nil }) (defn- new-scanner "Creates a new scanner." [source input] @{:source source :input input :length (length source) :errors @[] :start 0 :current 0 :line 1 :tokens @[]}) (defn- at-end? "Tests if a scanner is at end of input." [scanner] (>= (get scanner :current) (get scanner :length))) (defn- current-char "Gets the current character of the scanner." [scanner] (let [source (get scanner :source) current (get scanner :current) length (length source)] (if (>= current length) nil (string/from-bytes (get source current))))) (defn- advance "Advances the scanner by a single character." [scanner] (update scanner :current inc)) (defn- next-char "Gets the next character from the scanner." [scanner] (let [source (get scanner :source) current (get scanner :current) next (inc current) length (length source)] (if (>= next length) nil (string/from-bytes (get source next))))) (defn- current-lexeme [scanner] (slice (get scanner :source) (get scanner :start) (get scanner :current))) (defn- char-code [char] (get char 0)) (defn- char-in-range? [start end char] (and char (>= (char-code char) (char-code start)) (<= (char-code char) (char-code end)))) (defn- digit? [c] (char-in-range? "0" "9" c)) (defn- nonzero-digit? [c] (char-in-range? "1" "9" c)) ## for now, use very basic ASCII charset in words ## TODO: research the implications of using the whole ## (defn- alpha? [c] (boolean (re-find #"\p{L}" (string c)))) (defn- alpha? [c] (or (char-in-range? "a" "z" c) (char-in-range? "A" "Z" c))) (defn- lower? [c] (char-in-range? "a" "z" c)) (defn- upper? [c] (char-in-range? "A" "Z" c)) ## legal characters in words (def word-chars {"_" true "?" true "!" true "*" true "/" true}) (defn- word-char? [c] (or (alpha? c) (digit? c) (get word-chars c))) (defn- whitespace? [c] (or (= c " ") (= c "\t"))) (def terminators { ":" true ";" true "\n" true "{" true "}" true "(" true ")" true "[" true "]" true "$" true "#" true "-" true "=" true "&" true "," true ">" true "\"" true}) (defn- terminates? [c] (or (nil? c) (whitespace? c) (get terminators c))) (defn- add-token [scanner token-type &opt literal] (update scanner :tokens array/push {:type token-type :lexeme (current-lexeme scanner) :literal literal :line (get scanner :line) :start (get scanner :start) :source (get scanner :source) :input (get scanner :input)})) ## TODO: errors should also be in the vector of tokens ## The goal is to be able to be able to hand this to an LSP? ## Do we need a different structure (defn- add-error [scanner msg] (let [token {:type :error :lexeme (current-lexeme scanner) :literal nil :line (get scanner :line) :start (get scanner :start) :source (get scanner :source) :input (get scanner :input) :msg msg}] (-> scanner (update :errors array/push token) (update :tokens array/push token)))) (defn- add-keyword [scanner] (defn recur [scanner key] (let [char (current-char scanner)] (cond (terminates? char) (add-token scanner :keyword (keyword key)) (word-char? char) (recur (advance scanner) (string key char)) :else (add-error scanner (string "Unexpected " char "after keyword :" key))))) (recur scanner "")) (defn- add-pkg-kw [scanner] (defn recur [scanner key] (let [char (current-char scanner)] (cond (terminates? char) (add-token scanner :pkg-kw (keyword key)) (word-char? char) (recur (advance scanner) (string key char)) :else (add-error scanner (string "Unexpected " char " after pkg keyword :" key))))) (recur scanner "")) (defn- read-literal [lit] (-> lit parse-all first)) ### TODO: consider whether Janet's number rules are right for Ludus (defn- add-number [char scanner] (defn recur [scanner num float?] (let [curr (current-char scanner)] (cond (= curr "_") (recur (advance scanner) num float?) ## consume underscores unharmed (= curr ".") (if float? (add-error scanner (string "Unexpected second decimal point after " num ".")) (recur (advance scanner) (buffer/push num curr) true)) (terminates? curr) (add-token scanner :number (read-literal num)) (digit? curr) (recur (advance scanner) (buffer/push num curr) float?) :else (add-error scanner (string "Unexpected " curr " after number " num "."))))) (recur scanner (buffer char) false)) (def escape { "\"" "\"" "n" "\n" "{" "{" "t" "\t" "r" "\r" "\\" "\\" }) (defn- add-string [scanner] (defn recur [scanner buff interpolate?] (let [char (current-char scanner)] (case char "{" (recur (advance scanner) (buffer/push buff char) true) # allow multiline strings "\n" (recur (update (advance scanner) :line inc) (buffer/push buff char) interpolate?) "\"" (add-token (advance scanner) (if interpolate? :interpolated :string) (string buff)) "\\" (let [next (next-char scanner)] (recur (advance (advance scanner)) (buffer/push buff (get escape next next)) interpolate?)) (if (at-end? scanner) (add-error scanner "Unterminated string.") (recur (advance scanner) (buffer/push buff char) interpolate?))))) (recur scanner @"" false)) (defn- add-word [char scanner] (defn recur [scanner word] (let [curr (current-char scanner)] (cond (terminates? curr) (add-token scanner (get reserved-words (string word) :word) (get literal-words (string word) :none)) (word-char? curr) (recur (advance scanner) (buffer/push word curr)) :else (add-error scanner (string "Unexpected " curr " after word " word "."))))) (recur scanner (buffer char))) (defn- add-pkg [char scanner] (defn recur [scanner pkg] (let [curr (current-char scanner)] (cond (terminates? curr) (add-token scanner :pkg-name :none) (word-char? curr) (recur (advance scanner) (buffer/push pkg curr)) :else (add-error scanner (string "unexpected " curr " after pkg name " pkg))))) (recur scanner (buffer char))) (defn- add-ignored [scanner] (defn recur [scanner ignored] (let [char (current-char scanner)] (cond (terminates? char) (add-token scanner :ignored) (word-char? char) (recur (advance scanner) (buffer/push ignored char)) :else (add-error scanner (string "Unexpected " char " after word " ignored "."))))) (recur scanner @"_")) (defn- add-comment [char scanner] (defn recur [scanner comm] (let [char (current-char scanner)] (if (or (= "\n" char) (at-end? scanner)) scanner # for now, we don't do anything with comments; can be added later (recur (advance scanner) (buffer/push comm char))))) (recur scanner (buffer char))) (defn- scan-token [scanner] (let [char (current-char scanner) scanner (advance scanner) next (current-char scanner)] (case char ## one-character tokens ## :break is a special zero-char token before closing braces ## it makes parsing much simpler "(" (add-token scanner :lparen) ")" (add-token (add-token scanner :break) :rparen) "{" (add-token scanner :lbrace) "}" (add-token (add-token scanner :break) :rbrace) "[" (add-token scanner :lbracket) "]" (add-token (add-token scanner :break) :rbracket) ";" (add-token scanner :semicolon) "," (add-token scanner :comma) "\n" (add-token (update scanner :line inc) :newline) "\\" (add-token scanner :backslash) "=" (add-token scanner :equals) ">" (add-token scanner :pipeline) ## two-character tokens ## -> "-" (cond (= next ">") (add-token (advance scanner) :arrow) (digit? next) (add-number char scanner) :else (add-error scanner (string "Expected > or negative number after `-`. Got `" char next "`"))) ## dict #{ "#" (if (= next "{") (add-token (advance scanner) :startdict) (add-error scanner (string "Expected beginning of dict: #{. Got " char next))) ## set ${ "$" (if (= next "{") (add-token (advance scanner) :startset) (add-error scanner (string "Expected beginning of set: ${. Got " char next))) ## placeholders ## there's a flat _, and then ignored words "_" (cond (terminates? next) (add-token scanner :placeholder) (alpha? next) (add-ignored scanner) :else (add-error scanner (string "Expected placeholder: _. Got " char next))) ## comments ## & starts an inline comment "&" (add-comment char scanner) ## keywords # XXX: make sure we want only lower-only keywords ":" (cond (lower? next) (add-keyword scanner) (upper? next) (add-pkg-kw scanner) :else (add-error scanner (string "Expected keyword or pkg keyword. Got " char next))) ## splats "." (let [after_next (current-char (advance scanner))] (if (= ".." (string next after_next)) (add-token (advance scanner) :splat) (add-error scanner (string "Expected splat: ... . Got " (string "." next after_next))))) ## strings "\"" (add-string scanner) ## word matches (cond (whitespace? char) scanner ## for now just skip whitespace characters (digit? char) (add-number char scanner) (upper? char) (add-pkg char scanner) (lower? char) (add-word char scanner) :else (add-error scanner (string "Unexpected character: " char)))))) (defn- next-token [scanner] (put scanner :start (get scanner :current))) (defn scan [source &opt input] (default input :input) (defn recur [scanner] (if (at-end? scanner) (let [scanner (add-token (add-token scanner :break) :eof)] {:tokens (get scanner :tokens) :errors (get scanner :errors [])}) (recur (-> scanner (scan-token) (next-token))))) (recur (new-scanner source input))) # (comment (do (def source " -123 ") (length ((scan source) :tokens)))