ludus/janet/myparser.janet
2024-04-06 19:12:44 -04:00

393 lines
13 KiB
Plaintext

(comment
So the thing here is that this is a much, much simpler version of the rather unweildy Clojure parser framework I wrote. This leans pretty hard on the Janet PEG framework, but is also bespoke for my purposes here.
This lets me parse not raw text, but scanned tokens, which makes parsing a much easier lift. Also, it lets me do things like insert :break tokens, which really helps manage collection types and blocks.
At current, I do need to shift to returning a data structure rather than `nil`s and `true`s. That way, I can advance the parser by a certain amount, however much a passing test-rule would consume.
Do I really need to optimize this? I don't think so: we can do that later. I could store various information on stacks.
Alternately, returning data structures may well complicate things: KISS!
The idea, however, is that Jesus: we must not have test-rule and apply-rule: combinators without a fixed advance amount can't work that way. So we have only `apply-rule`, which `advance`s the parser by however much should be consumed. BUT! We can always, and ought wherever we might, stash the origin index and roll back any changes (i.e. in `!` and `*`)
)
(defn pprint [x] (printf "%M" x))
(os/cd "janet") ### XXX: to remove, repl only
(import ./scanner :prefix "")
(defn new-parser
``
Creates a new parser from a list of tokens. A parser has three fields in addition to `:tokens`:
:i the index, in the list of tokens, of the current token
:errors an array of errors, treated like a stack
:captured an array of captures (tokens or arrays of tokens), treated like a stack.
``
[{:tokens tokens}]
@{:tokens tokens :i 0 :errors @[] :captured @[]})
(defn pop-to
"Pops items from a stack until it contains `len` items. Returns the stack."
[stack len]
(assert (>= len 0))
(while (< len (length stack)) (array/pop stack))
stack)
(defn popn
"Pops n items from a stack. Returns the stack."
[stack n]
(loop [_ :range [0 n]] (array/pop stack))
stack)
(defn current "Returns the current token." [{:tokens tokens :i i}] (get tokens i))
(defn previous "Returns the token before the current token." [{:tokens tokens :i i}] (get tokens (dec i)))
(defn next "Returns the token after the current token." [{:tokens tokens :i i}] (get tokens (inc i)))
(def add +) ### I `def` `+` later; stash this
(defn advance
"Advances the parser by `count`, or 1 if count is not supplied."
[parser] (update parser :i inc))
(defn stash
"Stashes a parser state: captures the current index, error, and capture stacks."
[{:i i :errors errs :captured cap}]
{:i i :err-len (length errs) :cap-len (length cap)})
(defn restore
"Restores the parser to a stashed state: resets :i, pops any new errors and captures off their stacks."
[parser state]
(def {:i i :err-len err-len :cap-len cap-len} state)
(def {:errors errors :captured captured} parser)
(put parser :i i)
(pop-to errors err-len)
(pop-to captured cap-len)
parser)
(defn at "Returns the token at index i." [parser i] (get (get parser :tokens) i))
(defn capture "Captures the last token." [parser] (update parser :captured array/push (previous parser)))
(defn apply-keyword
"Applies a keyword-based rule. Consumes the passing token."
[kw parser]
(def curr (current parser))
(def type (get curr :type))
(if (= kw type)
(do
(advance parser)
true)
nil))
(defn apply-table
"Applies a table-based rule."
[rule parser]
(def curr (current parser))
(def rule-fn (get rule :rule))
(rule-fn parser))
(defn apply-rule
"Applies a rule: if the rule passes, consume all passing tokens and return true. If the rule fails, do nothing, returning `nil`."
[rule parser]
(case (type rule)
:keyword (apply-keyword rule parser)
:table (apply-table rule parser)
(error (string "unknown parser type: " (type rule)))))
(defn name
"Returns the name of a rule. Use this instead of (get rule :name), since bare keywords are also valid rules."
[rule]
(case (type rule)
:keyword rule
:table (rule :name)))
(defn rename
"Renames a rule. By convention, rule names are keywords."
[rule name] (put rule :name name))
### As noted above: we should not use this
### Instead, combinators which might not advance should stash state
### These are: !, *, ...?
(defn test
"Tests a rule: returns whether a rule passes but does not consume any tokens."
[rule parser]
(def origin (stash parser))
(def result (apply-rule rule parser))
(restore parser origin)
result)
(defn !
"Not: negates a rule. If the rule passes, does nothing, returns `nil`. If the rule fails, advances the parser a single token, returns true. Works well only for single token rules."
[rule]
@{:name (keyword (string "!" (name rule)))
:rule (fn !* [parser]
(def origin (stash parser))
(def result (apply-rule rule parser))
(restore parser origin)
(if-not result
(do (advance parser) true)
nil))})
(defn <-
"Capture: if the rule matches, consumes the token and pushes it to the capture stack. If the rule does not match, does nothing."
[rule]
@{:name (keyword (string "<-" (name rule)))
:rule (fn <-* [parser]
(def origin (stash parser))
(def result (apply-rule rule parser))
(if result (capture parser))
result)})
(defn +
"Choose a rule: matches if any one of the rules is matched. Order matters: rules are evaluated in order. It's useful to put any errors last."
[& rules]
@{:name (keyword (string/join (map name rules) "+"))
:rule (fn +* [parser]
(var passing nil)
(def origin (stash parser))
(loop [rule :in rules :while (not passing)]
(restore parser origin)
(set passing (apply-rule rule parser)))
(if-not passing (restore parser origin))
passing)})
(defn unreachable [&] (error "reached the unreachable"))
(defmacro declare [& names]
(def bindings @[])
(loop [name :in names]
(def binding ~(var ,name @{:name ,(keyword name) :rule unreachable}))
(array/push bindings binding))
~(upscope ,;bindings))
(defmacro defp [name rule]
(if (dyn name)
~(set ,name (put ,name :rule (,rule :rule)))
~(var ,name (rename ,rule ,(keyword name)))))
(comment (defn panic
[rule &opt msg]
@{:name (keyword (string "panic-" (name rule)))
:rule (fn panic* [parser]
(print "panicking!")
(def origin (current parser))
(var passing false)
(def skipped @[])
(while (not passing)
(array/push skipped (current parser))
(advance parser)
(set passing (apply-rule rule parser)))
(print "looped")
(def the-error @{:type :error :token origin :msg msg :skipped skipped})
(array/push (parser :errors) the-error)
(array/push (parser :captured) the-error)
true
)}))
# If we wanted to be robust, we wouldn't hard-code this
# To consider: use a stack of "panic-until" on a parser to dynamically decide how far to panic
(defp stop (+ :newline :semicolon :break))
(defn panic
"Panics the parser, consuming all tokens until the rule matches (including the match). It also adds an error node to both the capture and the error stacks."
[parser expected]
(print "panic! in the parser")
(def origin (current parser))
(var passing false)
(def skipped @[])
(while (not passing)
(array/push skipped (current parser))
(advance parser)
(set passing (apply-rule stop parser))
(print "phew; I'm done panicking")
(pprint (current parser))
(def the-error {:type :error :expected expected :token origin :skipped skipped})
(array/push (parser :errors) the-error)
(array/push (parser :captured) the-error)
(error parser)))
(defn *
"Sequences rules: matches if all rules are matched, in sequence."
[& rules]
@{:name (keyword (string/join (map name rules) "*"))
:rule (fn ** [parser]
(print "applying sequence")
(var passing true)
(def origin (stash parser))
(pprint origin)
(var failing nil)
(var passing true)
(loop [rule :in rules :while passing]
(def pass? (apply-rule rule parser))
(when (not pass?)
(set failing rule)
(set passing false)))
# this isn't the right thing to do: don't back out, panic!
# (when (not passing) (restore parser origin))
(if passing
passing
(panic parser (string "expected " (name failing)))))})
## next problem: where do we back out to, and how?
## the answer: an "anchor" point, where we `(try) something, and if it panics, start up with another line; in a block it will be line; in a script it will be toplevel`
## maybe terms in a tuple, list, dict
## it seems noteworthy that the things we want to return to are all either "any" or "some", and all either end with a newline or a break
## they're not all either any or some (empty data structures, but lines in blocks and scripts)
## also: there's a finite number of possibilities for any of them:
## another well-formed term, or the end of the item
## consider:
(comment
(defp tuple-term (nonbinding (some separator)))
(defp tuple-expr (order-1 (<- :lparen) (any separator) (any! tuple-term separator) (:rparen)))
)
## the tuple-term can be the anchor point
## also, in this case, we panic until a *separator*, not a *terminator*
## that suggests we need a panic-until parameter, rather than the hardcoded stop
## is this right?
(comment (defn order-1 [& rules]
@{:name (keyword (string/join (map name rules) "*"))
:rule (fn order-1 [parser]
(def result (test (first rules) parser))
(if result
(apply-rule (order-0 ;rules)) ### compile error: unknown symbol order-0
nil))}))
(defn capture-group
"Takes a parser and an origin state: takes everything captured since the origin, gathers it up in a single array, pops it all off the capture stack, and pushes the gathered captures back on the stack."
[parser origin]
(def from (get origin :cap-len))
(def captured (get parser :captured))
(def grouped (array/slice captured from -1))
(pop-to captured from)
(array/push captured grouped)
captured)
(defn &
"Groups rules: pops all captured patterns and pushes them on the captured stack as a single array."
[rule]
@{:name (keyword (string "&" (name rule)))
:rule (fn &* [parser]
(def origin (stash parser))
(def result (apply-rule rule parser))
(if result
(do
(capture-group parser origin)
true)
(do
(restore parser origin)
nil)))})
(defn fn-name
"Returns the name of a function."
[f]
(def rep (string f))
(string/slice rep 10 -2))
(defn /
"Substitution: takes a rule and a function, and applies the function to any captures arising from the rule; the captures are individual parameters to the function."
[rule f]
@{:name (keyword (string (name rule) "/" (fn-name f)))
:rule (fn /* [parser]
(def origin (stash parser))
(def captured (get parser :captured))
(def result (apply-rule (& rule) parser))
(if result
(do
(def grouped (array/pop captured))
(def applied (f ;grouped))
(array/push captured applied)
true)
(do
(restore parser origin)
nil)))})
(defn any
"Matches zero or more instances of a rule."
[rule]
@{:name (keyword (string "any-" (name rule)))
:rule (fn any* [parser]
(var result true)
(while result
(set result (apply-rule rule parser)))
true)})
(defn some
"Matches one or more instances of a rule."
[rule]
@{:name (keyword (string "some-" (name rule)))
:rule (fn some* [parser]
(def origin (stash parser))
(def result (apply-rule rule parser))
(if result
(do
(apply-rule (any rule) parser)
true)
(do
(restore parser origin)
nil)))})
(defn parse [tokens rule]
(def parser (new-parser tokens))
(try
(do (apply-rule rule parser) parser)
([err] err)))
(upscope #XXX for easier repl use
(defn literal->ast [token] {:type (get token :type) :data (get token :literal) :token token})
(defn word->ast [token] {:type (get token :type) :data (get token :lexeme) :token token})
(defn tuple->ast [origin & tokens] {:type :tuple :data tokens :token origin})
(defn if->ast [origin & tokens] {:type :if :data tokens :token origin})
(declare expression simple nonbinding)
(defp separator (some (+ :newline :comma :break)))
(defp separators? (any (+ :newline :comma :break)))
(defp terminator (some (+ :newline :semicolon :break)))
(defp terminators? (any (+ :newline :semicolon :break)))
(defp nls? (any :newline))
(defp atom (/ (<- (+ :nil :true :false :number :keyword :string)) literal->ast))
(defp word (/ (<- :word) word->ast))
(defp tuple-term (* nonbinding separator))
(defp tuple-literal (/ (* (<- :lparen) separators? (any tuple-term) :rparen) tuple->ast))
(defp if-expr (/ (* (<- :if) simple nls? :then nonbinding nls? :else nonbinding) if->ast))
# (defp root (<- (+ :word :keyword)))
# (defp follow (<- (+ :keyword :arg-tuple)))
# (defp synthetic (& (* root (some follow))))
(defp simple (+ atom word tuple-literal))
(defp nonbinding (+ atom word tuple-literal))
(defp expression (+ atom word tuple-literal))
(defp line (* expression (some terminator)))
# (defp toplevel)
(defp script (some line))
(def source
``
if (:)foo then :bar else :baz
``
)
(-> (scan source) (parse if-expr) pprint)
)
(comment
Okay, so I'm trying to figure out how to manage panics in this declarative (not recursive-descent) parser.
The question is how to
)