ludus/janet/myparser.janet
2024-04-28 18:13:49 -04:00

394 lines
13 KiB
Plaintext

(comment
So the thing here is that this is a much, much simpler version of the rather unweildy Clojure parser framework I wrote. This leans pretty hard on the Janet PEG framework, but is also bespoke for my purposes here.
This lets me parse not raw text, but scanned tokens, which makes parsing a much easier lift. Also, it lets me do things like insert :break tokens, which really helps manage collection types and blocks.
At current, I do need to shift to returning a data structure rather than `nil`s and `true`s. That way, I can advance the parser by a certain amount, however much a passing test-rule would consume.
Do I really need to optimize this? I don't think so: we can do that later. I could store various information on stacks.
Alternately, returning data structures may well complicate things: KISS!
The idea, however, is that Jesus: we must not have test-rule and apply-rule: combinators without a fixed advance amount can't work that way. So we have only `apply-rule`, which `advance`s the parser by however much should be consumed. BUT! We can always, and ought wherever we might, stash the origin index and roll back any changes (i.e. in `!` and `*`)
)
(defn pprint [x] (printf "%M" x))
(os/cd "janet") ### XXX: to remove, repl only
(import ./scanner :prefix "")
(defn new-parser
``
Creates a new parser from a list of tokens. A parser has three fields in addition to `:tokens`:
:i the index, in the list of tokens, of the current token
:errors an array of errors, treated like a stack
:captured an array of captures (tokens or arrays of tokens), treated like a stack.
``
[{:tokens tokens}]
@{:tokens tokens :i 0 :errors @[] :captured @[]})
(defn pop-to
"Pops items from a stack until it contains `len` items. Returns the stack."
[stack len]
(assert (>= len 0))
(while (< len (length stack)) (array/pop stack))
stack)
(defn popn
"Pops n items from a stack. Returns the stack."
[stack n]
(loop [_ :range [0 n]] (array/pop stack))
stack)
(defn current "Returns the current token." [{:tokens tokens :i i}] (get tokens i))
(defn previous "Returns the token before the current token." [{:tokens tokens :i i}] (get tokens (dec i)))
(defn next "Returns the token after the current token." [{:tokens tokens :i i}] (get tokens (inc i)))
(def add +) ### I `def` `+` later; stash this
(defn advance
"Advances the parser by `count`, or 1 if count is not supplied."
[parser] (update parser :i inc))
(defn stash
"Stashes a parser state: captures the current index, error, and capture stacks."
[{:i i :errors errs :captured cap}]
{:i i :err-len (length errs) :cap-len (length cap)})
(defn restore
"Restores the parser to a stashed state: resets :i, pops any new errors and captures off their stacks."
[parser state]
(def {:i i :err-len err-len :cap-len cap-len} state)
(def {:errors errors :captured captured} parser)
(put parser :i i)
(pop-to errors err-len)
(pop-to captured cap-len)
parser)
(defn at "Returns the token at index i." [parser i] (get (get parser :tokens) i))
(defn capture "Captures the last token." [parser] (update parser :captured array/push (previous parser)))
(defn apply-keyword
"Applies a keyword-based rule. Consumes the passing token."
[kw parser]
(def curr (current parser))
(def type (get curr :type))
(if (= kw type)
(do
(advance parser)
true)
nil))
(defn apply-table
"Applies a table-based rule."
[rule parser]
(def curr (current parser))
(def rule-fn (get rule :rule))
(rule-fn parser))
(defn apply-rule
"Applies a rule: if the rule passes, consume all passing tokens and return true. If the rule fails, do nothing, returning `nil`."
[rule parser]
(case (type rule)
:keyword (apply-keyword rule parser)
:table (apply-table rule parser)
(error (string "unknown parser type: " (type rule)))))
(defn name
"Returns the name of a rule. Use this instead of (get rule :name), since bare keywords are also valid rules."
[rule]
(case (type rule)
:keyword rule
:table (rule :name)))
(defn rename
"Renames a rule. By convention, rule names are keywords."
[rule name] (put rule :name name))
### As noted above: we should not use this
### Instead, combinators which might not advance should stash state
### These are: !, *, ...?
(defn test
"Tests a rule: returns whether a rule passes but does not consume any tokens."
[rule parser]
(def origin (stash parser))
(def result (apply-rule rule parser))
(restore parser origin)
result)
(defn !
"Not: negates a rule. If the rule passes, does nothing, returns `nil`. If the rule fails, advances the parser a single token, returns true. Works well only for single token rules."
[rule]
@{:name (keyword (string "!" (name rule)))
:rule (fn !* [parser]
(def origin (stash parser))
(def result (apply-rule rule parser))
(restore parser origin)
(if-not result
(do (advance parser) true)
nil))})
(defn <-
"Capture: if the rule matches, consumes the token and pushes it to the capture stack. If the rule does not match, does nothing."
[rule]
@{:name (keyword (string "<-" (name rule)))
:rule (fn <-* [parser]
(def origin (stash parser))
(def result (apply-rule rule parser))
(if result (capture parser))
result)})
(defn +
"Choose a rule: matches if any one of the rules is matched. Order matters: rules are evaluated in order. It's useful to put any errors last."
[& rules]
@{:name (keyword (string/join (map name rules) "+"))
:rule (fn +* [parser]
(var passing nil)
(def origin (stash parser))
(loop [rule :in rules :while (not passing)]
(restore parser origin)
(set passing (apply-rule rule parser)))
(if-not passing (restore parser origin))
passing)})
(defn unreachable [&] (error "reached the unreachable"))
(defmacro declare [& names]
(def bindings @[])
(loop [name :in names]
(def binding ~(var ,name @{:name ,(keyword name) :rule unreachable}))
(array/push bindings binding))
~(upscope ,;bindings))
(defmacro defp [name rule]
(if (dyn name)
~(set ,name (put ,name :rule (,rule :rule)))
~(var ,name (rename ,rule ,(keyword name)))))
(comment (defn panic
[rule &opt msg]
@{:name (keyword (string "panic-" (name rule)))
:rule (fn panic* [parser]
(print "panicking!")
(def origin (current parser))
(var passing false)
(def skipped @[])
(while (not passing)
(array/push skipped (current parser))
(advance parser)
(set passing (apply-rule rule parser)))
(print "looped")
(def the-error @{:type :error :token origin :msg msg :skipped skipped})
(array/push (parser :errors) the-error)
(array/push (parser :captured) the-error)
true
)}))
# If we wanted to be robust, we wouldn't hard-code this
# To consider: use a stack of "panic-until" on a parser to dynamically decide how far to panic
(defp stop (+ :newline :semicolon :break))
(defn panic
"Panics the parser, consuming all tokens until the rule matches (including the match). It also adds an error node to both the capture and the error stacks."
[parser expected]
(print "panic! in the parser")
(def origin (current parser))
(var passing false)
(def skipped @[])
(while (not passing)
(array/push skipped (current parser))
(advance parser)
(set passing (apply-rule stop parser))
(print "phew; I'm done panicking")
(pprint (current parser))
(def the-error {:type :error :expected expected :token origin :skipped skipped})
(array/push (parser :errors) the-error)
(array/push (parser :captured) the-error)
(error parser)))
(defn *
"Sequences rules: matches if all rules are matched, in sequence."
[& rules]
@{:name (keyword (string/join (map name rules) "*"))
:rule (fn ** [parser]
(print "applying sequence")
(var passing true)
(def origin (stash parser))
(pprint origin)
(var failing nil)
(var passing true)
(loop [rule :in rules :while passing]
(def pass? (apply-rule rule parser))
(when (not pass?)
(set failing rule)
(set passing false)))
# this isn't the right thing to do: don't back out, panic!
# (when (not passing) (restore parser origin))
(if passing
passing
(panic parser (string "expected " (name failing)))))})
## next problem: where do we back out to, and how?
## the answer: an "anchor" point, where we `(try) something, and if it panics, start up with another line; in a block it will be line; in a script it will be toplevel`
## maybe terms in a tuple, list, dict
## it seems noteworthy that the things we want to return to are all either "any" or "some", and all either end with a newline or a break
## they're not all either any or some (empty data structures, but lines in blocks and scripts)
## also: there's a finite number of possibilities for any of them:
## another well-formed term, or the end of the item
## consider:
(comment
(defp tuple-term (nonbinding (some separator)))
(defp tuple-expr (order-1 (<- :lparen) (any separator) (any! tuple-term separator) (:rparen)))
)
## the tuple-term can be the anchor point
## also, in this case, we panic until a *separator*, not a *terminator*
## that suggests we need a panic-until parameter, rather than the hardcoded stop
## is this right?
(comment (defn order-1 [& rules]
@{:name (keyword (string/join (map name rules) "*"))
:rule (fn order-1 [parser]
(def result (test (first rules) parser))
(if result
(apply-rule (order-0 ;rules)) ### compile error: unknown symbol order-0
nil))}))
(defn capture-group
"Takes a parser and an origin state: takes everything captured since the origin, gathers it up in a single array, pops it all off the capture stack, and pushes the gathered captures back on the stack."
[parser origin]
(def from (get origin :cap-len))
(def captured (get parser :captured))
(def grouped (array/slice captured from -1))
(pop-to captured from)
(array/push captured grouped)
captured)
(defn &
"Groups rules: pops all captured patterns and pushes them on the captured stack as a single array."
[rule]
@{:name (keyword (string "&" (name rule)))
:rule (fn &* [parser]
(def origin (stash parser))
(def result (apply-rule rule parser))
(if result
(do
(capture-group parser origin)
true)
(do
(restore parser origin)
nil)))})
(defn fn-name
"Returns the name of a function."
[f]
(def rep (string f))
(string/slice rep 10 -2))
(defn /
"Substitution: takes a rule and a function, and applies the function to any captures arising from the rule; the captures are individual parameters to the function."
[rule f]
@{:name (keyword (string (name rule) "/" (fn-name f)))
:rule (fn /* [parser]
(def origin (stash parser))
(def captured (get parser :captured))
(def result (apply-rule (& rule) parser))
(if result
(do
(def grouped (array/pop captured))
(def applied (f ;grouped))
(array/push captured applied)
true)
(do
(restore parser origin)
nil)))})
(defn any
"Matches zero or more instances of a rule."
[rule]
@{:name (keyword (string "any-" (name rule)))
:rule (fn any* [parser]
(var result true)
(while result
(set result (apply-rule rule parser)))
true)})
(defn some
"Matches one or more instances of a rule."
[rule]
@{:name (keyword (string "some-" (name rule)))
:rule (fn some* [parser]
(def origin (stash parser))
(def result (apply-rule rule parser))
(if result
(do
(apply-rule (any rule) parser)
true)
(do
(restore parser origin)
nil)))})
(defn parse [tokens rule]
(def parser (new-parser tokens))
(try
(do (apply-rule rule parser) parser)
([err] err)))
(upscope #XXX for easier repl use
(defn literal->ast [token] {:type (get token :type) :data (get token :literal) :token token})
(defn word->ast [token] {:type (get token :type) :data (get token :lexeme) :token token})
(defn tuple->ast [origin & tokens] {:type :tuple :data tokens :token origin})
(defn if->ast [origin & tokens] {:type :if :data tokens :token origin})
(declare expression simple nonbinding)
(defp separator (some (+ :newline :comma :break)))
(defp separators? (any (+ :newline :comma :break)))
(defp terminator (some (+ :newline :semicolon :break)))
(defp terminators? (any (+ :newline :semicolon :break)))
(defp nls? (any :newline))
(defp atom (/ (<- (+ :nil :true :false :number :keyword :string)) literal->ast))
(defp word (/ (<- :word) word->ast))
(defp tuple-term (* nonbinding separator))
(defp tuple-literal (/ (* (<- :lparen) separators? (any tuple-term) :rparen) tuple->ast))
(defp if-expr (/ (* (<- :if) simple nls? :then nonbinding nls? :else nonbinding) if->ast))
# (defp root (<- (+ :word :keyword)))
# (defp follow (<- (+ :keyword :arg-tuple)))
# (defp synthetic (& (* root (some follow))))
(defp simple (+ atom word tuple-literal))
(defp nonbinding (+ atom word tuple-literal))
(defp expression (+ atom word tuple-literal))
(defp line (* expression (some terminator)))
# (defp toplevel)
(defp script (some line))
(def source
``
if (:)foo then :bar else :baz
``
)
(-> (scan source) (parse if-expr) pprint)
)
(comment
Okay, so I'm trying to figure out how to manage panics in this declarative (not recursive-descent) parser.
The question is how to do the thing where we pop out of the stack up to the most relevant rule.
Given that we may have deeply nested rules, we either want:
- A stack of rules in the parser, which
)