# Token-level parser combinators (PEG-style) over scanner output.
(comment
|
|
So the thing here is that this is a much, much simpler version of the rather unwieldy Clojure parser framework I wrote. This leans pretty hard on the Janet PEG framework, but is also bespoke for my purposes here.
|
|
|
|
This lets me parse not raw text, but scanned tokens, which makes parsing a much easier lift. Also, it lets me do things like insert :break tokens, which really helps manage collection types and blocks.
|
|
|
|
At current, I do need to shift to returning a data structure rather than `nil`s and `true`s. That way, I can advance the parser by a certain amount, however much a passing test-rule would consume.
|
|
|
|
Do I really need to optimize this? I don't think so: we can do that later. I could store various information on stacks.
|
|
|
|
Alternately, returning data structures may well complicate things: KISS!
|
|
|
|
The idea, however, is that Jesus: we must not have test-rule and apply-rule: combinators without a fixed advance amount can't work that way. So we have only `apply-rule`, which `advance`s the parser by however much should be consumed. BUT! We can always, and ought wherever we might, stash the origin index and roll back any changes (i.e. in `!` and `*`)
|
|
|
|
)
|
|
|
|
(defn pprint
  "Pretty-prints a value to stdout using Janet's %M formatter."
  [x]
  (print (string/format "%M" x)))
|
|
|
|
(os/cd "janet") ### XXX: to remove, repl only
|
|
(import ./scanner :prefix "")
|
|
|
|
(defn new-parser
  ``
  Creates a new parser from a scanner result containing `:tokens`.
  Besides `:tokens`, a parser tracks three fields:

  :i        index, into the token list, of the current token
  :errors   an array of errors, used as a stack
  :captured an array of captures (tokens or arrays of tokens), used as a stack
  ``
  [scanned]
  (def tokens (get scanned :tokens))
  @{:tokens tokens
    :i 0
    :errors @[]
    :captured @[]})
|
|
|
|
(defn pop-to
  "Shrinks a stack by popping items until its length is `target`. Returns the stack."
  [arr target]
  (assert (>= target 0))
  (while (> (length arr) target)
    (array/pop arr))
  arr)
|
|
|
|
(defn popn
  "Pops `n` items off a stack. Returns the stack."
  [arr n]
  (repeat n (array/pop arr))
  arr)
|
|
|
|
(defn current
  "Returns the token at the parser's cursor, or nil past the end."
  [parser]
  (get (parser :tokens) (parser :i)))

(defn previous
  "Returns the token just before the cursor, or nil if there is none."
  [parser]
  (get (parser :tokens) (- (parser :i) 1)))

# NOTE: shadows core `next`; intentional, as this module is its own vocabulary.
(defn next
  "Returns the token just after the cursor, or nil if there is none."
  [parser]
  (get (parser :tokens) (+ (parser :i) 1)))
|
|
|
|
(def add +) ### core addition, stashed because `+` is redefined as the choice combinator below
|
|
|
|
(defn advance
  "Advances the parser by `count` tokens, or by 1 if `count` is not supplied."
  # BUG FIX: the docstring always promised an optional `count`, but the
  # signature accepted only `parser` and always advanced by one.
  [parser &opt count]
  (default count 1)
  # `+` here is core addition: this defn compiles before `+` is shadowed
  # by the choice combinator later in the file.
  (update parser :i + count))
|
|
|
|
(defn stash
  "Snapshots parser state: the current index plus the lengths of the error and capture stacks."
  [parser]
  {:i (parser :i)
   :err-len (length (parser :errors))
   :cap-len (length (parser :captured))})
|
|
|
|
(defn restore
  "Rolls the parser back to a stashed state: resets :i and discards any errors and captures pushed since the stash. Returns the parser."
  [parser state]
  (put parser :i (state :i))
  (pop-to (parser :errors) (state :err-len))
  (pop-to (parser :captured) (state :cap-len))
  parser)
|
|
|
|
(defn at
  "Returns the token at index `i`, or nil if out of range."
  [parser i]
  (get (parser :tokens) i))
|
|
|
|
(defn capture
  "Pushes the most recently consumed token onto the capture stack. Returns the parser."
  [parser]
  (array/push (parser :captured) (previous parser))
  parser)
|
|
|
|
(defn apply-keyword
  "Applies a keyword rule: if the current token's :type equals `kw`, consumes it and returns true; otherwise returns nil without consuming."
  [kw parser]
  (when (= kw (get (current parser) :type))
    (advance parser)
    true))
|
|
|
|
(defn apply-table
  "Applies a table-based rule by invoking its :rule function on the parser."
  # FIX: dropped an unused local (`curr`) that fetched the current token
  # and then ignored it.
  [rule parser]
  ((rule :rule) parser))
|
|
|
|
(defn apply-rule
  "Applies a rule: on success, consumes all passing tokens and returns true (or the rule's truthy result); on failure, consumes nothing and returns nil. Throws for unknown rule representations."
  [rule parser]
  (def kind (type rule))
  (cond
    (= kind :keyword) (apply-keyword rule parser)
    (= kind :table) (apply-table rule parser)
    (error (string "unknown parser type: " kind))))
|
|
|
|
(defn name
  "Returns a rule's name. Use instead of (get rule :name), since bare keywords are themselves valid rules."
  [rule]
  (if (keyword? rule)
    rule
    (when (table? rule) (rule :name))))
|
|
|
|
(defn rename
  "Renames a rule in place and returns it. By convention, rule names are keywords."
  [rule new-name]
  (put rule :name new-name))
|
|
|
|
### As noted above: we should not use this
|
|
### Instead, combinators which might not advance should stash state
|
|
### These are: !, *, ...?
|
|
(defn test
  "Tests a rule non-destructively: returns the rule's result, but rolls back any tokens, errors, or captures it produced."
  [rule parser]
  (def before (stash parser))
  (def outcome (apply-rule rule parser))
  (restore parser before)
  outcome)
|
|
|
|
(defn !
  "Not: negates a rule. If the rule passes (or input is exhausted), does nothing and returns nil. If the rule fails, consumes a single token and returns true. Only sensible for single-token rules."
  [rule]
  @{:name (keyword (string "!" (name rule)))
    :rule (fn !* [parser]
            (def origin (stash parser))
            (def result (apply-rule rule parser))
            (restore parser origin)
            # FIX: require a token to consume. Previously, at end of input the
            # rule always failed, so `!` advanced past the end and returned
            # true indefinitely — an infinite loop inside `any`/`some`.
            (if (and (not result) (current parser))
              (do (advance parser) true)
              nil))})
|
|
|
|
(defn <-
  "Capture: if the rule matches, consumes the token and pushes it onto the capture stack. If the rule does not match, does nothing and returns nil."
  [rule]
  @{:name (keyword (string "<-" (name rule)))
    :rule (fn <-* [parser]
            # FIX: removed an unused `(stash parser)` — no rollback happens
            # here; the wrapped rule already leaves the parser untouched on
            # failure.
            (def result (apply-rule rule parser))
            (when result (capture parser))
            result)})
|
|
|
|
(defn +
  "Ordered choice: matches if any one of `rules` matches, trying them in order from a common starting point. Order matters: put any error rules last."
  [& rules]
  @{:name (keyword (string/join (map name rules) "+"))
    :rule (fn +* [parser]
            (def origin (stash parser))
            (var matched nil)
            (each rule rules
              (unless matched
                # Reset before each attempt so alternatives start clean.
                (restore parser origin)
                (set matched (apply-rule rule parser))))
            (unless matched (restore parser origin))
            matched)})
|
|
|
|
# Placeholder :rule for rules created by `declare` but not yet filled in by `defp`.
(defn unreachable [&] (error "reached the unreachable"))
|
|
|
|
(defmacro declare
  "Forward-declares parser rules as vars whose :rule throws until `defp` fills them in. Enables mutually recursive grammars."
  [& names]
  (def bindings
    (seq [n :in names]
      ~(var ,n @{:name ,(keyword n) :rule unreachable})))
  ~(upscope ,;bindings))
|
|
|
|
(defmacro defp
  # Defines (or redefines) a named parser rule. If `name` is already bound
  # (e.g. via `declare`), mutate the existing rule table in place so that
  # other rules already holding a reference to it see the update; otherwise
  # bind a new var, renaming the rule to match its symbol.
  # NOTE(review): `(dyn name)` is used here to detect an existing binding at
  # macro-expansion time — confirm this works as intended for plain vars.
  [name rule]
  (if (dyn name)
    ~(set ,name (put ,name :rule (,rule :rule)))
    ~(var ,name (rename ,rule ,(keyword name)))))
|
|
|
|
(comment (defn panic
|
|
[rule &opt msg]
|
|
@{:name (keyword (string "panic-" (name rule)))
|
|
:rule (fn panic* [parser]
|
|
(print "panicking!")
|
|
(def origin (current parser))
|
|
(var passing false)
|
|
(def skipped @[])
|
|
(while (not passing)
|
|
(array/push skipped (current parser))
|
|
(advance parser)
|
|
(set passing (apply-rule rule parser)))
|
|
(print "looped")
|
|
(def the-error @{:type :error :token origin :msg msg :skipped skipped})
|
|
(array/push (parser :errors) the-error)
|
|
(array/push (parser :captured) the-error)
|
|
true
|
|
)}))
|
|
|
|
# If we wanted to be robust, we wouldn't hard-code this
# To consider: use a stack of "panic-until" on a parser to dynamically decide how far to panic
# `stop` is the hard-coded recovery point for `panic`: the next statement boundary.
(defp stop (+ :newline :semicolon :break))
|
|
|
|
(defn panic
  "Panics the parser: consumes tokens until the `stop` rule matches (consuming the match too), records an error node on both the error and capture stacks, then throws the parser so an enclosing `try` can resume."
  # BUG FIX: the error construction and `(error parser)` were inside the
  # `while` body, so the function threw unconditionally after a single
  # iteration and never actually skipped forward to `stop`. They now run
  # after the loop, as the docstring describes. Debug prints removed.
  [parser expected]
  (def origin (current parser))
  (def skipped @[])
  (var passing false)
  # Also stop when tokens run out, so a missing terminator can't loop forever.
  (while (and (not passing) (current parser))
    (array/push skipped (current parser))
    (advance parser)
    (set passing (apply-rule stop parser)))
  (def the-error {:type :error :expected expected :token origin :skipped skipped})
  (array/push (parser :errors) the-error)
  (array/push (parser :captured) the-error)
  (error parser))
|
|
|
|
(defn *
  "Sequence: matches if all `rules` match, in order. On failure, does not roll back; instead panics forward to the next stop token, recording which rule was expected."
  # FIXES: removed a duplicate `(var passing true)` declaration, an unused
  # `(stash parser)` (rollback is deliberately not performed here), and
  # leftover debug prints.
  [& rules]
  @{:name (keyword (string/join (map name rules) "*"))
    :rule (fn ** [parser]
            (var failing nil)
            (var passing true)
            (loop [rule :in rules :while passing]
              (unless (apply-rule rule parser)
                (set failing rule)
                (set passing false)))
            # Deliberate: no restore on failure — error recovery resumes at
            # the next statement boundary via `panic`.
            (if passing
              passing
              (panic parser (string "expected " (name failing)))))})
|
|
## next problem: where do we back out to, and how?
|
|
## the answer: an "anchor" point, where we `(try) something, and if it panics, start up with another line; in a block it will be line; in a script it will be toplevel`
|
|
## maybe terms in a tuple, list, dict
|
|
## it seems noteworthy that the things we want to return to are all either "any" or "some", and all either end with a newline or a break
|
|
## they're not all either any or some (empty data structures, but lines in blocks and scripts)
|
|
## also: there's a finite number of possibilities for any of them:
|
|
## another well-formed term, or the end of the item
|
|
## consider:
|
|
(comment
|
|
(defp tuple-term (nonbinding (some separator)))
|
|
(defp tuple-expr (order-1 (<- :lparen) (any separator) (any! tuple-term separator) (:rparen)))
|
|
)
|
|
## the tuple-term can be the anchor point
|
|
## also, in this case, we panic until a *separator*, not a *terminator*
|
|
## that suggests we need a panic-until parameter, rather than the hardcoded stop
|
|
## is this right?
|
|
|
|
|
|
(comment (defn order-1 [& rules]
|
|
@{:name (keyword (string/join (map name rules) "*"))
|
|
:rule (fn order-1 [parser]
|
|
(def result (test (first rules) parser))
|
|
(if result
|
|
(apply-rule (order-0 ;rules)) ### compile error: unknown symbol order-0
|
|
nil))}))
|
|
|
|
(defn capture-group
  "Collapses every capture pushed since `origin` into one array: slices them off the capture stack, pops them, and pushes the grouped array back. Returns the capture stack."
  [parser origin]
  (def cap-stack (parser :captured))
  (def base (origin :cap-len))
  (def group (array/slice cap-stack base))
  (pop-to cap-stack base)
  (array/push cap-stack group))
|
|
|
|
(defn &
  "Group: applies `rule`, and on success collapses everything it captured into a single array on the capture stack. On failure, rolls back and returns nil."
  [rule]
  @{:name (keyword (string "&" (name rule)))
    :rule (fn &* [parser]
            (def before (stash parser))
            (if (apply-rule rule parser)
              (do
                (capture-group parser before)
                true)
              (do
                (restore parser before)
                nil)))})
|
|
|
|
(defn fn-name
  "Returns the name of a function."
  # Relies on the printed representation `<function NAME>`: drops the
  # 10-character prefix "<function " and the trailing ">".
  # NOTE(review): this misparses cfunctions (`<cfunction NAME>`) and
  # anonymous functions — confirm only named Janet functions reach here.
  [f]
  (def rep (string f))
  (string/slice rep 10 -2))
|
|
|
|
(defn /
  "Substitution: applies `rule`, then replaces its captures with the result of calling `f` on them; each capture becomes one argument to `f`. Rolls back and returns nil on failure."
  [rule f]
  @{:name (keyword (string (name rule) "/" (fn-name f)))
    :rule (fn /* [parser]
            (def before (stash parser))
            (def cap-stack (parser :captured))
            # Wrapping in `&` groups this rule's captures into one array.
            (if (apply-rule (& rule) parser)
              (do
                (def group (array/pop cap-stack))
                (array/push cap-stack (f ;group))
                true)
              (do
                (restore parser before)
                nil)))})
|
|
|
|
(defn any
  "Matches zero or more instances of `rule`. Always succeeds."
  [rule]
  @{:name (keyword (string "any-" (name rule)))
    :rule (fn any* [parser]
            (var matched true)
            (while matched
              (def before (parser :i))
              (set matched (apply-rule rule parser))
              # FIX: guard against rules that succeed without consuming input
              # (e.g. a nested `any`), which previously looped forever.
              (when (= before (parser :i))
                (set matched nil)))
            true)})
|
|
|
|
(defn some
  "Matches one or more instances of `rule`: one mandatory match followed by `any` further matches. Rolls back and returns nil if the first match fails."
  [rule]
  @{:name (keyword (string "some-" (name rule)))
    :rule (fn some* [parser]
            (def before (stash parser))
            (if (apply-rule rule parser)
              (do
                (apply-rule (any rule) parser)
                true)
              (do
                (restore parser before)
                nil)))})
|
|
|
|
(defn parse
  "Parses scanned `tokens` with `rule` and returns the parser. A `panic` throws the parser itself, so a failed parse is returned too, with its :errors stack populated."
  [tokens rule]
  (def parser (new-parser tokens))
  (try
    (do
      (apply-rule rule parser)
      parser)
    # `panic` throws the parser as the error value; hand it back.
    ([err] err)))
|
|
|
|
(upscope #XXX for easier repl use
|
|
|
|
(defn literal->ast
  "Wraps a captured literal token as an AST node carrying its parsed value."
  [token]
  {:type (get token :type) :data (get token :literal) :token token})

(defn word->ast
  "Wraps a captured word token as an AST node carrying its lexeme."
  [token]
  {:type (get token :type) :data (get token :lexeme) :token token})

(defn tuple->ast
  "Builds a :tuple AST node from its opening token and member nodes."
  [origin & tokens]
  {:type :tuple :data tokens :token origin})

(defn if->ast
  "Builds an :if AST node from its `if` token and clause nodes."
  [origin & tokens]
  {:type :if :data tokens :token origin})
|
|
|
|
# Forward declarations for the mutually recursive grammar below.
(declare expression simple nonbinding)

# Separators delimit items inside collections; terminators end lines.
# The `?` variants match zero-or-more and so always succeed.
(defp separator (some (+ :newline :comma :break)))
(defp separators? (any (+ :newline :comma :break)))
(defp terminator (some (+ :newline :semicolon :break)))
(defp terminators? (any (+ :newline :semicolon :break)))
(defp nls? (any :newline))

# Atomic values and bare words, substituted into AST nodes.
(defp atom (/ (<- (+ :nil :true :false :number :keyword :string)) literal->ast))
(defp word (/ (<- :word) word->ast))

# Tuple literals: each term is a nonbinding expression plus separators.
(defp tuple-term (* nonbinding separator))
(defp tuple-literal (/ (* (<- :lparen) separators? (any tuple-term) :rparen) tuple->ast))

# Conditionals: `if <simple> then <nonbinding> else <nonbinding>`.
(defp if-expr (/ (* (<- :if) simple nls? :then nonbinding nls? :else nonbinding) if->ast))

# (defp root (<- (+ :word :keyword)))
# (defp follow (<- (+ :keyword :arg-tuple)))
# (defp synthetic (& (* root (some follow))))

# Placeholder grammars: all three currently accept the same forms; they
# will diverge as the grammar grows.
(defp simple (+ atom word tuple-literal))
(defp nonbinding (+ atom word tuple-literal))
(defp expression (+ atom word tuple-literal))

(defp line (* expression (some terminator)))
# (defp toplevel)
(defp script (some line))
|
|
|
|
# REPL fixture: deliberately malformed input (`(:)` is not a valid term),
# used to exercise panic-based error recovery in `if-expr`.
(def source
``
if (:)foo then :bar else :baz

``
)

(-> (scan source) (parse if-expr) pprint)
|
|
)
|
|
|
|
(comment
|
|
|
|
Okay, so I'm trying to figure out how to manage panics in this declarative (not recursive-descent) parser.
|
|
The question is how to
|
|
|
|
)
|