refactor!(formula): migrate parser to use pest

Replace the manual tokenizer and recursive descent parser with a PEG
grammar using the pest library.

This migration involves introducing a formal grammar in formula.pest and
updating the parser implementation to utilize the generated Pest parser
with a tree-walking approach to construct the AST.

The change introduces a stricter requirement for identifiers: multi-word
identifiers must now be enclosed in pipe quotes (e.g., |Total Revenue|) and
are no longer accepted as bare words.

Tests have been updated to reflect the new parsing logic, remove
tokenizer-specific tests, and verify the new pipe-quoting and escape
semantics.

BREAKING CHANGE: Multi-word identifiers now require pipe-quoting (e.g. |Total Revenue|) and
are no longer accepted as bare words.
Co-Authored-By: fiddlerwoaroof/git-smart-commit (gemma-4-31B-it-UD-Q4_K_XL.gguf)
This commit is contained in:
Edward Langley
2026-04-15 04:04:57 -07:00
parent 38f83b2417
commit 3f69f88709
4 changed files with 404 additions and 446 deletions

View File

@ -0,0 +1,91 @@
// Formula grammar for improvise.
//
// A formula has the form: TARGET = EXPR [WHERE filter]
// See parser.rs for the tree walker that produces a Formula AST.
//
// Identifier rules (bare_ident / pipe_quoted) mirror `bare_name` and
// `pipe_quoted` in src/persistence/improv.pest: a bare identifier starts
// with an ASCII letter or `_`, continues with ASCII letters, digits, `_`
// or `-`, and has no internal spaces; multi-word names must be pipe-quoted.
// Auto-skip horizontal whitespace between tokens in non-atomic rules.
// pest inserts an implicit WHITESPACE* between the elements of every `~`
// sequence and inside repetitions of normal rules; atomic (`@`) and
// compound-atomic (`$`) rules opt out. Only space and tab are listed here,
// so a newline between tokens is NOT skipped. The leading `_` makes the
// rule silent: skipped whitespace never shows up as a pair in the tree.
WHITESPACE = _{ " " | "\t" }
// ---- top-level ----------------------------------------------------------
// Entry rule for a complete formula: TARGET = EXPR [WHERE name = value].
// SOI / EOI anchor the match to the whole input, so any text left over
// after the expression (or after the optional WHERE clause) is an error.
formula = { SOI ~ target ~ "=" ~ expr ~ where_clause? ~ EOI }
// The target keeps its raw text (including pipes, if any) — we capture
// the span directly rather than walking into its children.
target = { identifier }
// Optional trailing filter of the form `WHERE name = value`. `^"WHERE"`
// matches the keyword ASCII-case-insensitively.
// NOTE(review): there is no word-boundary check after the keyword and the
// implicit whitespace between tokens is optional, so `WHEREfoo = x` is
// accepted as `WHERE foo = x` — confirm that leniency is intended.
where_clause = { ^"WHERE" ~ identifier ~ "=" ~ filter_value }
// ---- expressions --------------------------------------------------------
// Precedence is encoded by rule nesting, loosest to tightest:
//   add_expr (+ -)  >  mul_expr (* /)  >  pow_expr (^)  >  unary (-)  >  primary
//
// Used by parse_expr() — forces a standalone expression to consume the
// whole input, so `1 + 2 3` fails instead of silently dropping " 3".
expr_eoi = { SOI ~ expr ~ EOI }
// Wrapper so every expression appears under a single `expr` pair.
expr = { add_expr }
// add_expr / mul_expr produce a flat `operand (op operand)*` list. The
// grammar does not nest the operands, so associativity is decided by
// whoever folds the list (presumably left-to-right in parser.rs — verify).
add_expr = { mul_expr ~ (add_op ~ mul_expr)* }
add_op = { "+" | "-" }
mul_expr = { pow_expr ~ (mul_op ~ pow_expr)* }
mul_op = { "*" | "/" }
// At most ONE `^` per pow_expr (note the `?`, not `*`): `a ^ b` matches,
// but `a ^ b ^ c` does not — the second exponent needs parentheses.
// NOTE(review): confirm that rejecting chained exponents is intended.
pow_expr = { unary ~ (pow_op ~ unary)? }
pow_op = { "^" }
// Both sides of `^` are `unary`, so `2 ^ -3` is accepted.
unary = { unary_minus | primary }
// A single leading minus applied directly to a primary. Consequences:
//   * `--x` does not match (the operand must be a primary, not a unary);
//   * the minus sits below pow_expr in the tree, so `-2 ^ 2` is
//     structured as `(-2) ^ 2`, not `-(2 ^ 2)`.
unary_minus = { "-" ~ primary }
// Ordered choice: alternatives are tried top to bottom and the first match
// wins. agg_call and if_expr are listed before ref_expr so that `SUM(...)`
// and `IF(...)` are tried as calls before the word is read as an identifier.
primary = {
number
| agg_call
| if_expr
| paren_expr
| ref_expr
}
// Parenthesised sub-expression; restarts at the loosest precedence level.
paren_expr = { "(" ~ expr ~ ")" }
// Aggregates with optional inline WHERE filter inside the parens,
// e.g. `SUM(Revenue)` or `SUM(Revenue WHERE Region = East)`.
agg_call = { agg_func ~ "(" ~ expr ~ inline_where? ~ ")" }
// Aggregate function names, matched ASCII-case-insensitively (`^"..."`).
// agg_call requires a "(" after the name, so a word that merely starts
// with one of these (or the bare name itself) fails here and is retried
// by later alternatives of `primary`.
agg_func = { ^"SUM" | ^"AVG" | ^"MIN" | ^"MAX" | ^"COUNT" }
// Same body as the top-level where_clause, under a separate rule name so
// the filter inside an aggregate is distinguishable in the parse tree.
inline_where = { ^"WHERE" ~ identifier ~ "=" ~ filter_value }
// IF(cond, then, else). Comparison is a standalone rule because comparison
// operators are not valid in general expressions — only inside an IF condition.
// The `IF` keyword is matched ASCII-case-insensitively and all three
// arguments are required.
if_expr = { ^"IF" ~ "(" ~ comparison ~ "," ~ expr ~ "," ~ expr ~ ")" }
// Exactly one comparison operator between two full expressions; there is
// no chaining (`a < b < c`) and no boolean connective in this grammar.
comparison = { expr ~ cmp_op ~ expr }
// Order matters: PEG choice takes the first alternative that matches, so
// the two-character operators `<=` and `>=` must come before `<` and `>`,
// otherwise `<=` would be consumed as `<` and the parse would fail on `=`.
cmp_op = { "!=" | "<=" | ">=" | "<" | ">" | "=" }
// A reference to an item. `SUM` and `IF` without parens fall through to
// this rule because agg_call / if_expr require a "(" and otherwise fail.
// It is the last alternative of `primary`, i.e. the fallback once the
// number, call, IF and parenthesis forms have all failed to match.
ref_expr = { identifier }
// ---- identifiers --------------------------------------------------------
//
// Mirror of improv.pest's bare_name / pipe_quoted.
// `$` = compound-atomic: no implicit whitespace is skipped inside an
// identifier, but the inner rule (pipe_quoted or bare_ident) still appears
// as a child pair, so a consumer can tell which form was written.
identifier = ${ pipe_quoted | bare_ident }
// Backslash escapes inside pipes: \| literal pipe, \\ backslash, \n newline.
// The grammar itself only ensures that a backslash consumes the following
// character (any character), so an escaped `|` does not close the name; it
// does not restrict or translate the escape — that is left to the consumer
// (presumably parser.rs — verify). `@` = atomic: whitespace is significant
// and the match is a single pair including both delimiting pipes.
pipe_quoted = @{ "|" ~ ("\\" ~ ANY | !"|" ~ ANY)* ~ "|" }
// First character: ASCII letter or `_`. Remaining characters: ASCII
// letters, digits, `_` or `-`. Because `-` is an identifier character,
// `a-b` is ONE identifier; subtraction between bare names needs whitespace
// before the operator (`a - b`).
bare_ident = @{
(ASCII_ALPHA | "_") ~ (ASCII_ALPHANUMERIC | "_" | "-")*
}
// ---- literal values -----------------------------------------------------
// Right-hand side of a WHERE filter: a double-quoted string, a pipe-quoted
// name, or a bare identifier.
// NOTE(review): `number` is not an alternative and bare_ident cannot start
// with a digit, so a bare numeric value (e.g. `WHERE Year = 2024`) is
// rejected and must be quoted — confirm this is intended.
filter_value = { string | pipe_quoted | bare_ident }
// Double-quoted string with no escape mechanism: it runs to the next `"`,
// so the value itself cannot contain a double quote. Atomic, so inner
// whitespace is preserved and the pair's text includes both quotes.
string = @{ "\"" ~ (!"\"" ~ ANY)* ~ "\"" }
// Unsigned decimal literal. Accepted shapes: `12`, `12.`, `12.5`, `.5`.
// No sign (negation is handled by unary_minus) and no exponent notation.
number = @{
ASCII_DIGIT+ ~ ("." ~ ASCII_DIGIT*)?
| "." ~ ASCII_DIGIT+
}