Replace the manual tokenizer and recursive descent parser with a PEG grammar using the pest library. This migration involves introducing a formal grammar in formula.pest and updating the parser implementation to utilize the generated Pest parser with a tree-walking approach to construct the AST. The change introduces a stricter requirement for identifiers: multi-word identifiers must now be enclosed in pipe quotes (e.g., |Total Revenue|) and are no longer accepted as bare words. Tests have been updated to reflect the new parsing logic, remove tokenizer-specific tests, and verify the new pipe-quoting and escape semantics. BREAKING CHANGE: Multi-word identifiers now require pipe-quoting (e.g. |Total Revenue|) and are no longer accepted as bare words. Co-Authored-By: fiddlerwoaroof/git-smart-commit (gemma-4-31B-it-UD-Q4_K_XL.gguf)
92 lines
2.9 KiB
Plaintext
92 lines
2.9 KiB
Plaintext
// Formula grammar for improvise.
|
|
//
|
|
// A formula has the form: TARGET = EXPR [WHERE filter]
|
|
// See parser.rs for the tree walker that produces a Formula AST.
|
|
//
|
|
// Identifier rules (bare_ident / pipe_quoted) mirror `bare_name` and
|
|
// `pipe_quoted` in src/persistence/improv.pest: bare identifiers are
|
|
// alphanumeric plus `_` and `-` (the first character must be a letter or
// `_`), with no internal spaces; multi-word
|
|
// names must be pipe-quoted.
|
|
|
|
// Auto-skip horizontal whitespace between tokens in non-atomic rules.
|
|
// Silent rule (`_`): matched whitespace produces no pairs in the parse tree.
// Only spaces and tabs are skipped; a line break between tokens is not.
WHITESPACE = _{ " " | "\t" }
|
|
|
|
// ---- top-level ----------------------------------------------------------
|
|
|
|
// Entry rule for a full formula: `target = expr`, optionally followed by a
// WHERE clause. SOI and EOI pin the match to the entire input, so trailing
// text that is not part of the formula is rejected.
formula = {
    SOI
  ~ target ~ "=" ~ expr
  ~ where_clause?
  ~ EOI
}
|
|
|
|
// The target keeps its raw text (including pipes, if any) — we capture
|
|
// the span directly rather than walking into its children.
|
|
// Left-hand side of the formula: exactly one identifier (bare or pipe-quoted).
target = { identifier }
|
|
|
|
// Optional trailing filter: WHERE <identifier> = <filter_value>.
// `^"WHERE"` matches the keyword case-insensitively.
// NOTE(review): nothing enforces a word boundary after the keyword, so
// `WHEREx = y` appears to match as `WHERE x = y` — confirm whether that matters.
where_clause = { ^"WHERE" ~ identifier ~ "=" ~ filter_value }
|
|
|
|
// ---- expressions --------------------------------------------------------
|
|
|
|
// Used by parse_expr() — forces a standalone expression to consume the
|
|
// whole input, so `1 + 2 3` fails instead of silently dropping " 3".
|
|
// Standalone-expression entry point: one expr anchored at both ends of the
// input. Spaces and tabs around the expression are still skipped implicitly.
expr_eoi = { SOI ~ expr ~ EOI }
|
|
|
|
// Named entry into the precedence chain; add_expr is the loosest-binding level.
expr = { add_expr }
|
|
|
|
// Additive level (loosest binding): a flat chain of mul_expr operands
// separated by add_op, e.g. `a + b - c`.
add_expr = {
    mul_expr ~ (add_op ~ mul_expr)*
}
// Additive operator. A named, non-silent rule, so the operator that matched
// appears as its own pair between the operands.
add_op = {
    "+"
  | "-"
}
|
|
|
|
// Multiplicative level: a flat chain of pow_expr operands separated by
// mul_op, e.g. `a * b / c`. Binds tighter than add_expr.
mul_expr = {
    pow_expr ~ (mul_op ~ pow_expr)*
}
// Multiplicative operator. A named, non-silent rule, so the operator that
// matched appears as its own pair between the operands.
mul_op = {
    "*"
  | "/"
}
|
|
|
|
// Exponent level: one unary operand, optionally raised to a single unary
// exponent. The `?` allows at most one pow_op, so an unparenthesised chain
// such as `a ^ b ^ c` is not matched; write `a ^ (b ^ c)` or `(a ^ b) ^ c`.
pow_expr = {
    unary ~ (pow_op ~ unary)?
}
// Exponentiation operator, exposed as its own pair.
pow_op = {
    "^"
}
|
|
|
|
// Prefix level. unary_minus is tried first; input that does not start with
// "-" falls back to primary. Both operands of pow_expr are unary, so in
// `-a ^ b` the minus is applied to `a` before the exponent.
unary = {
    unary_minus
  | primary
}
// A single leading minus applied directly to a primary. The operand is a
// primary, not another unary, so a doubled sign such as `--x` does not match.
unary_minus = {
    "-" ~ primary
}
|
|
|
|
// Atoms of an expression, as an ordered choice. agg_call and if_expr are
// listed before ref_expr, so a function keyword followed by "(" is parsed as
// a call; when no "(" follows, the same word falls through to ref_expr.
primary = { number | agg_call | if_expr | paren_expr | ref_expr }
|
|
|
|
// Parenthesised sub-expression; restarts the precedence chain at expr.
paren_expr = { "(" ~ expr ~ ")" }
|
|
|
|
// Aggregates with optional inline WHERE filter inside the parens.
|
|
// Aggregate call: function name, "(", one argument expression, an optional
// inline WHERE filter, then ")".
agg_call = {
    agg_func
  ~ "(" ~ expr ~ inline_where? ~ ")"
}
// Recognised aggregate names; `^` makes each literal case-insensitive.
agg_func = {
    ^"SUM"
  | ^"AVG"
  | ^"MIN"
  | ^"MAX"
  | ^"COUNT"
}
// Filter written inside the aggregate's parentheses. It has the same shape
// as where_clause but is a separate rule, so it appears as its own
// `inline_where` pair under agg_call.
inline_where = {
    ^"WHERE" ~ identifier ~ "=" ~ filter_value
}
|
|
|
|
// IF(cond, then, else). Comparison is a standalone rule because comparison
|
|
// operators are not valid in general expressions — only inside an IF condition.
|
|
// Conditional: IF(<comparison>, <then-expr>, <else-expr>). The keyword is
// case-insensitive and must be followed by "(" for this rule to match.
if_expr = {
    ^"IF" ~ "("
  ~ comparison ~ ","
  ~ expr ~ ","
  ~ expr
  ~ ")"
}
// Exactly one comparison operator between two full expressions.
comparison = {
    expr ~ cmp_op ~ expr
}
// Two-character operators come before their one-character prefixes ("<="
// before "<", ">=" before ">") because PEG choice commits to the first match.
cmp_op = {
    "!=" | "<=" | ">="
  | "<" | ">" | "="
}
|
|
|
|
// A reference to an item. `SUM` and `IF` without parens fall through to
|
|
// this rule because agg_call / if_expr require a "(" and otherwise fail.
|
|
// Last alternative of primary: any identifier used as a value in an expression.
ref_expr = { identifier }
|
|
|
|
// ---- identifiers --------------------------------------------------------
|
|
//
|
|
// Mirror of improv.pest's bare_name / pipe_quoted.
|
|
|
|
// Compound-atomic (`$`): no implicit whitespace is skipped inside an
// identifier, while the inner pipe_quoted / bare_ident pair is still produced.
// pipe_quoted is tried first; the two cannot overlap because a bare
// identifier never starts with `|`.
identifier = ${ pipe_quoted | bare_ident }
|
|
|
|
// Backslash escapes inside pipes: \| literal pipe, \\ backslash, \n newline.
|
|
// Atomic (`@`). Between the delimiting pipes, a backslash consumes the next
// character whatever it is (so `\|` does not end the name); any other
// character except `|` is taken as-is. This rule only delimits the text —
// it does not itself translate escape sequences.
pipe_quoted = @{ "|" ~ ("\\" ~ ANY | !"|" ~ ANY)* ~ "|" }
|
|
|
|
// Unquoted name (atomic): starts with an ASCII letter or "_", followed by any
// run of ASCII letters, digits, "_" or "-". Because "-" is a continuation
// character, `a-b` is a single identifier; subtraction between bare names
// needs whitespace before the minus (`a - b`).
bare_ident = @{ (ASCII_ALPHA | "_") ~ (ASCII_ALPHANUMERIC | "_" | "-")* }
|
|
|
|
// ---- literal values -----------------------------------------------------
|
|
|
|
// Right-hand side of a WHERE filter: a double-quoted string, a pipe-quoted
// name, or a bare identifier, tried in that order.
// NOTE(review): a value beginning with a digit (e.g. 2024) matches none of
// these unless it is quoted — confirm that is intended.
filter_value = { string | pipe_quoted | bare_ident }
|
|
|
|
// Double-quoted literal (atomic). There is no escape mechanism here: the
// first `"` after the opening quote always ends the string.
string = @{ "\"" ~ (!"\"" ~ ANY)* ~ "\"" }
|
|
|
|
// Unsigned decimal literal (atomic). Two accepted shapes:
//   digits, then an optional "." with optional fraction digits (`12`, `12.`, `12.5`)
//   a leading "." followed by at least one digit               (`.5`)
// No sign and no exponent are matched here.
number = @{ (ASCII_DIGIT+ ~ ("." ~ ASCII_DIGIT*)?) | ("." ~ ASCII_DIGIT+) }
|