refactor(formula): trust grammar invariants in parser

Refactor the formula parser to assume grammar invariants, replacing
Result-based error handling in tree walkers with infallible functions and
.expect(GRAMMAR_INVARIANT) calls.

This simplification is based on the guarantee that the Pest grammar already
validates the input structure. To verify these invariants and ensure
correctness, extensive new unit tests and property-based tests using
proptest have been added.

Co-Authored-By: fiddlerwoaroof/git-smart-commit (gemma-4-31B-it-UD-Q4_K_XL.gguf)
This commit is contained in:
Edward Langley
2026-04-15 04:32:14 -07:00
parent dba8a5269e
commit 5fbc73269f
3 changed files with 607 additions and 170 deletions

2
Cargo.lock generated
View File

@ -758,6 +758,8 @@ dependencies = [
"anyhow", "anyhow",
"pest", "pest",
"pest_derive", "pest_derive",
"pest_meta",
"proptest",
"serde", "serde",
] ]

View File

@ -11,3 +11,7 @@ anyhow = "1"
pest = "2.8.6" pest = "2.8.6"
pest_derive = "2.8.6" pest_derive = "2.8.6"
serde = { version = "1", features = ["derive"] } serde = { version = "1", features = ["derive"] }
[dev-dependencies]
pest_meta = "2.8.6"
proptest = "1"

View File

@ -9,6 +9,11 @@ use super::ast::{AggFunc, BinOp, Expr, Filter, Formula};
#[grammar = "formula.pest"] #[grammar = "formula.pest"]
struct FormulaParser; struct FormulaParser;
/// Message used by `.expect()` calls on invariants that the grammar
/// guarantees. If one of these ever panics, the grammar and the tree
/// walker are out of sync — it's a bug, not a runtime condition.
const GRAMMAR_INVARIANT: &str = "grammar invariant violated: parser out of sync with formula.pest";
/// Parse a formula string like "Profit = Revenue - Cost" /// Parse a formula string like "Profit = Revenue - Cost"
/// or "Tax = Revenue * 0.08 WHERE Region = \"East\"" /// or "Tax = Revenue * 0.08 WHERE Region = \"East\""
pub fn parse_formula(raw: &str, target_category: &str) -> Result<Formula> { pub fn parse_formula(raw: &str, target_category: &str) -> Result<Formula> {
@ -16,221 +21,198 @@ pub fn parse_formula(raw: &str, target_category: &str) -> Result<Formula> {
let formula_pair = FormulaParser::parse(Rule::formula, input) let formula_pair = FormulaParser::parse(Rule::formula, input)
.map_err(|e| anyhow!("{}", e))? .map_err(|e| anyhow!("{}", e))?
.next() .next()
.ok_or_else(|| anyhow!("empty parse result"))?; .expect(GRAMMAR_INVARIANT);
build_formula(formula_pair, input, target_category) Ok(build_formula(formula_pair, input, target_category))
} }
/// Parse a bare expression (no target, no top-level WHERE clause). /// Parse a bare expression (no target, no top-level WHERE clause).
/// Fails if the input contains trailing tokens after a complete expression. /// Fails if the input contains trailing tokens after a complete expression.
pub fn parse_expr(s: &str) -> Result<Expr> { pub fn parse_expr(s: &str) -> Result<Expr> {
let input = s.trim(); let input = s.trim();
let expr_pair = FormulaParser::parse(Rule::expr_eoi, input) let expr_eoi_pair = FormulaParser::parse(Rule::expr_eoi, input)
.map_err(|e| anyhow!("{}", e))? .map_err(|e| anyhow!("{}", e))?
.next() .next()
.ok_or_else(|| anyhow!("empty parse result"))? .expect(GRAMMAR_INVARIANT);
.into_inner() Ok(build_expr(first_inner(expr_eoi_pair)))
.next()
.ok_or_else(|| anyhow!("missing expression in expr_eoi"))?;
build_expr(expr_pair)
} }
// ---- tree walkers ------------------------------------------------------- // ---- tree walkers -------------------------------------------------------
//
// Every `build_*` function below operates on a pest Pair that has already
// been validated by the grammar. Invariants like "a `formula` has exactly
// one `target`" or "a `comparison` has an lhs, op, and rhs" are guaranteed
// by pest before the tree walker sees the Pair, so these functions are
// infallible. Any `.expect(GRAMMAR_INVARIANT)` in here represents a bug
// marker — if it ever fires, the grammar and the walker have diverged.
fn build_formula(pair: Pair<Rule>, raw: &str, target_category: &str) -> Result<Formula> { fn build_formula(pair: Pair<Rule>, raw: &str, target_category: &str) -> Formula {
let mut target = None; let mut target = None;
let mut expr = None; let mut expr = None;
let mut filter = None; let mut filter = None;
for inner in pair.into_inner() { for inner in pair.into_inner() {
match inner.as_rule() { let rule = inner.as_rule();
Rule::target => target = Some(inner.as_str().trim().to_string()), if rule == Rule::target {
Rule::expr => expr = Some(build_expr(inner)?), target = Some(inner.as_str().trim().to_string());
Rule::where_clause => filter = Some(build_filter(inner)?), } else if rule == Rule::expr {
Rule::EOI => {} expr = Some(build_expr(inner));
r => return Err(anyhow!("unexpected rule in formula: {:?}", r)), } else if rule == Rule::where_clause {
filter = Some(build_filter(inner));
} }
// Rule::EOI and any silent rules are ignored.
} }
let target = target.ok_or_else(|| anyhow!("missing target in formula"))?; Formula::new(
let expr = expr.ok_or_else(|| anyhow!("missing expression in formula"))?; raw,
Ok(Formula::new(raw, target, target_category, expr, filter)) target.expect(GRAMMAR_INVARIANT),
target_category,
expr.expect(GRAMMAR_INVARIANT),
filter,
)
} }
fn build_expr(pair: Pair<Rule>) -> Result<Expr> { fn build_expr(pair: Pair<Rule>) -> Expr {
// expr = { add_expr } // expr = { add_expr }
build_add_expr(first_inner(pair, "expr")?) build_add_expr(first_inner(pair))
} }
fn build_add_expr(pair: Pair<Rule>) -> Result<Expr> { fn build_add_expr(pair: Pair<Rule>) -> Expr {
fold_left_binop(pair, build_mul_expr, |s| match s { fold_left_binop(pair, build_mul_expr, |s| {
"+" => Some(BinOp::Add), if s == "+" {
"-" => Some(BinOp::Sub), BinOp::Add
_ => None, } else {
// The grammar restricts add_op to "+" | "-".
BinOp::Sub
}
}) })
} }
fn build_mul_expr(pair: Pair<Rule>) -> Result<Expr> { fn build_mul_expr(pair: Pair<Rule>) -> Expr {
fold_left_binop(pair, build_pow_expr, |s| match s { fold_left_binop(pair, build_pow_expr, |s| {
"*" => Some(BinOp::Mul), if s == "*" {
"/" => Some(BinOp::Div), BinOp::Mul
_ => None, } else {
// The grammar restricts mul_op to "*" | "/".
BinOp::Div
}
}) })
} }
fn build_pow_expr(pair: Pair<Rule>) -> Result<Expr> { fn build_pow_expr(pair: Pair<Rule>) -> Expr {
// pow_expr = { unary ~ (pow_op ~ unary)? } // pow_expr = { unary ~ (pow_op ~ unary)? }
let mut pairs = pair.into_inner(); let mut pairs = pair.into_inner();
let base_pair = pairs let base = build_unary(pairs.next().expect(GRAMMAR_INVARIANT));
.next()
.ok_or_else(|| anyhow!("empty pow_expr"))?;
let base = build_unary(base_pair)?;
match pairs.next() { match pairs.next() {
None => Ok(base), None => base,
Some(op_pair) => { Some(_pow_op) => {
debug_assert_eq!(op_pair.as_rule(), Rule::pow_op); let exp = build_unary(pairs.next().expect(GRAMMAR_INVARIANT));
let exp_pair = pairs Expr::BinOp(BinOp::Pow, Box::new(base), Box::new(exp))
.next()
.ok_or_else(|| anyhow!("missing exponent in pow_expr"))?;
let exp = build_unary(exp_pair)?;
Ok(Expr::BinOp(BinOp::Pow, Box::new(base), Box::new(exp)))
} }
} }
} }
fn build_unary(pair: Pair<Rule>) -> Result<Expr> { fn build_unary(pair: Pair<Rule>) -> Expr {
// unary = { unary_minus | primary } // unary = { unary_minus | primary }
let inner = first_inner(pair, "unary")?; let inner = first_inner(pair);
match inner.as_rule() { if inner.as_rule() == Rule::unary_minus {
Rule::unary_minus => { Expr::UnaryMinus(Box::new(build_primary(first_inner(inner))))
let prim = first_inner(inner, "unary_minus")?; } else {
Ok(Expr::UnaryMinus(Box::new(build_primary(prim)?))) // primary is the only other alternative.
} build_primary(inner)
Rule::primary => build_primary(inner),
r => Err(anyhow!("unexpected rule in unary: {:?}", r)),
} }
} }
fn build_primary(pair: Pair<Rule>) -> Result<Expr> { fn build_primary(pair: Pair<Rule>) -> Expr {
// primary = { number | agg_call | if_expr | paren_expr | ref_expr } // primary = { number | agg_call | if_expr | paren_expr | ref_expr }
let inner = first_inner(pair, "primary")?; let inner = first_inner(pair);
match inner.as_rule() { let rule = inner.as_rule();
Rule::number => Ok(Expr::Number(inner.as_str().parse()?)), if rule == Rule::number {
Rule::agg_call => build_agg_call(inner), Expr::Number(inner.as_str().parse().expect(GRAMMAR_INVARIANT))
Rule::if_expr => build_if_expr(inner), } else if rule == Rule::agg_call {
Rule::paren_expr => build_expr(first_inner(inner, "paren_expr")?), build_agg_call(inner)
Rule::ref_expr => { } else if rule == Rule::if_expr {
let id_pair = first_inner(inner, "ref_expr")?; build_if_expr(inner)
Ok(Expr::Ref(identifier_to_string(id_pair))) } else if rule == Rule::paren_expr {
} build_expr(first_inner(inner))
r => Err(anyhow!("unexpected rule in primary: {:?}", r)), } else {
// ref_expr is the only remaining alternative.
Expr::Ref(identifier_to_string(first_inner(inner)))
} }
} }
fn build_agg_call(pair: Pair<Rule>) -> Result<Expr> { fn build_agg_call(pair: Pair<Rule>) -> Expr {
// agg_call = { agg_func ~ "(" ~ expr ~ inline_where? ~ ")" } // agg_call = { agg_func ~ "(" ~ expr ~ inline_where? ~ ")" }
let mut pairs = pair.into_inner(); let mut pairs = pair.into_inner();
let func_pair = pairs let func = parse_agg_func(pairs.next().expect(GRAMMAR_INVARIANT).as_str());
.next() let inner_expr = build_expr(pairs.next().expect(GRAMMAR_INVARIANT));
.ok_or_else(|| anyhow!("missing agg_func"))?; // The only pair after expr (if any) is `inline_where`, so we can
let func = parse_agg_func(func_pair.as_str())?; // map it directly without checking the rule variant.
let expr_pair = pairs let filter = pairs.next().map(build_filter);
.next() Expr::Agg(func, Box::new(inner_expr), filter)
.ok_or_else(|| anyhow!("missing aggregate argument"))?;
let inner_expr = build_expr(expr_pair)?;
let filter = match pairs.next() {
Some(p) if p.as_rule() == Rule::inline_where => Some(build_filter(p)?),
_ => None,
};
Ok(Expr::Agg(func, Box::new(inner_expr), filter))
} }
fn parse_agg_func(s: &str) -> Result<AggFunc> { fn parse_agg_func(s: &str) -> AggFunc {
// agg_func = { ^"SUM" | ^"AVG" | ^"MIN" | ^"MAX" | ^"COUNT" }
match s.to_ascii_uppercase().as_str() { match s.to_ascii_uppercase().as_str() {
"SUM" => Ok(AggFunc::Sum), "SUM" => AggFunc::Sum,
"AVG" => Ok(AggFunc::Avg), "AVG" => AggFunc::Avg,
"MIN" => Ok(AggFunc::Min), "MIN" => AggFunc::Min,
"MAX" => Ok(AggFunc::Max), "MAX" => AggFunc::Max,
"COUNT" => Ok(AggFunc::Count), // COUNT is the only remaining alternative.
f => Err(anyhow!("unknown aggregate function: {}", f)), _ => AggFunc::Count,
} }
} }
fn build_if_expr(pair: Pair<Rule>) -> Result<Expr> { fn build_if_expr(pair: Pair<Rule>) -> Expr {
// if_expr = { ^"IF" ~ "(" ~ comparison ~ "," ~ expr ~ "," ~ expr ~ ")" } // if_expr = { ^"IF" ~ "(" ~ comparison ~ "," ~ expr ~ "," ~ expr ~ ")" }
let mut pairs = pair.into_inner(); let mut pairs = pair.into_inner();
let cond_pair = pairs let cond = build_comparison(pairs.next().expect(GRAMMAR_INVARIANT));
.next() let then_e = build_expr(pairs.next().expect(GRAMMAR_INVARIANT));
.ok_or_else(|| anyhow!("missing IF condition"))?; let else_e = build_expr(pairs.next().expect(GRAMMAR_INVARIANT));
let cond = build_comparison(cond_pair)?; Expr::If(Box::new(cond), Box::new(then_e), Box::new(else_e))
let then_pair = pairs
.next()
.ok_or_else(|| anyhow!("missing IF then-branch"))?;
let then_e = build_expr(then_pair)?;
let else_pair = pairs
.next()
.ok_or_else(|| anyhow!("missing IF else-branch"))?;
let else_e = build_expr(else_pair)?;
Ok(Expr::If(
Box::new(cond),
Box::new(then_e),
Box::new(else_e),
))
} }
fn build_comparison(pair: Pair<Rule>) -> Result<Expr> { fn build_comparison(pair: Pair<Rule>) -> Expr {
// comparison = { expr ~ cmp_op ~ expr } // comparison = { expr ~ cmp_op ~ expr }
let mut pairs = pair.into_inner(); let mut pairs = pair.into_inner();
let lhs_pair = pairs let lhs = build_expr(pairs.next().expect(GRAMMAR_INVARIANT));
.next() let op = parse_cmp_op(pairs.next().expect(GRAMMAR_INVARIANT).as_str());
.ok_or_else(|| anyhow!("missing comparison lhs"))?; let rhs = build_expr(pairs.next().expect(GRAMMAR_INVARIANT));
let lhs = build_expr(lhs_pair)?; Expr::BinOp(op, Box::new(lhs), Box::new(rhs))
let op_pair = pairs
.next()
.ok_or_else(|| anyhow!("missing comparison operator"))?;
let op = parse_cmp_op(op_pair.as_str())?;
let rhs_pair = pairs
.next()
.ok_or_else(|| anyhow!("missing comparison rhs"))?;
let rhs = build_expr(rhs_pair)?;
Ok(Expr::BinOp(op, Box::new(lhs), Box::new(rhs)))
} }
fn parse_cmp_op(s: &str) -> Result<BinOp> { fn parse_cmp_op(s: &str) -> BinOp {
// cmp_op = { "!=" | "<=" | ">=" | "<" | ">" | "=" }
match s { match s {
"=" => Ok(BinOp::Eq), "=" => BinOp::Eq,
"!=" => Ok(BinOp::Ne), "!=" => BinOp::Ne,
"<" => Ok(BinOp::Lt), "<" => BinOp::Lt,
">" => Ok(BinOp::Gt), ">" => BinOp::Gt,
"<=" => Ok(BinOp::Le), "<=" => BinOp::Le,
">=" => Ok(BinOp::Ge), // ">=" is the only remaining alternative.
o => Err(anyhow!("unknown comparison operator: {}", o)), _ => BinOp::Ge,
} }
} }
fn build_filter(pair: Pair<Rule>) -> Result<Filter> { fn build_filter(pair: Pair<Rule>) -> Filter {
// where_clause / inline_where both have shape: // where_clause / inline_where both have shape:
// ^"WHERE" ~ identifier ~ "=" ~ filter_value // ^"WHERE" ~ identifier ~ "=" ~ filter_value
let mut pairs = pair.into_inner(); let mut pairs = pair.into_inner();
let cat_pair = pairs let category = identifier_to_string(pairs.next().expect(GRAMMAR_INVARIANT));
.next() let item = filter_value_to_string(pairs.next().expect(GRAMMAR_INVARIANT));
.ok_or_else(|| anyhow!("missing WHERE category"))?; Filter { category, item }
let category = identifier_to_string(cat_pair);
let val_pair = pairs
.next()
.ok_or_else(|| anyhow!("missing WHERE value"))?;
let item = filter_value_to_string(val_pair);
Ok(Filter { category, item })
} }
fn filter_value_to_string(pair: Pair<Rule>) -> String { fn filter_value_to_string(pair: Pair<Rule>) -> String {
// filter_value = { string | pipe_quoted | bare_ident } // filter_value = { string | pipe_quoted | bare_ident }
let inner = pair let inner = first_inner(pair);
.into_inner()
.next()
.expect("filter_value must have an inner pair");
let s = inner.as_str(); let s = inner.as_str();
match inner.as_rule() { let rule = inner.as_rule();
Rule::string => strip_string_quotes(s), if rule == Rule::string {
Rule::pipe_quoted => unquote_pipe(s), strip_string_quotes(s)
_ => s.to_string(), } else if rule == Rule::pipe_quoted {
unquote_pipe(s)
} else {
// bare_ident is the only remaining alternative.
s.to_string()
} }
} }
@ -257,8 +239,8 @@ fn strip_string_quotes(s: &str) -> String {
} }
/// Strip surrounding pipes and apply backslash escapes: `\|` → `|`, /// Strip surrounding pipes and apply backslash escapes: `\|` → `|`,
/// `\\` → `\`, `\n` → newline. Matches the escape semantics documented /// `\\` → `\`, `\n` → newline, and any other `\X` is preserved verbatim.
/// in src/persistence/improv.pest. /// Matches the escape semantics documented in src/persistence/improv.pest.
fn unquote_pipe(s: &str) -> String { fn unquote_pipe(s: &str) -> String {
debug_assert!(is_pipe_quoted(s)); debug_assert!(is_pipe_quoted(s));
let inner = &s[1..s.len() - 1]; let inner = &s[1..s.len() - 1];
@ -266,15 +248,18 @@ fn unquote_pipe(s: &str) -> String {
let mut chars = inner.chars(); let mut chars = inner.chars();
while let Some(c) = chars.next() { while let Some(c) = chars.next() {
if c == '\\' { if c == '\\' {
match chars.next() { // The grammar rule `"\\" ~ ANY` guarantees that a backslash
Some('|') => out.push('|'), // inside a pipe-quoted identifier is always followed by another
Some('\\') => out.push('\\'), // character, so `chars.next()` cannot be `None` here.
Some('n') => out.push('\n'), let escaped = chars.next().expect(GRAMMAR_INVARIANT);
Some(other) => { match escaped {
'|' => out.push('|'),
'\\' => out.push('\\'),
'n' => out.push('\n'),
other => {
out.push('\\'); out.push('\\');
out.push(other); out.push(other);
} }
None => out.push('\\'),
} }
} else { } else {
out.push(c); out.push(c);
@ -285,35 +270,27 @@ fn unquote_pipe(s: &str) -> String {
// ---- small helpers ------------------------------------------------------ // ---- small helpers ------------------------------------------------------
fn first_inner<'a>(pair: Pair<'a, Rule>, ctx: &str) -> Result<Pair<'a, Rule>> { fn first_inner(pair: Pair<'_, Rule>) -> Pair<'_, Rule> {
pair.into_inner() pair.into_inner().next().expect(GRAMMAR_INVARIANT)
.next()
.ok_or_else(|| anyhow!("empty rule: {}", ctx))
} }
/// Fold a left-associative binary-operator rule of the shape /// Fold a left-associative binary-operator rule of the shape
/// `rule = { child ~ (op ~ child)* }` into a left-leaning BinOp tree. /// `rule = { child ~ (op ~ child)* }` into a left-leaning BinOp tree.
fn fold_left_binop<F, M>(pair: Pair<Rule>, mut build_child: F, match_op: M) -> Result<Expr> /// The `match_op` closure is infallible — the grammar guarantees the
/// operator token is one of the expected alternatives.
fn fold_left_binop<F, M>(pair: Pair<Rule>, mut build_child: F, match_op: M) -> Expr
where where
F: FnMut(Pair<Rule>) -> Result<Expr>, F: FnMut(Pair<Rule>) -> Expr,
M: Fn(&str) -> Option<BinOp>, M: Fn(&str) -> BinOp,
{ {
let mut pairs = pair.into_inner(); let mut pairs = pair.into_inner();
let first = pairs let mut left = build_child(pairs.next().expect(GRAMMAR_INVARIANT));
.next()
.ok_or_else(|| anyhow!("empty binop rule"))?;
let mut left = build_child(first)?;
while let Some(op_pair) = pairs.next() { while let Some(op_pair) = pairs.next() {
let op = match_op(op_pair.as_str()).ok_or_else(|| { let op = match_op(op_pair.as_str());
anyhow!("unexpected operator token: {:?}", op_pair.as_str()) let right = build_child(pairs.next().expect(GRAMMAR_INVARIANT));
})?;
let right_pair = pairs
.next()
.ok_or_else(|| anyhow!("missing rhs for operator"))?;
let right = build_child(right_pair)?;
left = Expr::BinOp(op, Box::new(left), Box::new(right)); left = Expr::BinOp(op, Box::new(left), Box::new(right));
} }
Ok(left) left
} }
#[cfg(test)] #[cfg(test)]
@ -636,4 +613,458 @@ mod tests {
panic!("Expected Ref, got: {:?}", f.expr); panic!("Expected Ref, got: {:?}", f.expr);
} }
} }
#[test]
fn pipe_quoted_escape_double_backslash() {
// \\ inside a pipe-quoted identifier is a literal backslash
let f = parse_formula("X = |A\\\\B|", "Cat").unwrap();
if let Expr::Ref(ref s) = f.expr {
assert_eq!(s, "A\\B");
} else {
panic!("Expected Ref, got: {:?}", f.expr);
}
}
#[test]
fn pipe_quoted_escape_newline() {
// \n inside a pipe-quoted identifier is a literal newline
let f = parse_formula("X = |A\\nB|", "Cat").unwrap();
if let Expr::Ref(ref s) = f.expr {
assert_eq!(s, "A\nB");
} else {
panic!("Expected Ref, got: {:?}", f.expr);
}
}
#[test]
fn pipe_quoted_unknown_escape_preserved() {
// Any `\X` where X isn't |, \, or n is preserved verbatim as
// backslash-plus-character. The grammar's `"\\" ~ ANY` allows
// any following character; we just don't interpret it.
let f = parse_formula("X = |A\\zB|", "Cat").unwrap();
if let Expr::Ref(ref s) = f.expr {
assert_eq!(s, "A\\zB");
} else {
panic!("Expected Ref, got: {:?}", f.expr);
}
}
// ── Operator precedence and associativity ──────────────────────────
#[test]
fn mul_binds_tighter_than_add() {
// `A + B * C` must parse as Add(A, Mul(B, C))
let f = parse_formula("X = A + B * C", "Cat").unwrap();
if let Expr::BinOp(BinOp::Add, lhs, rhs) = &f.expr {
assert!(matches!(**lhs, Expr::Ref(_)));
assert!(matches!(**rhs, Expr::BinOp(BinOp::Mul, _, _)));
} else {
panic!("Expected Add with Mul on rhs, got: {:?}", f.expr);
}
}
#[test]
fn pow_binds_tighter_than_mul() {
// `A * B ^ C` must parse as Mul(A, Pow(B, C))
let f = parse_formula("X = A * B ^ C", "Cat").unwrap();
if let Expr::BinOp(BinOp::Mul, lhs, rhs) = &f.expr {
assert!(matches!(**lhs, Expr::Ref(_)));
assert!(matches!(**rhs, Expr::BinOp(BinOp::Pow, _, _)));
} else {
panic!("Expected Mul with Pow on rhs, got: {:?}", f.expr);
}
}
#[test]
fn subtraction_is_left_associative() {
// `A - B - C` must parse as Sub(Sub(A, B), C), not Sub(A, Sub(B, C))
let f = parse_formula("X = A - B - C", "Cat").unwrap();
if let Expr::BinOp(BinOp::Sub, lhs, rhs) = &f.expr {
assert!(matches!(**lhs, Expr::BinOp(BinOp::Sub, _, _)));
assert!(matches!(**rhs, Expr::Ref(ref s) if s == "C"));
} else {
panic!("Expected left-associative Sub, got: {:?}", f.expr);
}
}
#[test]
fn division_is_left_associative() {
// `A / B / C` must parse as Div(Div(A, B), C)
let f = parse_formula("X = A / B / C", "Cat").unwrap();
if let Expr::BinOp(BinOp::Div, lhs, rhs) = &f.expr {
assert!(matches!(**lhs, Expr::BinOp(BinOp::Div, _, _)));
assert!(matches!(**rhs, Expr::Ref(ref s) if s == "C"));
} else {
panic!("Expected left-associative Div, got: {:?}", f.expr);
}
}
#[test]
fn unary_minus_before_pow() {
// `-A ^ B` — hand-rolled parser parses this as UnaryMinus(Ref(A))
// followed by nothing because pow is inside unary's recursion.
// My grammar: unary = unary_minus | primary, unary_minus = "-" ~ primary.
// So `-A` is UnaryMinus(Ref(A)); `^B` can't attach because pow_expr
// sits OUTSIDE unary. Result: pow_expr(unary_minus(A)) — no pow
// because the optional pow_op doesn't have another unary to bind to.
// Verify: `-A ^ B` parses to BinOp(Pow, UnaryMinus(A), B).
let f = parse_formula("X = -A ^ B", "Cat").unwrap();
if let Expr::BinOp(BinOp::Pow, lhs, rhs) = &f.expr {
assert!(matches!(**lhs, Expr::UnaryMinus(_)));
assert!(matches!(**rhs, Expr::Ref(ref s) if s == "B"));
} else {
panic!("Expected Pow(UnaryMinus, Ref), got: {:?}", f.expr);
}
}
// ── Number literal variants ────────────────────────────────────────
#[test]
fn integer_literal() {
let f = parse_formula("X = 42", "Cat").unwrap();
assert!(matches!(f.expr, Expr::Number(n) if (n - 42.0).abs() < 1e-10));
}
#[test]
fn zero_literal() {
let f = parse_formula("X = 0", "Cat").unwrap();
assert!(matches!(f.expr, Expr::Number(n) if n == 0.0));
}
#[test]
fn decimal_literal_without_integer_part() {
let f = parse_formula("X = .5", "Cat").unwrap();
assert!(matches!(f.expr, Expr::Number(n) if (n - 0.5).abs() < 1e-10));
}
#[test]
fn decimal_literal_with_trailing_dot() {
let f = parse_formula("X = 5.", "Cat").unwrap();
assert!(matches!(f.expr, Expr::Number(n) if (n - 5.0).abs() < 1e-10));
}
#[test]
fn decimal_literal_with_integer_and_fraction() {
let f = parse_formula("X = 123.456", "Cat").unwrap();
assert!(matches!(f.expr, Expr::Number(n) if (n - 123.456).abs() < 1e-10));
}
// ── Filter value variants ──────────────────────────────────────────
#[test]
fn where_with_bare_identifier_value() {
// filter_value = { string | pipe_quoted | bare_ident } — exercise
// the bare_ident branch.
let f = parse_formula("X = Revenue WHERE Region = East", "Cat").unwrap();
let filter = f.filter.as_ref().unwrap();
assert_eq!(filter.category, "Region");
assert_eq!(filter.item, "East");
}
// ── Nested constructs ──────────────────────────────────────────────
#[test]
fn nested_sum_aggregate() {
// Nested aggregates — outer SUM wrapping an inner SUM.
let f = parse_formula("X = SUM(SUM(Revenue))", "Cat").unwrap();
if let Expr::Agg(AggFunc::Sum, outer_inner, None) = &f.expr {
assert!(matches!(**outer_inner, Expr::Agg(AggFunc::Sum, _, None)));
} else {
panic!("Expected nested SUM aggregates, got: {:?}", f.expr);
}
}
#[test]
fn deeply_nested_parens() {
// Parens should flatten away without affecting the AST.
let f = parse_formula("X = (((((A + B)))))", "Cat").unwrap();
assert!(matches!(f.expr, Expr::BinOp(BinOp::Add, _, _)));
}
#[test]
fn nested_if_expression() {
// IF in the then-branch of another IF.
let f = parse_formula("X = IF(A > B, IF(C > D, 1, 2), 3)", "Cat").unwrap();
if let Expr::If(_, then_e, else_e) = &f.expr {
assert!(matches!(**then_e, Expr::If(_, _, _)));
assert!(matches!(**else_e, Expr::Number(n) if n == 3.0));
} else {
panic!("Expected outer IF, got: {:?}", f.expr);
}
}
// ── Whitespace tolerance ───────────────────────────────────────────
#[test]
fn tolerates_tabs_between_tokens() {
let f = parse_formula("X\t=\tA\t+\tB", "Cat").unwrap();
assert!(matches!(f.expr, Expr::BinOp(BinOp::Add, _, _)));
}
#[test]
fn tolerates_extra_spaces_between_tokens() {
let f = parse_formula("X = A + B", "Cat").unwrap();
assert!(matches!(f.expr, Expr::BinOp(BinOp::Add, _, _)));
}
#[test]
fn tolerates_leading_and_trailing_whitespace() {
let f = parse_formula(" X = A + B ", "Cat").unwrap();
assert!(matches!(f.expr, Expr::BinOp(BinOp::Add, _, _)));
}
// ── Case insensitivity of keywords ─────────────────────────────────
#[test]
fn aggregate_function_is_case_insensitive() {
// The grammar uses ^"SUM" which is case-insensitive.
let f = parse_formula("X = sum(Revenue)", "Cat").unwrap();
assert!(matches!(f.expr, Expr::Agg(AggFunc::Sum, _, _)));
let f = parse_formula("X = Sum(Revenue)", "Cat").unwrap();
assert!(matches!(f.expr, Expr::Agg(AggFunc::Sum, _, _)));
}
#[test]
fn if_keyword_is_case_insensitive() {
let f = parse_formula("X = if(A > B, 1, 0)", "Cat").unwrap();
assert!(matches!(f.expr, Expr::If(_, _, _)));
}
#[test]
fn where_keyword_is_case_insensitive() {
let f = parse_formula("X = Revenue where Region = \"East\"", "Cat").unwrap();
assert!(f.filter.is_some());
}
// ── Target variants ────────────────────────────────────────────────
#[test]
fn target_with_underscore_and_hyphen() {
let f = parse_formula("my_target-name = A", "Cat").unwrap();
assert_eq!(f.target, "my_target-name");
}
#[test]
fn pipe_quoted_target_preserves_pipes() {
let f = parse_formula("|My Target| = A", "Cat").unwrap();
assert_eq!(f.target, "|My Target|");
}
}
// ---- grammar-driven proptests ------------------------------------------
//
// Uses pest_meta to read formula.pest at test time and walks the grammar
// AST to generate random valid formulas. Mirrors the pattern used by the
// persistence parser tests in src/persistence/mod.rs.
#[cfg(test)]
mod generator {
use pest_meta::ast::{Expr, RuleType};
use pest_meta::parser;
use proptest::prelude::*;
use std::collections::HashMap;
/// Parse formula.pest and return rules keyed by name.
fn load_grammar() -> HashMap<String, (RuleType, Expr)> {
let grammar = include_str!("formula.pest");
let pairs = parser::parse(parser::Rule::grammar_rules, grammar)
.unwrap_or_else(|e| panic!("Bad grammar: {e}"));
let rules = parser::consume_rules(pairs).unwrap_or_else(|e| panic!("{e:?}"));
rules
.into_iter()
.map(|r| (r.name.clone(), (r.ty, r.expr)))
.collect()
}
/// Recursive string generator driven by a pest `Expr`. `choices` is
/// consumed left-to-right at every decision point; the `atomic` flag
/// controls whether sequences insert a space between their children,
/// so that non-atomic grammar rules produce whitespace-separated
/// tokens that pest's implicit WHITESPACE handling accepts.
pub struct Gen<'g> {
rules: &'g HashMap<String, (RuleType, Expr)>,
choices: Vec<u8>,
pos: usize,
}
impl<'g> Gen<'g> {
pub fn new(rules: &'g HashMap<String, (RuleType, Expr)>, choices: Vec<u8>) -> Self {
Self {
rules,
choices,
pos: 0,
}
}
fn pick(&mut self) -> u8 {
let v = self.choices.get(self.pos).copied().unwrap_or(0);
self.pos += 1;
v
}
fn emit(&mut self, expr: &Expr, out: &mut String, atomic: bool) {
match expr {
Expr::Str(s) => out.push_str(s),
Expr::Insens(s) => out.push_str(s),
Expr::Range(lo, hi) => {
let lo = lo.chars().next().unwrap() as u32;
let hi = hi.chars().next().unwrap() as u32;
let range = hi - lo + 1;
let ch = char::from_u32(lo + (self.pick() as u32 % range)).unwrap();
out.push(ch);
}
Expr::Ident(name) => self.emit_ident(name, out, atomic),
Expr::Seq(a, b) => {
self.emit(a, out, atomic);
if !atomic {
out.push(' ');
}
self.emit(b, out, atomic);
}
Expr::Choice(a, b) => {
let mut alts: Vec<&Expr> = vec![a.as_ref()];
let mut cur = b.as_ref();
while let Expr::Choice(l, r) = cur {
alts.push(l.as_ref());
cur = r.as_ref();
}
alts.push(cur);
let idx = self.pick() as usize % alts.len();
self.emit(alts[idx], out, atomic);
}
Expr::Opt(inner) => {
// ~66% chance of emitting
if !self.pick().is_multiple_of(3) {
self.emit(inner, out, atomic);
}
}
Expr::Rep(inner) => {
let count = self.pick() % 4;
for i in 0..count {
if i > 0 && !atomic {
out.push(' ');
}
self.emit(inner, out, atomic);
}
}
Expr::RepOnce(inner) => {
let count = 1 + self.pick() % 3;
for i in 0..count {
if i > 0 && !atomic {
out.push(' ');
}
self.emit(inner, out, atomic);
}
}
Expr::NegPred(_) | Expr::PosPred(_) => {
// Lookaheads don't emit output.
}
_ => {
// Skip unsupported expressions.
}
}
}
fn emit_ident(&mut self, name: &str, out: &mut String, atomic: bool) {
match name {
"ANY" | "ASCII_ALPHA" => {
out.push((b'a' + self.pick() % 26) as char);
}
"ASCII_ALPHANUMERIC" => {
if self.pick().is_multiple_of(2) {
out.push((b'a' + self.pick() % 26) as char);
} else {
out.push((b'0' + self.pick() % 10) as char);
}
}
"ASCII_DIGIT" => {
out.push((b'0' + self.pick() % 10) as char);
}
"NEWLINE" => out.push('\n'),
"SOI" | "EOI" => {}
_ => {
if let Some((ty, inner)) = self.rules.get(name).cloned() {
let inner_atomic = atomic
|| matches!(ty, RuleType::Atomic | RuleType::CompoundAtomic);
self.emit(&inner, out, inner_atomic);
}
}
}
}
pub fn generate(&mut self, rule_name: &str) -> String {
let mut out = String::new();
if let Some((ty, expr)) = self.rules.get(rule_name).cloned() {
let atomic = matches!(ty, RuleType::Atomic | RuleType::CompoundAtomic);
self.emit(&expr, &mut out, atomic);
}
out
}
}
/// Proptest strategy: generate a random valid formula by walking the
/// grammar's `formula` rule.
pub fn formula_string() -> impl Strategy<Value = String> {
prop::collection::vec(any::<u8>(), 32..=128).prop_map(|choices| {
let rules = load_grammar();
Gen::new(&rules, choices).generate("formula")
})
}
/// Proptest strategy: generate a random valid standalone expression.
pub fn expr_string() -> impl Strategy<Value = String> {
prop::collection::vec(any::<u8>(), 32..=128).prop_map(|choices| {
let rules = load_grammar();
Gen::new(&rules, choices).generate("expr_eoi")
})
}
}
#[cfg(test)]
mod grammar_prop_tests {
use super::generator;
use super::{parse_expr, parse_formula};
use proptest::prelude::*;
proptest! {
#![proptest_config(ProptestConfig::with_cases(256))]
/// Every generator-produced formula parses without error.
#[test]
fn generated_formula_parses(formula in generator::formula_string()) {
let result = parse_formula(&formula, "Cat");
prop_assert!(
result.is_ok(),
"Generated formula failed to parse:\n{}\nError: {}",
formula,
result.unwrap_err()
);
}
/// Every generator-produced standalone expression parses.
#[test]
fn generated_expr_parses(expr in generator::expr_string()) {
let result = parse_expr(&expr);
prop_assert!(
result.is_ok(),
"Generated expression failed to parse:\n{}\nError: {}",
expr,
result.unwrap_err()
);
}
/// Parsing the same input twice is deterministic — the debug
/// representation of the resulting AST is identical.
#[test]
fn parse_is_deterministic(formula in generator::formula_string()) {
let r1 = parse_formula(&formula, "Cat");
let r2 = parse_formula(&formula, "Cat");
prop_assume!(r1.is_ok() && r2.is_ok());
let f1 = r1.unwrap();
let f2 = r2.unwrap();
prop_assert_eq!(&f1.target, &f2.target);
prop_assert_eq!(format!("{:?}", f1.expr), format!("{:?}", f2.expr));
}
}
} }