Files
improvise/crates/improvise-formula/src/parser.rs
Edward Langley 23c7c530e3 refactor(parser): simplify tests and generator logic
Refactor formula parser tests to use more concise assert!(matches!(...))
syntax. Simplify the formula generator implementation by removing unused
expression variants and using expect() for mandatory grammar rules. Add a
regression test for hyphenated identifiers in bare names.

Co-Authored-By: fiddlerwoaroof/git-smart-commit (gemma-4-26B-A4B-it-UD-Q5_K_XL.gguf)
2026-04-15 21:32:34 -07:00

1081 lines
40 KiB
Rust

use anyhow::{Result, anyhow};
use pest::Parser as _;
use pest::iterators::Pair;
use pest_derive::Parser;
use super::ast::{AggFunc, BinOp, Expr, Filter, Formula};
/// Parser type for the formula language. The `pest_derive::Parser` derive
/// generates its implementation from the grammar file named in the
/// `#[grammar]` attribute; the functions in this file drive it through
/// `FormulaParser::parse(Rule::…, input)`.
#[derive(Parser)]
#[grammar = "formula.pest"]
struct FormulaParser;
/// Panic message shared by every `.expect()` call that relies on a
/// structural guarantee of the grammar (for example "this pair always has
/// a first child"). If one of these ever panics, the grammar and the tree
/// walker are out of sync — it's a bug, not a runtime condition, and it
/// is never triggered by malformed user input (which is rejected earlier,
/// as an `Err`, by the pest parse step).
const GRAMMAR_INVARIANT: &str = "grammar invariant violated: parser out of sync with formula.pest";
/// Parse a complete formula such as `Profit = Revenue - Cost` or
/// `Tax = Revenue * 0.08 WHERE Region = "East"`.
///
/// Leading and trailing whitespace is trimmed from `raw` before parsing,
/// and the trimmed text is what gets handed to `Formula::new` as the raw
/// source. `target_category` is passed through to `Formula::new` unchanged.
///
/// # Errors
///
/// Returns an error carrying pest's rendered message when the input does
/// not match the `formula` grammar rule.
pub fn parse_formula(raw: &str, target_category: &str) -> Result<Formula> {
    let trimmed = raw.trim();
    let mut parsed =
        FormulaParser::parse(Rule::formula, trimmed).map_err(|err| anyhow!("{}", err))?;
    // A successful parse always yields the top-level pair first.
    let root = parsed.next().expect(GRAMMAR_INVARIANT);
    Ok(build_formula(root, trimmed, target_category))
}
/// Parse a standalone expression (no target, no top-level WHERE clause).
///
/// The input is trimmed and matched against the `expr_eoi` rule, so
/// trailing tokens after a complete expression are rejected.
///
/// # Errors
///
/// Returns an error carrying pest's rendered message when the input does
/// not match the `expr_eoi` grammar rule.
pub fn parse_expr(s: &str) -> Result<Expr> {
    let trimmed = s.trim();
    let mut parsed =
        FormulaParser::parse(Rule::expr_eoi, trimmed).map_err(|err| anyhow!("{}", err))?;
    // A successful parse always yields the top-level pair first; its first
    // child is the `expr` to lower.
    let root = parsed.next().expect(GRAMMAR_INVARIANT);
    let expr_pair = first_inner(root);
    Ok(build_expr(expr_pair))
}
// ---- tree walkers -------------------------------------------------------
//
// Every `build_*` function below operates on a pest Pair that has already
// been validated by the grammar. Invariants like "a `formula` has exactly
// one `target`" or "a `comparison` has an lhs, op, and rhs" are guaranteed
// by pest before the tree walker sees the Pair, so these functions are
// infallible. Any `.expect(GRAMMAR_INVARIANT)` in here represents a bug
// marker — if it ever fires, the grammar and the walker have diverged.
/// Lower a parsed `formula` pair into a `Formula`.
///
/// Walks the pair's children once, collecting the target text (trimmed,
/// otherwise verbatim), the expression, and the optional WHERE filter.
/// `raw` and `target_category` are forwarded to `Formula::new` untouched.
fn build_formula(pair: Pair<Rule>, raw: &str, target_category: &str) -> Formula {
    let mut target = None;
    let mut expr = None;
    let mut filter = None;
    for child in pair.into_inner() {
        match child.as_rule() {
            Rule::target => target = Some(child.as_str().trim().to_string()),
            Rule::expr => expr = Some(build_expr(child)),
            Rule::where_clause => filter = Some(build_filter(child)),
            // Rule::EOI and any silent rules are ignored.
            _ => {}
        }
    }
    // The grammar guarantees both a target and an expression are present.
    let target = target.expect(GRAMMAR_INVARIANT);
    let expr = expr.expect(GRAMMAR_INVARIANT);
    Formula::new(raw, target, target_category, expr, filter)
}
/// Lower an `expr` pair, which wraps exactly one `add_expr`.
fn build_expr(pair: Pair<Rule>) -> Expr {
    // expr = { add_expr }
    let add_expr = first_inner(pair);
    build_add_expr(add_expr)
}
fn build_add_expr(pair: Pair<Rule>) -> Expr {
fold_left_binop(pair, build_mul_expr, |s| {
if s == "+" {
BinOp::Add
} else {
// The grammar restricts add_op to "+" | "-".
BinOp::Sub
}
})
}
fn build_mul_expr(pair: Pair<Rule>) -> Expr {
fold_left_binop(pair, build_pow_expr, |s| {
if s == "*" {
BinOp::Mul
} else {
// The grammar restricts mul_op to "*" | "/".
BinOp::Div
}
})
}
/// Lower a `pow_expr` pair: a base, optionally followed by a power
/// operator and an exponent.
fn build_pow_expr(pair: Pair<Rule>) -> Expr {
    // pow_expr = { unary ~ (pow_op ~ unary)? }
    let mut children = pair.into_inner();
    let base = build_unary(children.next().expect(GRAMMAR_INVARIANT));
    // The operator token itself carries no information; its presence alone
    // tells us an exponent follows.
    if children.next().is_none() {
        return base;
    }
    let exponent = build_unary(children.next().expect(GRAMMAR_INVARIANT));
    Expr::BinOp(BinOp::Pow, Box::new(base), Box::new(exponent))
}
/// Lower a `unary` pair: either a negated primary or a plain primary.
fn build_unary(pair: Pair<Rule>) -> Expr {
    // unary = { unary_minus | primary }
    let child = first_inner(pair);
    match child.as_rule() {
        Rule::unary_minus => {
            let operand = build_primary(first_inner(child));
            Expr::UnaryMinus(Box::new(operand))
        }
        // primary is the only other alternative.
        _ => build_primary(child),
    }
}
/// Lower a `primary` pair by dispatching on which alternative matched.
/// Parenthesised expressions are unwrapped, so parentheses leave no node
/// of their own in the resulting tree.
fn build_primary(pair: Pair<Rule>) -> Expr {
    // primary = { number | agg_call | if_expr | paren_expr | ref_expr }
    let child = first_inner(pair);
    match child.as_rule() {
        Rule::number => {
            let value = child.as_str().parse().expect(GRAMMAR_INVARIANT);
            Expr::Number(value)
        }
        Rule::agg_call => build_agg_call(child),
        Rule::if_expr => build_if_expr(child),
        Rule::paren_expr => build_expr(first_inner(child)),
        // ref_expr is the only remaining alternative.
        _ => Expr::Ref(identifier_to_string(first_inner(child))),
    }
}
/// Lower an aggregate call such as `SUM(Revenue WHERE Region = East)`
/// into `Expr::Agg`, including the optional inline filter.
fn build_agg_call(pair: Pair<Rule>) -> Expr {
    // agg_call = { agg_func ~ "(" ~ expr ~ inline_where? ~ ")" }
    let mut parts = pair.into_inner();
    let func_pair = parts.next().expect(GRAMMAR_INVARIANT);
    let arg_pair = parts.next().expect(GRAMMAR_INVARIANT);
    let func = parse_agg_func(func_pair.as_str());
    let argument = build_expr(arg_pair);
    // The only pair after expr (if any) is `inline_where`, so it can be
    // mapped directly without checking the rule variant.
    let inline_filter = parts.next().map(build_filter);
    Expr::Agg(func, Box::new(argument), inline_filter)
}
/// Map an aggregate-function keyword to its `AggFunc`, ignoring ASCII
/// case. Anything that is not SUM, AVG, MIN or MAX maps to `Count`.
fn parse_agg_func(s: &str) -> AggFunc {
    // agg_func = { ^"SUM" | ^"AVG" | ^"MIN" | ^"MAX" | ^"COUNT" }
    if s.eq_ignore_ascii_case("SUM") {
        AggFunc::Sum
    } else if s.eq_ignore_ascii_case("AVG") {
        AggFunc::Avg
    } else if s.eq_ignore_ascii_case("MIN") {
        AggFunc::Min
    } else if s.eq_ignore_ascii_case("MAX") {
        AggFunc::Max
    } else {
        // COUNT is the only remaining alternative.
        AggFunc::Count
    }
}
/// Lower `IF(condition, then, else)` into `Expr::If`.
fn build_if_expr(pair: Pair<Rule>) -> Expr {
    // if_expr = { ^"IF" ~ "(" ~ comparison ~ "," ~ expr ~ "," ~ expr ~ ")" }
    let mut parts = pair.into_inner();
    let mut next_part = || parts.next().expect(GRAMMAR_INVARIANT);
    let condition = build_comparison(next_part());
    let when_true = build_expr(next_part());
    let when_false = build_expr(next_part());
    Expr::If(
        Box::new(condition),
        Box::new(when_true),
        Box::new(when_false),
    )
}
/// Lower a `comparison` pair into a binary-operator node whose operator is
/// chosen by `parse_cmp_op`.
fn build_comparison(pair: Pair<Rule>) -> Expr {
    // comparison = { expr ~ cmp_op ~ expr }
    let mut parts = pair.into_inner();
    let left_pair = parts.next().expect(GRAMMAR_INVARIANT);
    let left = build_expr(left_pair);
    let op_pair = parts.next().expect(GRAMMAR_INVARIANT);
    let op = parse_cmp_op(op_pair.as_str());
    let right_pair = parts.next().expect(GRAMMAR_INVARIANT);
    let right = build_expr(right_pair);
    Expr::BinOp(op, Box::new(left), Box::new(right))
}
/// Map a comparison operator token to its `BinOp`. Any token other than
/// `=`, `!=`, `<`, `>` or `<=` maps to `Ge`.
fn parse_cmp_op(s: &str) -> BinOp {
    // cmp_op = { "!=" | "<=" | ">=" | "<" | ">" | "=" }
    if s == "=" {
        BinOp::Eq
    } else if s == "!=" {
        BinOp::Ne
    } else if s == "<" {
        BinOp::Lt
    } else if s == ">" {
        BinOp::Gt
    } else if s == "<=" {
        BinOp::Le
    } else {
        // ">=" is the only remaining alternative.
        BinOp::Ge
    }
}
/// Lower a WHERE clause (top-level or inline) into a `Filter` holding the
/// category name and the item value.
fn build_filter(pair: Pair<Rule>) -> Filter {
    // where_clause / inline_where both have shape:
    // ^"WHERE" ~ identifier ~ "=" ~ filter_value
    let mut parts = pair.into_inner();
    let category_pair = parts.next().expect(GRAMMAR_INVARIANT);
    let category = identifier_to_string(category_pair);
    let value_pair = parts.next().expect(GRAMMAR_INVARIANT);
    let item = filter_value_to_string(value_pair);
    Filter { category, item }
}
/// Extract the text of a `filter_value` pair: double-quoted strings lose
/// their quotes, pipe-quoted values are unquoted and unescaped, and bare
/// identifiers are returned verbatim.
fn filter_value_to_string(pair: Pair<Rule>) -> String {
    // filter_value = { string | pipe_quoted | bare_ident }
    let value = first_inner(pair);
    let text = value.as_str();
    match value.as_rule() {
        Rule::string => strip_string_quotes(text),
        Rule::pipe_quoted => unquote_pipe(text),
        // bare_ident is the only remaining alternative.
        _ => text.to_string(),
    }
}
/// Turn an identifier pair (identifier, pipe_quoted, or bare_ident) into
/// its content string. Text that is wrapped in pipes is unquoted, with
/// backslash escapes applied; anything else is returned as written.
fn identifier_to_string(pair: Pair<Rule>) -> String {
    let text = pair.as_str();
    if !is_pipe_quoted(text) {
        return text.to_string();
    }
    unquote_pipe(text)
}
/// Report whether `s` is at least two bytes long and both starts and ends
/// with a `|`.
fn is_pipe_quoted(s: &str) -> bool {
    // `|` is ASCII, so comparing the first and last bytes is equivalent to
    // comparing the first and last characters. The slice pattern needs two
    // distinct elements, which rules out the lone "|".
    matches!(s.as_bytes(), [b'|', .., b'|'])
}
/// Return the text between the surrounding double quotes of `s`.
///
/// The caller must pass a string that starts and ends with `"`; this is
/// checked only in debug builds.
fn strip_string_quotes(s: &str) -> String {
    debug_assert!(s.len() >= 2 && s.starts_with('"') && s.ends_with('"'));
    let last = s.len() - 1;
    String::from(&s[1..last])
}
/// Remove the surrounding pipes from `s` and resolve backslash escapes:
/// `\|` → `|`, `\\` → `\`, `\n` → newline. Any other `\X` is kept as the
/// two characters backslash and `X`.
/// Matches the escape semantics documented in src/persistence/improv.pest.
fn unquote_pipe(s: &str) -> String {
    debug_assert!(is_pipe_quoted(s));
    let body = &s[1..s.len() - 1];
    let mut result = String::with_capacity(body.len());
    let mut rest = body.chars();
    loop {
        match rest.next() {
            None => break,
            // The grammar rule `"\\" ~ ANY` guarantees that a backslash
            // inside a pipe-quoted identifier is always followed by another
            // character, so the inner `next()` cannot be `None` here.
            Some('\\') => match rest.next().expect(GRAMMAR_INVARIANT) {
                'n' => result.push('\n'),
                literal @ ('|' | '\\') => result.push(literal),
                unknown => {
                    result.push('\\');
                    result.push(unknown);
                }
            },
            Some(plain) => result.push(plain),
        }
    }
    result
}
// ---- small helpers ------------------------------------------------------
/// Return the first child of `pair`. Panics with `GRAMMAR_INVARIANT` if
/// the pair has no children, which the grammar rules out for every caller.
fn first_inner(pair: Pair<'_, Rule>) -> Pair<'_, Rule> {
    let mut children = pair.into_inner();
    children.next().expect(GRAMMAR_INVARIANT)
}
/// Fold a rule of the shape `rule = { child ~ (op ~ child)* }` into a
/// left-leaning `BinOp` tree, so `a op b op c` becomes `(a op b) op c`.
///
/// `build_child` lowers each operand pair; `match_op` turns each operator
/// token into its `BinOp`. `match_op` is infallible — the grammar
/// guarantees the operator token is one of the expected alternatives.
fn fold_left_binop<F, M>(pair: Pair<Rule>, mut build_child: F, match_op: M) -> Expr
where
    F: FnMut(Pair<Rule>) -> Expr,
    M: Fn(&str) -> BinOp,
{
    let mut parts = pair.into_inner();
    let first = parts.next().expect(GRAMMAR_INVARIANT);
    let mut acc = build_child(first);
    loop {
        // Children arrive as alternating operator / operand pairs; running
        // out of operators ends the chain.
        let Some(op_pair) = parts.next() else {
            return acc;
        };
        let op = match_op(op_pair.as_str());
        let operand = parts.next().expect(GRAMMAR_INVARIANT);
        let rhs = build_child(operand);
        acc = Expr::BinOp(op, Box::new(acc), Box::new(rhs));
    }
}
/// Example-based tests for `parse_formula` and `parse_expr`: one small
/// test per grammar feature, asserting on the shape of the resulting AST.
#[cfg(test)]
mod tests {
    use super::{parse_expr, parse_formula};
    use crate::{AggFunc, BinOp, Expr};

    #[test]
    fn parse_simple_subtraction() {
        let f = parse_formula("Profit = Revenue - Cost", "Foo").unwrap();
        assert_eq!(f.target, "Profit");
        assert_eq!(f.target_category, "Foo");
        assert!(matches!(f.expr, Expr::BinOp(BinOp::Sub, _, _)));
    }

    #[test]
    fn parse_where_clause() {
        let f = parse_formula("EastRev = Revenue WHERE Region = \"East\"", "Foo").unwrap();
        assert_eq!(f.target, "EastRev");
        let filter = f.filter.as_ref().unwrap();
        assert_eq!(filter.category, "Region");
        assert_eq!(filter.item, "East");
    }

    #[test]
    fn parse_sum_aggregation() {
        let f = parse_formula("Total = SUM(Revenue)", "Foo").unwrap();
        assert!(matches!(f.expr, Expr::Agg(AggFunc::Sum, _, _)));
    }

    #[test]
    fn parse_avg_aggregation() {
        let f = parse_formula("Avg = AVG(Revenue)", "Foo").unwrap();
        assert!(matches!(f.expr, Expr::Agg(AggFunc::Avg, _, _)));
    }

    #[test]
    fn parse_if_expression() {
        let f = parse_formula("Capped = IF(Revenue > 1000, 1000, Revenue)", "Foo").unwrap();
        assert!(matches!(f.expr, Expr::If(_, _, _)));
    }

    #[test]
    fn parse_numeric_literal() {
        let f = parse_formula("Fixed = 42", "Foo").unwrap();
        assert!(matches!(f.expr, Expr::Number(n) if (n - 42.0).abs() < 1e-10));
    }

    #[test]
    fn parse_chained_arithmetic() {
        // Parentheses must group the operands: Mul(Add(A, B), Sub(C, D)).
        let f = parse_formula("X = (A + B) * (C - D)", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::BinOp(BinOp::Mul, lhs, rhs)
                if matches!(**lhs, Expr::BinOp(BinOp::Add, _, _))
                    && matches!(**rhs, Expr::BinOp(BinOp::Sub, _, _))
        ));
    }

    #[test]
    fn parse_missing_equals_returns_error() {
        assert!(parse_formula("BadFormula Revenue Cost", "Cat").is_err());
    }

    // ── Aggregate functions ─────────────────────────────────────────────

    #[test]
    fn parse_min_aggregation() {
        let f = parse_formula("Lo = MIN(Revenue)", "Foo").unwrap();
        assert!(matches!(f.expr, Expr::Agg(AggFunc::Min, _, _)));
    }

    #[test]
    fn parse_max_aggregation() {
        let f = parse_formula("Hi = MAX(Revenue)", "Foo").unwrap();
        assert!(matches!(f.expr, Expr::Agg(AggFunc::Max, _, _)));
    }

    #[test]
    fn parse_count_aggregation() {
        let f = parse_formula("N = COUNT(Revenue)", "Foo").unwrap();
        assert!(matches!(f.expr, Expr::Agg(AggFunc::Count, _, _)));
    }

    // ── Aggregate with WHERE filter ─────────────────────────────────────

    #[test]
    fn parse_sum_with_top_level_where_works() {
        let f = parse_formula("EastTotal = SUM(Revenue) WHERE Region = \"East\"", "Foo").unwrap();
        assert!(matches!(f.expr, Expr::Agg(AggFunc::Sum, _, _)));
        let filter = f.filter.as_ref().unwrap();
        assert_eq!(filter.category, "Region");
        assert_eq!(filter.item, "East");
    }

    /// Regression: WHERE inside aggregate parens must parse as the
    /// aggregate's inline filter, not as a top-level WHERE clause.
    #[test]
    fn parse_sum_with_inline_where_filter() {
        let f = parse_formula("EastTotal = SUM(Revenue WHERE Region = \"East\")", "Foo").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::Agg(AggFunc::Sum, inner, Some(filter))
                if matches!(**inner, Expr::Ref(_))
                    && filter.category == "Region"
                    && filter.item == "East"
        ));
    }

    // ── Comparison operators ────────────────────────────────────────────

    #[test]
    fn parse_if_with_comparison_operators() {
        // Each case checks that the condition carries the operator that was
        // written, not merely that an `IF` node was produced.
        let f = parse_formula("X = IF(A != 0, A, 1)", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::If(cond, _, _) if matches!(**cond, Expr::BinOp(BinOp::Ne, _, _))
        ));
        let f = parse_formula("X = IF(A < 10, A, 10)", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::If(cond, _, _) if matches!(**cond, Expr::BinOp(BinOp::Lt, _, _))
        ));
        let f = parse_formula("X = IF(A <= 10, A, 10)", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::If(cond, _, _) if matches!(**cond, Expr::BinOp(BinOp::Le, _, _))
        ));
        let f = parse_formula("X = IF(A > 10, 10, A)", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::If(cond, _, _) if matches!(**cond, Expr::BinOp(BinOp::Gt, _, _))
        ));
        let f = parse_formula("X = IF(A >= 10, 10, A)", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::If(cond, _, _) if matches!(**cond, Expr::BinOp(BinOp::Ge, _, _))
        ));
        let f = parse_formula("X = IF(A = B, 1, 0)", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::If(cond, _, _) if matches!(**cond, Expr::BinOp(BinOp::Eq, _, _))
        ));
    }

    // ── Quoted strings in WHERE ─────────────────────────────────────────

    #[test]
    fn parse_where_with_quoted_string_inside_expression() {
        let f = parse_formula("X = Revenue WHERE Region = \"West Coast\"", "Foo").unwrap();
        let filter = f.filter.as_ref().unwrap();
        assert_eq!(filter.item, "West Coast");
    }

    // ── Power operator ──────────────────────────────────────────────────

    #[test]
    fn parse_power_operator() {
        let f = parse_formula("Sq = X ^ 2", "Cat").unwrap();
        assert!(matches!(f.expr, Expr::BinOp(BinOp::Pow, _, _)));
    }

    // ── Unary minus ─────────────────────────────────────────────────────

    #[test]
    fn parse_unary_minus() {
        let f = parse_formula("Neg = -Revenue", "Foo").unwrap();
        assert!(matches!(f.expr, Expr::UnaryMinus(_)));
    }

    // ── Division and multiplication ─────────────────────────────────────

    #[test]
    fn parse_multiplication() {
        let f = parse_formula("Double = Revenue * 2", "Foo").unwrap();
        assert!(matches!(f.expr, Expr::BinOp(BinOp::Mul, _, _)));
    }

    #[test]
    fn parse_division() {
        let f = parse_formula("Half = Revenue / 2", "Foo").unwrap();
        assert!(matches!(f.expr, Expr::BinOp(BinOp::Div, _, _)));
    }

    // ── Parenthesized expression ────────────────────────────────────────

    #[test]
    fn parse_nested_parens() {
        let f = parse_formula("X = ((A + B))", "Cat").unwrap();
        assert!(matches!(f.expr, Expr::BinOp(BinOp::Add, _, _)));
    }

    // ── Aggregate function name used as ref (no parens) ─────────────────

    #[test]
    fn parse_aggregate_name_without_parens_is_ref() {
        // "SUM" without parens should be treated as a reference, not a function
        let f = parse_formula("X = SUM + 1", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::BinOp(BinOp::Add, lhs, _) if matches!(**lhs, Expr::Ref(_))
        ));
    }

    #[test]
    fn parse_if_without_parens_is_ref() {
        // "IF" without parens should be treated as a reference
        let f = parse_formula("X = IF + 1", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::BinOp(BinOp::Add, lhs, _) if matches!(**lhs, Expr::Ref(_))
        ));
    }

    // ── Quoted string in WHERE ──────────────────────────────────────────

    #[test]
    fn parse_quoted_string_in_where() {
        let f = parse_formula("X = Revenue WHERE Region = \"East\"", "Cat").unwrap();
        let filter = f.filter.as_ref().unwrap();
        assert_eq!(filter.item, "East");
    }

    // ── Error paths ─────────────────────────────────────────────────────

    #[test]
    fn parse_unexpected_token_error() {
        // Extra tokens after a valid expression
        assert!(parse_expr("1 + 2 3").is_err());
    }

    #[test]
    fn parse_unexpected_character_error() {
        assert!(parse_expr("@invalid").is_err());
    }

    #[test]
    fn parse_empty_expression_error() {
        assert!(parse_expr("").is_err());
    }

    // ── Multi-word identifiers must be pipe-quoted ──────────────────────

    #[test]
    fn multi_word_bare_identifier_is_rejected() {
        // Multi-word identifiers must be pipe-quoted; bare multi-word fails
        // the `bare_name`-compatible grammar rule.
        assert!(parse_formula("Total Revenue = Base Revenue + Bonus", "Foo").is_err());
    }

    // ── WHERE inside quotes in the expression ───────────────────────────

    #[test]
    fn where_inside_quotes_is_not_a_keyword() {
        // A filter value containing the literal text "WHERE" is parsed as
        // a string, not as a nested WHERE keyword.
        let f = parse_formula("X = Revenue WHERE Region = \"WHERE\"", "Foo").unwrap();
        let filter = f.filter.as_ref().unwrap();
        assert_eq!(filter.item, "WHERE");
    }

    // ── Pipe-quoted identifiers ─────────────────────────────────────────

    #[test]
    fn pipe_quoted_identifier_in_expression() {
        let f = parse_formula("|Total Revenue| = |Base Revenue| + Bonus", "Foo").unwrap();
        assert_eq!(f.target, "|Total Revenue|");
        assert!(matches!(
            &f.expr,
            Expr::BinOp(BinOp::Add, lhs, rhs)
                if matches!(**lhs, Expr::Ref(ref s) if s == "Base Revenue")
                    && matches!(**rhs, Expr::Ref(ref s) if s == "Bonus")
        ));
    }

    #[test]
    fn pipe_quoted_keyword_as_identifier() {
        // A category named "WHERE" can be referenced with pipes
        let f = parse_formula("X = |WHERE| + |SUM|", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::BinOp(BinOp::Add, lhs, rhs)
                if matches!(**lhs, Expr::Ref(ref s) if s == "WHERE")
                    && matches!(**rhs, Expr::Ref(ref s) if s == "SUM")
        ));
    }

    #[test]
    fn pipe_quoted_identifier_with_special_chars() {
        // Pipes allow characters that would normally break tokenization
        let f = parse_formula("X = |Revenue (USD)| + |Cost + Tax|", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::BinOp(BinOp::Add, lhs, rhs)
                if matches!(**lhs, Expr::Ref(ref s) if s == "Revenue (USD)")
                    && matches!(**rhs, Expr::Ref(ref s) if s == "Cost + Tax")
        ));
    }

    #[test]
    fn pipe_quoted_in_aggregate() {
        let f = parse_formula("X = SUM(|Net Revenue|)", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::Agg(AggFunc::Sum, inner, None)
                if matches!(**inner, Expr::Ref(ref s) if s == "Net Revenue")
        ));
    }

    #[test]
    fn pipe_quoted_in_where_filter_value() {
        let f = parse_formula("X = Revenue WHERE Region = |East Coast|", "Foo").unwrap();
        let filter = f.filter.as_ref().unwrap();
        assert_eq!(filter.item, "East Coast");
    }

    #[test]
    fn pipe_quoted_in_inline_where() {
        let f =
            parse_formula("X = SUM(Revenue WHERE |Region Name| = |East Coast|)", "Foo").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::Agg(AggFunc::Sum, _, Some(filter))
                if filter.category == "Region Name" && filter.item == "East Coast"
        ));
    }

    // ── Pipe-quoted escape semantics ────────────────────────────────────

    #[test]
    fn pipe_quoted_escape_literal_pipe() {
        // \| inside a pipe-quoted identifier is a literal pipe
        let f = parse_formula("X = |A\\|B|", "Cat").unwrap();
        assert!(matches!(&f.expr, Expr::Ref(s) if s == "A|B"));
    }

    #[test]
    fn pipe_quoted_escape_double_backslash() {
        // \\ inside a pipe-quoted identifier is a literal backslash
        let f = parse_formula("X = |A\\\\B|", "Cat").unwrap();
        assert!(matches!(&f.expr, Expr::Ref(s) if s == "A\\B"));
    }

    #[test]
    fn pipe_quoted_escape_newline() {
        // \n inside a pipe-quoted identifier is a literal newline
        let f = parse_formula("X = |A\\nB|", "Cat").unwrap();
        assert!(matches!(&f.expr, Expr::Ref(s) if s == "A\nB"));
    }

    #[test]
    fn pipe_quoted_unknown_escape_preserved() {
        // Any `\X` where X isn't |, \, or n is preserved verbatim as
        // backslash-plus-character. The grammar's `"\\" ~ ANY` allows
        // any following character; we just don't interpret it.
        let f = parse_formula("X = |A\\zB|", "Cat").unwrap();
        assert!(matches!(&f.expr, Expr::Ref(s) if s == "A\\zB"));
    }

    // ── Operator precedence and associativity ──────────────────────────

    #[test]
    fn mul_binds_tighter_than_add() {
        // `A + B * C` must parse as Add(A, Mul(B, C))
        let f = parse_formula("X = A + B * C", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::BinOp(BinOp::Add, lhs, rhs)
                if matches!(**lhs, Expr::Ref(_))
                    && matches!(**rhs, Expr::BinOp(BinOp::Mul, _, _))
        ));
    }

    #[test]
    fn pow_binds_tighter_than_mul() {
        // `A * B ^ C` must parse as Mul(A, Pow(B, C))
        let f = parse_formula("X = A * B ^ C", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::BinOp(BinOp::Mul, lhs, rhs)
                if matches!(**lhs, Expr::Ref(_))
                    && matches!(**rhs, Expr::BinOp(BinOp::Pow, _, _))
        ));
    }

    #[test]
    fn subtraction_is_left_associative() {
        // `A - B - C` must parse as Sub(Sub(A, B), C), not Sub(A, Sub(B, C))
        let f = parse_formula("X = A - B - C", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::BinOp(BinOp::Sub, lhs, rhs)
                if matches!(**lhs, Expr::BinOp(BinOp::Sub, _, _))
                    && matches!(**rhs, Expr::Ref(ref s) if s == "C")
        ));
    }

    #[test]
    fn division_is_left_associative() {
        // `A / B / C` must parse as Div(Div(A, B), C)
        let f = parse_formula("X = A / B / C", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::BinOp(BinOp::Div, lhs, rhs)
                if matches!(**lhs, Expr::BinOp(BinOp::Div, _, _))
                    && matches!(**rhs, Expr::Ref(ref s) if s == "C")
        ));
    }

    #[test]
    fn unary_minus_before_pow() {
        // `-A ^ B` must parse as Pow(UnaryMinus(A), B). The `-` binds
        // through `unary_minus = { "-" ~ primary }`, producing a unary
        // node that is then used as the pow_expr's base.
        let f = parse_formula("X = -A ^ B", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::BinOp(BinOp::Pow, lhs, rhs)
                if matches!(**lhs, Expr::UnaryMinus(_))
                    && matches!(**rhs, Expr::Ref(ref s) if s == "B")
        ));
    }

    // ── Number literal variants ────────────────────────────────────────

    #[test]
    fn integer_literal() {
        let f = parse_formula("X = 42", "Cat").unwrap();
        assert!(matches!(f.expr, Expr::Number(n) if (n - 42.0).abs() < 1e-10));
    }

    #[test]
    fn zero_literal() {
        let f = parse_formula("X = 0", "Cat").unwrap();
        assert!(matches!(f.expr, Expr::Number(n) if n == 0.0));
    }

    #[test]
    fn decimal_literal_without_integer_part() {
        let f = parse_formula("X = .5", "Cat").unwrap();
        assert!(matches!(f.expr, Expr::Number(n) if (n - 0.5).abs() < 1e-10));
    }

    #[test]
    fn decimal_literal_with_trailing_dot() {
        let f = parse_formula("X = 5.", "Cat").unwrap();
        assert!(matches!(f.expr, Expr::Number(n) if (n - 5.0).abs() < 1e-10));
    }

    #[test]
    fn decimal_literal_with_integer_and_fraction() {
        let f = parse_formula("X = 123.456", "Cat").unwrap();
        assert!(matches!(f.expr, Expr::Number(n) if (n - 123.456).abs() < 1e-10));
    }

    // ── Filter value variants ──────────────────────────────────────────

    #[test]
    fn where_with_bare_identifier_value() {
        // filter_value = { string | pipe_quoted | bare_ident } — exercise
        // the bare_ident branch.
        let f = parse_formula("X = Revenue WHERE Region = East", "Cat").unwrap();
        let filter = f.filter.as_ref().unwrap();
        assert_eq!(filter.category, "Region");
        assert_eq!(filter.item, "East");
    }

    // ── Nested constructs ──────────────────────────────────────────────

    #[test]
    fn nested_sum_aggregate() {
        // Nested aggregates — outer SUM wrapping an inner SUM.
        let f = parse_formula("X = SUM(SUM(Revenue))", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::Agg(AggFunc::Sum, outer_inner, None)
                if matches!(**outer_inner, Expr::Agg(AggFunc::Sum, _, None))
        ));
    }

    #[test]
    fn deeply_nested_parens() {
        // Parens should flatten away without affecting the AST.
        let f = parse_formula("X = (((((A + B)))))", "Cat").unwrap();
        assert!(matches!(f.expr, Expr::BinOp(BinOp::Add, _, _)));
    }

    #[test]
    fn nested_if_expression() {
        // IF in the then-branch of another IF.
        let f = parse_formula("X = IF(A > B, IF(C > D, 1, 2), 3)", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::If(_, then_e, else_e)
                if matches!(**then_e, Expr::If(_, _, _))
                    && matches!(**else_e, Expr::Number(n) if n == 3.0)
        ));
    }

    // ── Whitespace tolerance ───────────────────────────────────────────

    #[test]
    fn tolerates_tabs_between_tokens() {
        let f = parse_formula("X\t=\tA\t+\tB", "Cat").unwrap();
        assert!(matches!(f.expr, Expr::BinOp(BinOp::Add, _, _)));
    }

    #[test]
    fn tolerates_extra_spaces_between_tokens() {
        // Several spaces between every pair of tokens; single spaces are
        // already covered by the other tests.
        let f = parse_formula("X   =   A   +   B", "Cat").unwrap();
        assert!(matches!(f.expr, Expr::BinOp(BinOp::Add, _, _)));
    }

    #[test]
    fn tolerates_leading_and_trailing_whitespace() {
        let f = parse_formula(" X = A + B ", "Cat").unwrap();
        assert!(matches!(f.expr, Expr::BinOp(BinOp::Add, _, _)));
    }

    // ── Case insensitivity of keywords ─────────────────────────────────

    #[test]
    fn aggregate_function_is_case_insensitive() {
        // The grammar uses ^"SUM" which is case-insensitive.
        let f = parse_formula("X = sum(Revenue)", "Cat").unwrap();
        assert!(matches!(f.expr, Expr::Agg(AggFunc::Sum, _, _)));
        let f = parse_formula("X = Sum(Revenue)", "Cat").unwrap();
        assert!(matches!(f.expr, Expr::Agg(AggFunc::Sum, _, _)));
    }

    #[test]
    fn if_keyword_is_case_insensitive() {
        let f = parse_formula("X = if(A > B, 1, 0)", "Cat").unwrap();
        assert!(matches!(f.expr, Expr::If(_, _, _)));
    }

    #[test]
    fn where_keyword_is_case_insensitive() {
        let f = parse_formula("X = Revenue where Region = \"East\"", "Cat").unwrap();
        assert!(f.filter.is_some());
    }

    // ── Target variants ────────────────────────────────────────────────

    #[test]
    fn target_with_underscore_and_hyphen() {
        let f = parse_formula("my_target-name = A", "Cat").unwrap();
        assert_eq!(f.target, "my_target-name");
    }

    /// Regression: the hand-rolled tokenizer didn't allow `-` inside bare
    /// identifiers, so references to persistence-legal names like
    /// `east-coast` were silently parsed as `east - coast` (a subtraction
    /// of two unrelated Refs). The pest grammar mirrors improv.pest's
    /// `bare_name`, which does allow `-`, so a hyphenated name now parses
    /// as a single identifier.
    #[test]
    fn hyphen_in_bare_identifier_reference() {
        let f = parse_formula("Total = east-coast + west-coast", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::BinOp(BinOp::Add, lhs, rhs)
                if matches!(**lhs, Expr::Ref(ref s) if s == "east-coast")
                    && matches!(**rhs, Expr::Ref(ref s) if s == "west-coast")
        ));
    }

    #[test]
    fn hyphen_in_bare_identifier_inside_aggregate() {
        // Same fix, but inside a SUM() to exercise the aggregate path as well.
        let f = parse_formula("X = SUM(east-coast)", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::Agg(AggFunc::Sum, inner, None)
                if matches!(**inner, Expr::Ref(ref s) if s == "east-coast")
        ));
    }

    #[test]
    fn pipe_quoted_target_preserves_pipes() {
        let f = parse_formula("|My Target| = A", "Cat").unwrap();
        assert_eq!(f.target, "|My Target|");
    }
}
// ---- grammar-driven proptests ------------------------------------------
//
// Uses pest_meta to read formula.pest at test time and walks the grammar
// AST to generate random valid formulas. Mirrors the pattern used by the
// persistence parser tests in src/persistence/mod.rs.
#[cfg(test)]
mod generator {
use pest_meta::ast::{Expr, RuleType};
use pest_meta::parser;
use proptest::prelude::*;
use std::collections::HashMap;
/// Read `formula.pest` (embedded at compile time) through `pest_meta` and
/// index the resulting rules by name. Panics if the grammar does not
/// parse or its rules cannot be consumed.
fn load_grammar() -> HashMap<String, (RuleType, Expr)> {
    let source = include_str!("formula.pest");
    let pairs = match parser::parse(parser::Rule::grammar_rules, source) {
        Ok(pairs) => pairs,
        Err(e) => panic!("Bad grammar: {e}"),
    };
    let rules = match parser::consume_rules(pairs) {
        Ok(rules) => rules,
        Err(e) => panic!("{e:?}"),
    };
    let mut by_name = HashMap::new();
    for rule in rules {
        by_name.insert(rule.name, (rule.ty, rule.expr));
    }
    by_name
}
/// Recursive string generator driven by a pest `Expr`. `choices` is
/// consumed left-to-right at every decision point; once it runs out,
/// every further decision reads as `0`. The `atomic` flag threaded
/// through the emit methods controls whether sequences insert a space
/// between their children, so that non-atomic grammar rules produce
/// whitespace-separated tokens that pest's implicit WHITESPACE handling
/// accepts.
pub struct Gen<'g> {
// Grammar rules keyed by rule name, borrowed from the caller.
rules: &'g HashMap<String, (RuleType, Expr)>,
// Byte stream that drives every random decision.
choices: Vec<u8>,
// Index of the next byte of `choices` to hand out.
pos: usize,
}
impl<'g> Gen<'g> {
    /// Create a generator over `rules` whose decisions are driven by
    /// `choices`, starting from the first byte.
    pub fn new(rules: &'g HashMap<String, (RuleType, Expr)>, choices: Vec<u8>) -> Self {
        Self {
            rules,
            choices,
            pos: 0,
        }
    }

    /// Hand out the next decision byte. Once `choices` is exhausted every
    /// call returns `0`, which selects the first branch of a choice, skips
    /// optionals, and emits zero repetitions of `*`.
    fn pick(&mut self) -> u8 {
        let v = self.choices.get(self.pos).copied().unwrap_or(0);
        self.pos += 1;
        v
    }

    /// Append text matching `expr` to `out`. When `atomic` is false, a
    /// space is inserted between the children of sequences and between
    /// repetitions.
    fn emit(&mut self, expr: &Expr, out: &mut String, atomic: bool) {
        match expr {
            Expr::Str(s) => out.push_str(s),
            Expr::Insens(s) => out.push_str(s),
            Expr::Ident(name) => self.emit_ident(name, out, atomic),
            Expr::Seq(a, b) => {
                self.emit(a, out, atomic);
                if !atomic {
                    out.push(' ');
                }
                self.emit(b, out, atomic);
            }
            Expr::Choice(a, b) => {
                // 50/50 between the two branches of each Choice.
                // Nested Choices (used by rules with 3+ alternatives)
                // recurse into themselves, so deeper branches are
                // chosen less often than shallower ones — good
                // enough for the random-generation use case.
                if self.pick().is_multiple_of(2) {
                    self.emit(a, out, atomic);
                } else {
                    self.emit(b, out, atomic);
                }
            }
            Expr::Opt(inner) => {
                // ~66% chance of emitting the optional branch.
                if !self.pick().is_multiple_of(3) {
                    self.emit(inner, out, atomic);
                }
            }
            Expr::Rep(inner) => {
                let count = self.pick() % 4;
                for i in 0..count {
                    if i > 0 && !atomic {
                        out.push(' ');
                    }
                    self.emit(inner, out, atomic);
                }
            }
            Expr::RepOnce(inner) => {
                // formula.pest only uses `+` inside atomic rules
                // (ASCII_DIGIT+), so inner repetitions never need
                // whitespace separation.
                let count = 1 + self.pick() % 3;
                for _ in 0..count {
                    self.emit(inner, out, atomic);
                }
            }
            // Lookaheads (NegPred, PosPred) don't emit output. Any
            // other Expr variant (Range, RepExact, Push, PeekSlice,
            // Skip, …) is unused by formula.pest and silently
            // produces nothing; if the grammar starts using one,
            // generated formulas will fail to parse and we'll know.
            _ => {}
        }
    }

    /// Append text for the rule or built-in named `name`. Built-in
    /// character classes emit a single representative character; any other
    /// name is looked up in the grammar and expanded recursively.
    fn emit_ident(&mut self, name: &str, out: &mut String, atomic: bool) {
        match name {
            "ANY" | "ASCII_ALPHA" => {
                out.push((b'a' + self.pick() % 26) as char);
            }
            "ASCII_ALPHANUMERIC" => {
                if self.pick().is_multiple_of(2) {
                    out.push((b'a' + self.pick() % 26) as char);
                } else {
                    out.push((b'0' + self.pick() % 10) as char);
                }
            }
            "ASCII_DIGIT" => {
                out.push((b'0' + self.pick() % 10) as char);
            }
            "NEWLINE" => out.push('\n'),
            "SOI" | "EOI" => {}
            _ => {
                // Copy the `&'g` map reference out of `self` so the rule
                // borrowed from it does not keep `self` borrowed; this
                // lets us recurse without cloning the rule's expression.
                let rules = self.rules;
                // Every Ident in formula.pest refers to a rule that
                // exists in the grammar file — if it doesn't, the
                // grammar itself failed to compile and we wouldn't be
                // running tests, so `expect` here is a bug marker.
                let (ty, inner) = rules
                    .get(name)
                    .expect("rule referenced by grammar exists");
                let inner_atomic =
                    atomic || matches!(ty, RuleType::Atomic | RuleType::CompoundAtomic);
                self.emit(inner, out, inner_atomic);
            }
        }
    }

    /// Generate a string matching the grammar rule `rule_name`. Panics if
    /// the rule is not present in the loaded grammar.
    pub fn generate(&mut self, rule_name: &str) -> String {
        let mut out = String::new();
        // See `emit_ident`: borrowing through a copy of the `&'g`
        // reference avoids cloning the entry rule.
        let rules = self.rules;
        let (ty, expr) = rules
            .get(rule_name)
            .expect("entry rule exists in formula.pest");
        let atomic = matches!(ty, RuleType::Atomic | RuleType::CompoundAtomic);
        self.emit(expr, &mut out, atomic);
        out
    }
}
/// Proptest strategy: generate a random valid formula by walking the
/// grammar's `formula` rule.
///
/// The grammar is parsed once, when the strategy is built, and then
/// shared by every generated case instead of being re-parsed per case.
pub fn formula_string() -> impl Strategy<Value = String> {
    let rules = load_grammar();
    prop::collection::vec(any::<u8>(), 32..=128)
        .prop_map(move |choices| Gen::new(&rules, choices).generate("formula"))
}
/// Proptest strategy: generate a random valid standalone expression by
/// walking the grammar's `expr_eoi` rule.
///
/// The grammar is parsed once, when the strategy is built, and then
/// shared by every generated case instead of being re-parsed per case.
pub fn expr_string() -> impl Strategy<Value = String> {
    let rules = load_grammar();
    prop::collection::vec(any::<u8>(), 32..=128)
        .prop_map(move |choices| Gen::new(&rules, choices).generate("expr_eoi"))
}
}
/// Property tests that feed grammar-generated inputs to the parser.
#[cfg(test)]
mod grammar_prop_tests {
    use super::generator;
    use super::{parse_expr, parse_formula};
    use proptest::prelude::*;

    proptest! {
        #![proptest_config(ProptestConfig::with_cases(256))]

        /// Every generator-produced formula parses without error.
        #[test]
        fn generated_formula_parses(formula in generator::formula_string()) {
            let result = parse_formula(&formula, "Cat");
            prop_assert!(
                result.is_ok(),
                "Generated formula failed to parse:\n{}\nError: {}",
                formula,
                result.unwrap_err()
            );
        }

        /// Every generator-produced standalone expression parses.
        #[test]
        fn generated_expr_parses(expr in generator::expr_string()) {
            let result = parse_expr(&expr);
            prop_assert!(
                result.is_ok(),
                "Generated expression failed to parse:\n{}\nError: {}",
                expr,
                result.unwrap_err()
            );
        }

        /// Parsing the same input twice is deterministic — both runs
        /// agree on success vs. failure, and on success the target, the
        /// debug representation of the expression, and the filter are
        /// identical.
        #[test]
        fn parse_is_deterministic(formula in generator::formula_string()) {
            let first = parse_formula(&formula, "Cat");
            let second = parse_formula(&formula, "Cat");
            // An Ok/Err mismatch is itself non-determinism, so it must
            // fail the property rather than be discarded as an
            // uninteresting case.
            prop_assert_eq!(
                first.is_ok(),
                second.is_ok(),
                "parse outcome differed between runs for:\n{}",
                formula
            );
            if let (Ok(f1), Ok(f2)) = (first, second) {
                prop_assert_eq!(&f1.target, &f2.target);
                prop_assert_eq!(format!("{:?}", f1.expr), format!("{:?}", f2.expr));
                prop_assert_eq!(
                    f1.filter.as_ref().map(|f| (&f.category, &f.item)),
                    f2.filter.as_ref().map(|f| (&f.category, &f.item))
                );
            }
        }
    }
}