diff --git a/Cargo.lock b/Cargo.lock index ad2f76e..b2565b1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -758,6 +758,8 @@ dependencies = [ "anyhow", "pest", "pest_derive", + "pest_meta", + "proptest", "serde", ] diff --git a/crates/improvise-formula/Cargo.toml b/crates/improvise-formula/Cargo.toml index 7782815..f7c7a9c 100644 --- a/crates/improvise-formula/Cargo.toml +++ b/crates/improvise-formula/Cargo.toml @@ -11,3 +11,7 @@ anyhow = "1" pest = "2.8.6" pest_derive = "2.8.6" serde = { version = "1", features = ["derive"] } + +[dev-dependencies] +pest_meta = "2.8.6" +proptest = "1" diff --git a/crates/improvise-formula/src/parser.rs b/crates/improvise-formula/src/parser.rs index 0e7238c..51cb101 100644 --- a/crates/improvise-formula/src/parser.rs +++ b/crates/improvise-formula/src/parser.rs @@ -9,6 +9,11 @@ use super::ast::{AggFunc, BinOp, Expr, Filter, Formula}; #[grammar = "formula.pest"] struct FormulaParser; +/// Message used by `.expect()` calls on invariants that the grammar +/// guarantees. If one of these ever panics, the grammar and the tree +/// walker are out of sync — it's a bug, not a runtime condition. +const GRAMMAR_INVARIANT: &str = "grammar invariant violated: parser out of sync with formula.pest"; + /// Parse a formula string like "Profit = Revenue - Cost" /// or "Tax = Revenue * 0.08 WHERE Region = \"East\"" pub fn parse_formula(raw: &str, target_category: &str) -> Result { @@ -16,221 +21,198 @@ pub fn parse_formula(raw: &str, target_category: &str) -> Result { let formula_pair = FormulaParser::parse(Rule::formula, input) .map_err(|e| anyhow!("{}", e))? .next() - .ok_or_else(|| anyhow!("empty parse result"))?; - build_formula(formula_pair, input, target_category) + .expect(GRAMMAR_INVARIANT); + Ok(build_formula(formula_pair, input, target_category)) } /// Parse a bare expression (no target, no top-level WHERE clause). /// Fails if the input contains trailing tokens after a complete expression. pub fn parse_expr(s: &str) -> Result { let input = s.trim(); - let expr_pair = FormulaParser::parse(Rule::expr_eoi, input) + let expr_eoi_pair = FormulaParser::parse(Rule::expr_eoi, input) .map_err(|e| anyhow!("{}", e))? .next() - .ok_or_else(|| anyhow!("empty parse result"))? - .into_inner() - .next() - .ok_or_else(|| anyhow!("missing expression in expr_eoi"))?; - build_expr(expr_pair) + .expect(GRAMMAR_INVARIANT); + Ok(build_expr(first_inner(expr_eoi_pair))) } // ---- tree walkers ------------------------------------------------------- +// +// Every `build_*` function below operates on a pest Pair that has already +// been validated by the grammar. Invariants like "a `formula` has exactly +// one `target`" or "a `comparison` has an lhs, op, and rhs" are guaranteed +// by pest before the tree walker sees the Pair, so these functions are +// infallible. Any `.expect(GRAMMAR_INVARIANT)` in here represents a bug +// marker — if it ever fires, the grammar and the walker have diverged. -fn build_formula(pair: Pair, raw: &str, target_category: &str) -> Result { +fn build_formula(pair: Pair, raw: &str, target_category: &str) -> Formula { let mut target = None; let mut expr = None; let mut filter = None; for inner in pair.into_inner() { - match inner.as_rule() { - Rule::target => target = Some(inner.as_str().trim().to_string()), - Rule::expr => expr = Some(build_expr(inner)?), - Rule::where_clause => filter = Some(build_filter(inner)?), - Rule::EOI => {} - r => return Err(anyhow!("unexpected rule in formula: {:?}", r)), + let rule = inner.as_rule(); + if rule == Rule::target { + target = Some(inner.as_str().trim().to_string()); + } else if rule == Rule::expr { + expr = Some(build_expr(inner)); + } else if rule == Rule::where_clause { + filter = Some(build_filter(inner)); } + // Rule::EOI and any silent rules are ignored. } - let target = target.ok_or_else(|| anyhow!("missing target in formula"))?; - let expr = expr.ok_or_else(|| anyhow!("missing expression in formula"))?; - Ok(Formula::new(raw, target, target_category, expr, filter)) + Formula::new( + raw, + target.expect(GRAMMAR_INVARIANT), + target_category, + expr.expect(GRAMMAR_INVARIANT), + filter, + ) } -fn build_expr(pair: Pair) -> Result { +fn build_expr(pair: Pair) -> Expr { // expr = { add_expr } - build_add_expr(first_inner(pair, "expr")?) + build_add_expr(first_inner(pair)) } -fn build_add_expr(pair: Pair) -> Result { - fold_left_binop(pair, build_mul_expr, |s| match s { - "+" => Some(BinOp::Add), - "-" => Some(BinOp::Sub), - _ => None, +fn build_add_expr(pair: Pair) -> Expr { + fold_left_binop(pair, build_mul_expr, |s| { + if s == "+" { + BinOp::Add + } else { + // The grammar restricts add_op to "+" | "-". + BinOp::Sub + } }) } -fn build_mul_expr(pair: Pair) -> Result { - fold_left_binop(pair, build_pow_expr, |s| match s { - "*" => Some(BinOp::Mul), - "/" => Some(BinOp::Div), - _ => None, +fn build_mul_expr(pair: Pair) -> Expr { + fold_left_binop(pair, build_pow_expr, |s| { + if s == "*" { + BinOp::Mul + } else { + // The grammar restricts mul_op to "*" | "/". + BinOp::Div + } }) } -fn build_pow_expr(pair: Pair) -> Result { +fn build_pow_expr(pair: Pair) -> Expr { // pow_expr = { unary ~ (pow_op ~ unary)? } let mut pairs = pair.into_inner(); - let base_pair = pairs - .next() - .ok_or_else(|| anyhow!("empty pow_expr"))?; - let base = build_unary(base_pair)?; + let base = build_unary(pairs.next().expect(GRAMMAR_INVARIANT)); match pairs.next() { - None => Ok(base), - Some(op_pair) => { - debug_assert_eq!(op_pair.as_rule(), Rule::pow_op); - let exp_pair = pairs - .next() - .ok_or_else(|| anyhow!("missing exponent in pow_expr"))?; - let exp = build_unary(exp_pair)?; - Ok(Expr::BinOp(BinOp::Pow, Box::new(base), Box::new(exp))) + None => base, + Some(_pow_op) => { + let exp = build_unary(pairs.next().expect(GRAMMAR_INVARIANT)); + Expr::BinOp(BinOp::Pow, Box::new(base), Box::new(exp)) } } } -fn build_unary(pair: Pair) -> Result { +fn build_unary(pair: Pair) -> Expr { // unary = { unary_minus | primary } - let inner = first_inner(pair, "unary")?; - match inner.as_rule() { - Rule::unary_minus => { - let prim = first_inner(inner, "unary_minus")?; - Ok(Expr::UnaryMinus(Box::new(build_primary(prim)?))) - } - Rule::primary => build_primary(inner), - r => Err(anyhow!("unexpected rule in unary: {:?}", r)), + let inner = first_inner(pair); + if inner.as_rule() == Rule::unary_minus { + Expr::UnaryMinus(Box::new(build_primary(first_inner(inner)))) + } else { + // primary is the only other alternative. + build_primary(inner) } } -fn build_primary(pair: Pair) -> Result { +fn build_primary(pair: Pair) -> Expr { // primary = { number | agg_call | if_expr | paren_expr | ref_expr } - let inner = first_inner(pair, "primary")?; - match inner.as_rule() { - Rule::number => Ok(Expr::Number(inner.as_str().parse()?)), - Rule::agg_call => build_agg_call(inner), - Rule::if_expr => build_if_expr(inner), - Rule::paren_expr => build_expr(first_inner(inner, "paren_expr")?), - Rule::ref_expr => { - let id_pair = first_inner(inner, "ref_expr")?; - Ok(Expr::Ref(identifier_to_string(id_pair))) - } - r => Err(anyhow!("unexpected rule in primary: {:?}", r)), + let inner = first_inner(pair); + let rule = inner.as_rule(); + if rule == Rule::number { + Expr::Number(inner.as_str().parse().expect(GRAMMAR_INVARIANT)) + } else if rule == Rule::agg_call { + build_agg_call(inner) + } else if rule == Rule::if_expr { + build_if_expr(inner) + } else if rule == Rule::paren_expr { + build_expr(first_inner(inner)) + } else { + // ref_expr is the only remaining alternative. + Expr::Ref(identifier_to_string(first_inner(inner))) } } -fn build_agg_call(pair: Pair) -> Result { +fn build_agg_call(pair: Pair) -> Expr { // agg_call = { agg_func ~ "(" ~ expr ~ inline_where? ~ ")" } let mut pairs = pair.into_inner(); - let func_pair = pairs - .next() - .ok_or_else(|| anyhow!("missing agg_func"))?; - let func = parse_agg_func(func_pair.as_str())?; - let expr_pair = pairs - .next() - .ok_or_else(|| anyhow!("missing aggregate argument"))?; - let inner_expr = build_expr(expr_pair)?; - let filter = match pairs.next() { - Some(p) if p.as_rule() == Rule::inline_where => Some(build_filter(p)?), - _ => None, - }; - Ok(Expr::Agg(func, Box::new(inner_expr), filter)) + let func = parse_agg_func(pairs.next().expect(GRAMMAR_INVARIANT).as_str()); + let inner_expr = build_expr(pairs.next().expect(GRAMMAR_INVARIANT)); + // The only pair after expr (if any) is `inline_where`, so we can + // map it directly without checking the rule variant. + let filter = pairs.next().map(build_filter); + Expr::Agg(func, Box::new(inner_expr), filter) } -fn parse_agg_func(s: &str) -> Result { +fn parse_agg_func(s: &str) -> AggFunc { + // agg_func = { ^"SUM" | ^"AVG" | ^"MIN" | ^"MAX" | ^"COUNT" } match s.to_ascii_uppercase().as_str() { - "SUM" => Ok(AggFunc::Sum), - "AVG" => Ok(AggFunc::Avg), - "MIN" => Ok(AggFunc::Min), - "MAX" => Ok(AggFunc::Max), - "COUNT" => Ok(AggFunc::Count), - f => Err(anyhow!("unknown aggregate function: {}", f)), + "SUM" => AggFunc::Sum, + "AVG" => AggFunc::Avg, + "MIN" => AggFunc::Min, + "MAX" => AggFunc::Max, + // COUNT is the only remaining alternative. + _ => AggFunc::Count, } } -fn build_if_expr(pair: Pair) -> Result { +fn build_if_expr(pair: Pair) -> Expr { // if_expr = { ^"IF" ~ "(" ~ comparison ~ "," ~ expr ~ "," ~ expr ~ ")" } let mut pairs = pair.into_inner(); - let cond_pair = pairs - .next() - .ok_or_else(|| anyhow!("missing IF condition"))?; - let cond = build_comparison(cond_pair)?; - let then_pair = pairs - .next() - .ok_or_else(|| anyhow!("missing IF then-branch"))?; - let then_e = build_expr(then_pair)?; - let else_pair = pairs - .next() - .ok_or_else(|| anyhow!("missing IF else-branch"))?; - let else_e = build_expr(else_pair)?; - Ok(Expr::If( - Box::new(cond), - Box::new(then_e), - Box::new(else_e), - )) + let cond = build_comparison(pairs.next().expect(GRAMMAR_INVARIANT)); + let then_e = build_expr(pairs.next().expect(GRAMMAR_INVARIANT)); + let else_e = build_expr(pairs.next().expect(GRAMMAR_INVARIANT)); + Expr::If(Box::new(cond), Box::new(then_e), Box::new(else_e)) } -fn build_comparison(pair: Pair) -> Result { +fn build_comparison(pair: Pair) -> Expr { // comparison = { expr ~ cmp_op ~ expr } let mut pairs = pair.into_inner(); - let lhs_pair = pairs - .next() - .ok_or_else(|| anyhow!("missing comparison lhs"))?; - let lhs = build_expr(lhs_pair)?; - let op_pair = pairs - .next() - .ok_or_else(|| anyhow!("missing comparison operator"))?; - let op = parse_cmp_op(op_pair.as_str())?; - let rhs_pair = pairs - .next() - .ok_or_else(|| anyhow!("missing comparison rhs"))?; - let rhs = build_expr(rhs_pair)?; - Ok(Expr::BinOp(op, Box::new(lhs), Box::new(rhs))) + let lhs = build_expr(pairs.next().expect(GRAMMAR_INVARIANT)); + let op = parse_cmp_op(pairs.next().expect(GRAMMAR_INVARIANT).as_str()); + let rhs = build_expr(pairs.next().expect(GRAMMAR_INVARIANT)); + Expr::BinOp(op, Box::new(lhs), Box::new(rhs)) } -fn parse_cmp_op(s: &str) -> Result { +fn parse_cmp_op(s: &str) -> BinOp { + // cmp_op = { "!=" | "<=" | ">=" | "<" | ">" | "=" } match s { - "=" => Ok(BinOp::Eq), - "!=" => Ok(BinOp::Ne), - "<" => Ok(BinOp::Lt), - ">" => Ok(BinOp::Gt), - "<=" => Ok(BinOp::Le), - ">=" => Ok(BinOp::Ge), - o => Err(anyhow!("unknown comparison operator: {}", o)), + "=" => BinOp::Eq, + "!=" => BinOp::Ne, + "<" => BinOp::Lt, + ">" => BinOp::Gt, + "<=" => BinOp::Le, + // ">=" is the only remaining alternative. + _ => BinOp::Ge, } } -fn build_filter(pair: Pair) -> Result { +fn build_filter(pair: Pair) -> Filter { // where_clause / inline_where both have shape: // ^"WHERE" ~ identifier ~ "=" ~ filter_value let mut pairs = pair.into_inner(); - let cat_pair = pairs - .next() - .ok_or_else(|| anyhow!("missing WHERE category"))?; - let category = identifier_to_string(cat_pair); - let val_pair = pairs - .next() - .ok_or_else(|| anyhow!("missing WHERE value"))?; - let item = filter_value_to_string(val_pair); - Ok(Filter { category, item }) + let category = identifier_to_string(pairs.next().expect(GRAMMAR_INVARIANT)); + let item = filter_value_to_string(pairs.next().expect(GRAMMAR_INVARIANT)); + Filter { category, item } } fn filter_value_to_string(pair: Pair) -> String { // filter_value = { string | pipe_quoted | bare_ident } - let inner = pair - .into_inner() - .next() - .expect("filter_value must have an inner pair"); + let inner = first_inner(pair); let s = inner.as_str(); - match inner.as_rule() { - Rule::string => strip_string_quotes(s), - Rule::pipe_quoted => unquote_pipe(s), - _ => s.to_string(), + let rule = inner.as_rule(); + if rule == Rule::string { + strip_string_quotes(s) + } else if rule == Rule::pipe_quoted { + unquote_pipe(s) + } else { + // bare_ident is the only remaining alternative. + s.to_string() } } @@ -257,8 +239,8 @@ fn strip_string_quotes(s: &str) -> String { } /// Strip surrounding pipes and apply backslash escapes: `\|` → `|`, -/// `\\` → `\`, `\n` → newline. Matches the escape semantics documented -/// in src/persistence/improv.pest. +/// `\\` → `\`, `\n` → newline, and any other `\X` is preserved verbatim. +/// Matches the escape semantics documented in src/persistence/improv.pest. fn unquote_pipe(s: &str) -> String { debug_assert!(is_pipe_quoted(s)); let inner = &s[1..s.len() - 1]; @@ -266,15 +248,18 @@ fn unquote_pipe(s: &str) -> String { let mut chars = inner.chars(); while let Some(c) = chars.next() { if c == '\\' { - match chars.next() { - Some('|') => out.push('|'), - Some('\\') => out.push('\\'), - Some('n') => out.push('\n'), - Some(other) => { + // The grammar rule `"\\" ~ ANY` guarantees that a backslash + // inside a pipe-quoted identifier is always followed by another + // character, so `chars.next()` cannot be `None` here. + let escaped = chars.next().expect(GRAMMAR_INVARIANT); + match escaped { + '|' => out.push('|'), + '\\' => out.push('\\'), + 'n' => out.push('\n'), + other => { out.push('\\'); out.push(other); } - None => out.push('\\'), } } else { out.push(c); @@ -285,35 +270,27 @@ fn unquote_pipe(s: &str) -> String { // ---- small helpers ------------------------------------------------------ -fn first_inner<'a>(pair: Pair<'a, Rule>, ctx: &str) -> Result> { - pair.into_inner() - .next() - .ok_or_else(|| anyhow!("empty rule: {}", ctx)) +fn first_inner(pair: Pair<'_, Rule>) -> Pair<'_, Rule> { + pair.into_inner().next().expect(GRAMMAR_INVARIANT) } /// Fold a left-associative binary-operator rule of the shape /// `rule = { child ~ (op ~ child)* }` into a left-leaning BinOp tree. -fn fold_left_binop(pair: Pair, mut build_child: F, match_op: M) -> Result +/// The `match_op` closure is infallible — the grammar guarantees the +/// operator token is one of the expected alternatives. +fn fold_left_binop(pair: Pair, mut build_child: F, match_op: M) -> Expr where - F: FnMut(Pair) -> Result, - M: Fn(&str) -> Option, + F: FnMut(Pair) -> Expr, + M: Fn(&str) -> BinOp, { let mut pairs = pair.into_inner(); - let first = pairs - .next() - .ok_or_else(|| anyhow!("empty binop rule"))?; - let mut left = build_child(first)?; + let mut left = build_child(pairs.next().expect(GRAMMAR_INVARIANT)); while let Some(op_pair) = pairs.next() { - let op = match_op(op_pair.as_str()).ok_or_else(|| { - anyhow!("unexpected operator token: {:?}", op_pair.as_str()) - })?; - let right_pair = pairs - .next() - .ok_or_else(|| anyhow!("missing rhs for operator"))?; - let right = build_child(right_pair)?; + let op = match_op(op_pair.as_str()); + let right = build_child(pairs.next().expect(GRAMMAR_INVARIANT)); left = Expr::BinOp(op, Box::new(left), Box::new(right)); } - Ok(left) + left } #[cfg(test)] @@ -636,4 +613,458 @@ mod tests { panic!("Expected Ref, got: {:?}", f.expr); } } + + #[test] + fn pipe_quoted_escape_double_backslash() { + // \\ inside a pipe-quoted identifier is a literal backslash + let f = parse_formula("X = |A\\\\B|", "Cat").unwrap(); + if let Expr::Ref(ref s) = f.expr { + assert_eq!(s, "A\\B"); + } else { + panic!("Expected Ref, got: {:?}", f.expr); + } + } + + #[test] + fn pipe_quoted_escape_newline() { + // \n inside a pipe-quoted identifier is a literal newline + let f = parse_formula("X = |A\\nB|", "Cat").unwrap(); + if let Expr::Ref(ref s) = f.expr { + assert_eq!(s, "A\nB"); + } else { + panic!("Expected Ref, got: {:?}", f.expr); + } + } + + #[test] + fn pipe_quoted_unknown_escape_preserved() { + // Any `\X` where X isn't |, \, or n is preserved verbatim as + // backslash-plus-character. The grammar's `"\\" ~ ANY` allows + // any following character; we just don't interpret it. + let f = parse_formula("X = |A\\zB|", "Cat").unwrap(); + if let Expr::Ref(ref s) = f.expr { + assert_eq!(s, "A\\zB"); + } else { + panic!("Expected Ref, got: {:?}", f.expr); + } + } + + // ── Operator precedence and associativity ────────────────────────── + + #[test] + fn mul_binds_tighter_than_add() { + // `A + B * C` must parse as Add(A, Mul(B, C)) + let f = parse_formula("X = A + B * C", "Cat").unwrap(); + if let Expr::BinOp(BinOp::Add, lhs, rhs) = &f.expr { + assert!(matches!(**lhs, Expr::Ref(_))); + assert!(matches!(**rhs, Expr::BinOp(BinOp::Mul, _, _))); + } else { + panic!("Expected Add with Mul on rhs, got: {:?}", f.expr); + } + } + + #[test] + fn pow_binds_tighter_than_mul() { + // `A * B ^ C` must parse as Mul(A, Pow(B, C)) + let f = parse_formula("X = A * B ^ C", "Cat").unwrap(); + if let Expr::BinOp(BinOp::Mul, lhs, rhs) = &f.expr { + assert!(matches!(**lhs, Expr::Ref(_))); + assert!(matches!(**rhs, Expr::BinOp(BinOp::Pow, _, _))); + } else { + panic!("Expected Mul with Pow on rhs, got: {:?}", f.expr); + } + } + + #[test] + fn subtraction_is_left_associative() { + // `A - B - C` must parse as Sub(Sub(A, B), C), not Sub(A, Sub(B, C)) + let f = parse_formula("X = A - B - C", "Cat").unwrap(); + if let Expr::BinOp(BinOp::Sub, lhs, rhs) = &f.expr { + assert!(matches!(**lhs, Expr::BinOp(BinOp::Sub, _, _))); + assert!(matches!(**rhs, Expr::Ref(ref s) if s == "C")); + } else { + panic!("Expected left-associative Sub, got: {:?}", f.expr); + } + } + + #[test] + fn division_is_left_associative() { + // `A / B / C` must parse as Div(Div(A, B), C) + let f = parse_formula("X = A / B / C", "Cat").unwrap(); + if let Expr::BinOp(BinOp::Div, lhs, rhs) = &f.expr { + assert!(matches!(**lhs, Expr::BinOp(BinOp::Div, _, _))); + assert!(matches!(**rhs, Expr::Ref(ref s) if s == "C")); + } else { + panic!("Expected left-associative Div, got: {:?}", f.expr); + } + } + + #[test] + fn unary_minus_before_pow() { + // `-A ^ B` — hand-rolled parser parses this as UnaryMinus(Ref(A)) + // followed by nothing because pow is inside unary's recursion. + // My grammar: unary = unary_minus | primary, unary_minus = "-" ~ primary. + // So `-A` is UnaryMinus(Ref(A)); `^B` can't attach because pow_expr + // sits OUTSIDE unary. Result: pow_expr(unary_minus(A)) — no pow + // because the optional pow_op doesn't have another unary to bind to. + // Verify: `-A ^ B` parses to BinOp(Pow, UnaryMinus(A), B). + let f = parse_formula("X = -A ^ B", "Cat").unwrap(); + if let Expr::BinOp(BinOp::Pow, lhs, rhs) = &f.expr { + assert!(matches!(**lhs, Expr::UnaryMinus(_))); + assert!(matches!(**rhs, Expr::Ref(ref s) if s == "B")); + } else { + panic!("Expected Pow(UnaryMinus, Ref), got: {:?}", f.expr); + } + } + + // ── Number literal variants ──────────────────────────────────────── + + #[test] + fn integer_literal() { + let f = parse_formula("X = 42", "Cat").unwrap(); + assert!(matches!(f.expr, Expr::Number(n) if (n - 42.0).abs() < 1e-10)); + } + + #[test] + fn zero_literal() { + let f = parse_formula("X = 0", "Cat").unwrap(); + assert!(matches!(f.expr, Expr::Number(n) if n == 0.0)); + } + + #[test] + fn decimal_literal_without_integer_part() { + let f = parse_formula("X = .5", "Cat").unwrap(); + assert!(matches!(f.expr, Expr::Number(n) if (n - 0.5).abs() < 1e-10)); + } + + #[test] + fn decimal_literal_with_trailing_dot() { + let f = parse_formula("X = 5.", "Cat").unwrap(); + assert!(matches!(f.expr, Expr::Number(n) if (n - 5.0).abs() < 1e-10)); + } + + #[test] + fn decimal_literal_with_integer_and_fraction() { + let f = parse_formula("X = 123.456", "Cat").unwrap(); + assert!(matches!(f.expr, Expr::Number(n) if (n - 123.456).abs() < 1e-10)); + } + + // ── Filter value variants ────────────────────────────────────────── + + #[test] + fn where_with_bare_identifier_value() { + // filter_value = { string | pipe_quoted | bare_ident } — exercise + // the bare_ident branch. + let f = parse_formula("X = Revenue WHERE Region = East", "Cat").unwrap(); + let filter = f.filter.as_ref().unwrap(); + assert_eq!(filter.category, "Region"); + assert_eq!(filter.item, "East"); + } + + // ── Nested constructs ────────────────────────────────────────────── + + #[test] + fn nested_sum_aggregate() { + // Nested aggregates — outer SUM wrapping an inner SUM. + let f = parse_formula("X = SUM(SUM(Revenue))", "Cat").unwrap(); + if let Expr::Agg(AggFunc::Sum, outer_inner, None) = &f.expr { + assert!(matches!(**outer_inner, Expr::Agg(AggFunc::Sum, _, None))); + } else { + panic!("Expected nested SUM aggregates, got: {:?}", f.expr); + } + } + + #[test] + fn deeply_nested_parens() { + // Parens should flatten away without affecting the AST. + let f = parse_formula("X = (((((A + B)))))", "Cat").unwrap(); + assert!(matches!(f.expr, Expr::BinOp(BinOp::Add, _, _))); + } + + #[test] + fn nested_if_expression() { + // IF in the then-branch of another IF. + let f = parse_formula("X = IF(A > B, IF(C > D, 1, 2), 3)", "Cat").unwrap(); + if let Expr::If(_, then_e, else_e) = &f.expr { + assert!(matches!(**then_e, Expr::If(_, _, _))); + assert!(matches!(**else_e, Expr::Number(n) if n == 3.0)); + } else { + panic!("Expected outer IF, got: {:?}", f.expr); + } + } + + // ── Whitespace tolerance ─────────────────────────────────────────── + + #[test] + fn tolerates_tabs_between_tokens() { + let f = parse_formula("X\t=\tA\t+\tB", "Cat").unwrap(); + assert!(matches!(f.expr, Expr::BinOp(BinOp::Add, _, _))); + } + + #[test] + fn tolerates_extra_spaces_between_tokens() { + let f = parse_formula("X = A + B", "Cat").unwrap(); + assert!(matches!(f.expr, Expr::BinOp(BinOp::Add, _, _))); + } + + #[test] + fn tolerates_leading_and_trailing_whitespace() { + let f = parse_formula(" X = A + B ", "Cat").unwrap(); + assert!(matches!(f.expr, Expr::BinOp(BinOp::Add, _, _))); + } + + // ── Case insensitivity of keywords ───────────────────────────────── + + #[test] + fn aggregate_function_is_case_insensitive() { + // The grammar uses ^"SUM" which is case-insensitive. + let f = parse_formula("X = sum(Revenue)", "Cat").unwrap(); + assert!(matches!(f.expr, Expr::Agg(AggFunc::Sum, _, _))); + + let f = parse_formula("X = Sum(Revenue)", "Cat").unwrap(); + assert!(matches!(f.expr, Expr::Agg(AggFunc::Sum, _, _))); + } + + #[test] + fn if_keyword_is_case_insensitive() { + let f = parse_formula("X = if(A > B, 1, 0)", "Cat").unwrap(); + assert!(matches!(f.expr, Expr::If(_, _, _))); + } + + #[test] + fn where_keyword_is_case_insensitive() { + let f = parse_formula("X = Revenue where Region = \"East\"", "Cat").unwrap(); + assert!(f.filter.is_some()); + } + + // ── Target variants ──────────────────────────────────────────────── + + #[test] + fn target_with_underscore_and_hyphen() { + let f = parse_formula("my_target-name = A", "Cat").unwrap(); + assert_eq!(f.target, "my_target-name"); + } + + #[test] + fn pipe_quoted_target_preserves_pipes() { + let f = parse_formula("|My Target| = A", "Cat").unwrap(); + assert_eq!(f.target, "|My Target|"); + } +} + +// ---- grammar-driven proptests ------------------------------------------ +// +// Uses pest_meta to read formula.pest at test time and walks the grammar +// AST to generate random valid formulas. Mirrors the pattern used by the +// persistence parser tests in src/persistence/mod.rs. + +#[cfg(test)] +mod generator { + use pest_meta::ast::{Expr, RuleType}; + use pest_meta::parser; + use proptest::prelude::*; + use std::collections::HashMap; + + /// Parse formula.pest and return rules keyed by name. + fn load_grammar() -> HashMap { + let grammar = include_str!("formula.pest"); + let pairs = parser::parse(parser::Rule::grammar_rules, grammar) + .unwrap_or_else(|e| panic!("Bad grammar: {e}")); + let rules = parser::consume_rules(pairs).unwrap_or_else(|e| panic!("{e:?}")); + rules + .into_iter() + .map(|r| (r.name.clone(), (r.ty, r.expr))) + .collect() + } + + /// Recursive string generator driven by a pest `Expr`. `choices` is + /// consumed left-to-right at every decision point; the `atomic` flag + /// controls whether sequences insert a space between their children, + /// so that non-atomic grammar rules produce whitespace-separated + /// tokens that pest's implicit WHITESPACE handling accepts. + pub struct Gen<'g> { + rules: &'g HashMap, + choices: Vec, + pos: usize, + } + + impl<'g> Gen<'g> { + pub fn new(rules: &'g HashMap, choices: Vec) -> Self { + Self { + rules, + choices, + pos: 0, + } + } + + fn pick(&mut self) -> u8 { + let v = self.choices.get(self.pos).copied().unwrap_or(0); + self.pos += 1; + v + } + + fn emit(&mut self, expr: &Expr, out: &mut String, atomic: bool) { + match expr { + Expr::Str(s) => out.push_str(s), + Expr::Insens(s) => out.push_str(s), + Expr::Range(lo, hi) => { + let lo = lo.chars().next().unwrap() as u32; + let hi = hi.chars().next().unwrap() as u32; + let range = hi - lo + 1; + let ch = char::from_u32(lo + (self.pick() as u32 % range)).unwrap(); + out.push(ch); + } + Expr::Ident(name) => self.emit_ident(name, out, atomic), + Expr::Seq(a, b) => { + self.emit(a, out, atomic); + if !atomic { + out.push(' '); + } + self.emit(b, out, atomic); + } + Expr::Choice(a, b) => { + let mut alts: Vec<&Expr> = vec![a.as_ref()]; + let mut cur = b.as_ref(); + while let Expr::Choice(l, r) = cur { + alts.push(l.as_ref()); + cur = r.as_ref(); + } + alts.push(cur); + let idx = self.pick() as usize % alts.len(); + self.emit(alts[idx], out, atomic); + } + Expr::Opt(inner) => { + // ~66% chance of emitting + if !self.pick().is_multiple_of(3) { + self.emit(inner, out, atomic); + } + } + Expr::Rep(inner) => { + let count = self.pick() % 4; + for i in 0..count { + if i > 0 && !atomic { + out.push(' '); + } + self.emit(inner, out, atomic); + } + } + Expr::RepOnce(inner) => { + let count = 1 + self.pick() % 3; + for i in 0..count { + if i > 0 && !atomic { + out.push(' '); + } + self.emit(inner, out, atomic); + } + } + Expr::NegPred(_) | Expr::PosPred(_) => { + // Lookaheads don't emit output. + } + _ => { + // Skip unsupported expressions. + } + } + } + + fn emit_ident(&mut self, name: &str, out: &mut String, atomic: bool) { + match name { + "ANY" | "ASCII_ALPHA" => { + out.push((b'a' + self.pick() % 26) as char); + } + "ASCII_ALPHANUMERIC" => { + if self.pick().is_multiple_of(2) { + out.push((b'a' + self.pick() % 26) as char); + } else { + out.push((b'0' + self.pick() % 10) as char); + } + } + "ASCII_DIGIT" => { + out.push((b'0' + self.pick() % 10) as char); + } + "NEWLINE" => out.push('\n'), + "SOI" | "EOI" => {} + _ => { + if let Some((ty, inner)) = self.rules.get(name).cloned() { + let inner_atomic = atomic + || matches!(ty, RuleType::Atomic | RuleType::CompoundAtomic); + self.emit(&inner, out, inner_atomic); + } + } + } + } + + pub fn generate(&mut self, rule_name: &str) -> String { + let mut out = String::new(); + if let Some((ty, expr)) = self.rules.get(rule_name).cloned() { + let atomic = matches!(ty, RuleType::Atomic | RuleType::CompoundAtomic); + self.emit(&expr, &mut out, atomic); + } + out + } + } + + /// Proptest strategy: generate a random valid formula by walking the + /// grammar's `formula` rule. + pub fn formula_string() -> impl Strategy { + prop::collection::vec(any::(), 32..=128).prop_map(|choices| { + let rules = load_grammar(); + Gen::new(&rules, choices).generate("formula") + }) + } + + /// Proptest strategy: generate a random valid standalone expression. + pub fn expr_string() -> impl Strategy { + prop::collection::vec(any::(), 32..=128).prop_map(|choices| { + let rules = load_grammar(); + Gen::new(&rules, choices).generate("expr_eoi") + }) + } +} + +#[cfg(test)] +mod grammar_prop_tests { + use super::generator; + use super::{parse_expr, parse_formula}; + use proptest::prelude::*; + + proptest! { + #![proptest_config(ProptestConfig::with_cases(256))] + + /// Every generator-produced formula parses without error. + #[test] + fn generated_formula_parses(formula in generator::formula_string()) { + let result = parse_formula(&formula, "Cat"); + prop_assert!( + result.is_ok(), + "Generated formula failed to parse:\n{}\nError: {}", + formula, + result.unwrap_err() + ); + } + + /// Every generator-produced standalone expression parses. + #[test] + fn generated_expr_parses(expr in generator::expr_string()) { + let result = parse_expr(&expr); + prop_assert!( + result.is_ok(), + "Generated expression failed to parse:\n{}\nError: {}", + expr, + result.unwrap_err() + ); + } + + /// Parsing the same input twice is deterministic — the debug + /// representation of the resulting AST is identical. + #[test] + fn parse_is_deterministic(formula in generator::formula_string()) { + let r1 = parse_formula(&formula, "Cat"); + let r2 = parse_formula(&formula, "Cat"); + prop_assume!(r1.is_ok() && r2.is_ok()); + let f1 = r1.unwrap(); + let f2 = r2.unwrap(); + prop_assert_eq!(&f1.target, &f2.target); + prop_assert_eq!(format!("{:?}", f1.expr), format!("{:?}", f2.expr)); + } + } }