Refactor formula parser tests to use more concise assert!(matches!(...)) syntax. Simplify the formula generator implementation by removing unused expression variants and using expect() for mandatory grammar rules. Add a regression test for hyphenated identifiers in bare names. Co-Authored-By: fiddlerwoaroof/git-smart-commit (gemma-4-26B-A4B-it-UD-Q5_K_XL.gguf)
1081 lines
40 KiB
Rust
1081 lines
40 KiB
Rust
use anyhow::{Result, anyhow};
|
|
use pest::Parser as _;
|
|
use pest::iterators::Pair;
|
|
use pest_derive::Parser;
|
|
|
|
use super::ast::{AggFunc, BinOp, Expr, Filter, Formula};
|
|
|
|
#[derive(Parser)]
|
|
#[grammar = "formula.pest"]
|
|
struct FormulaParser;
|
|
|
|
/// Panic message shared by every `.expect()` that guards a condition the
/// grammar already guarantees. Seeing it at runtime means the tree walker
/// and `formula.pest` have drifted apart: that is a programming error,
/// not something user input can legitimately trigger.
const GRAMMAR_INVARIANT: &str = "grammar invariant violated: parser out of sync with formula.pest";
|
|
|
|
/// Parse a formula string like "Profit = Revenue - Cost"
|
|
/// or "Tax = Revenue * 0.08 WHERE Region = \"East\""
|
|
pub fn parse_formula(raw: &str, target_category: &str) -> Result<Formula> {
|
|
let input = raw.trim();
|
|
let formula_pair = FormulaParser::parse(Rule::formula, input)
|
|
.map_err(|e| anyhow!("{}", e))?
|
|
.next()
|
|
.expect(GRAMMAR_INVARIANT);
|
|
Ok(build_formula(formula_pair, input, target_category))
|
|
}
|
|
|
|
/// Parse a bare expression (no target, no top-level WHERE clause).
|
|
/// Fails if the input contains trailing tokens after a complete expression.
|
|
pub fn parse_expr(s: &str) -> Result<Expr> {
|
|
let input = s.trim();
|
|
let expr_eoi_pair = FormulaParser::parse(Rule::expr_eoi, input)
|
|
.map_err(|e| anyhow!("{}", e))?
|
|
.next()
|
|
.expect(GRAMMAR_INVARIANT);
|
|
Ok(build_expr(first_inner(expr_eoi_pair)))
|
|
}
|
|
|
|
// ---- tree walkers -------------------------------------------------------
|
|
//
|
|
// Every `build_*` function below operates on a pest Pair that has already
|
|
// been validated by the grammar. Invariants like "a `formula` has exactly
|
|
// one `target`" or "a `comparison` has an lhs, op, and rhs" are guaranteed
|
|
// by pest before the tree walker sees the Pair, so these functions are
|
|
// infallible. Any `.expect(GRAMMAR_INVARIANT)` in here represents a bug
|
|
// marker — if it ever fires, the grammar and the walker have diverged.
|
|
|
|
fn build_formula(pair: Pair<Rule>, raw: &str, target_category: &str) -> Formula {
|
|
let mut target = None;
|
|
let mut expr = None;
|
|
let mut filter = None;
|
|
for inner in pair.into_inner() {
|
|
let rule = inner.as_rule();
|
|
if rule == Rule::target {
|
|
target = Some(inner.as_str().trim().to_string());
|
|
} else if rule == Rule::expr {
|
|
expr = Some(build_expr(inner));
|
|
} else if rule == Rule::where_clause {
|
|
filter = Some(build_filter(inner));
|
|
}
|
|
// Rule::EOI and any silent rules are ignored.
|
|
}
|
|
Formula::new(
|
|
raw,
|
|
target.expect(GRAMMAR_INVARIANT),
|
|
target_category,
|
|
expr.expect(GRAMMAR_INVARIANT),
|
|
filter,
|
|
)
|
|
}
|
|
|
|
fn build_expr(pair: Pair<Rule>) -> Expr {
|
|
// expr = { add_expr }
|
|
build_add_expr(first_inner(pair))
|
|
}
|
|
|
|
fn build_add_expr(pair: Pair<Rule>) -> Expr {
|
|
fold_left_binop(pair, build_mul_expr, |s| {
|
|
if s == "+" {
|
|
BinOp::Add
|
|
} else {
|
|
// The grammar restricts add_op to "+" | "-".
|
|
BinOp::Sub
|
|
}
|
|
})
|
|
}
|
|
|
|
fn build_mul_expr(pair: Pair<Rule>) -> Expr {
|
|
fold_left_binop(pair, build_pow_expr, |s| {
|
|
if s == "*" {
|
|
BinOp::Mul
|
|
} else {
|
|
// The grammar restricts mul_op to "*" | "/".
|
|
BinOp::Div
|
|
}
|
|
})
|
|
}
|
|
|
|
fn build_pow_expr(pair: Pair<Rule>) -> Expr {
|
|
// pow_expr = { unary ~ (pow_op ~ unary)? }
|
|
let mut pairs = pair.into_inner();
|
|
let base = build_unary(pairs.next().expect(GRAMMAR_INVARIANT));
|
|
match pairs.next() {
|
|
None => base,
|
|
Some(_pow_op) => {
|
|
let exp = build_unary(pairs.next().expect(GRAMMAR_INVARIANT));
|
|
Expr::BinOp(BinOp::Pow, Box::new(base), Box::new(exp))
|
|
}
|
|
}
|
|
}
|
|
|
|
fn build_unary(pair: Pair<Rule>) -> Expr {
|
|
// unary = { unary_minus | primary }
|
|
let inner = first_inner(pair);
|
|
if inner.as_rule() == Rule::unary_minus {
|
|
Expr::UnaryMinus(Box::new(build_primary(first_inner(inner))))
|
|
} else {
|
|
// primary is the only other alternative.
|
|
build_primary(inner)
|
|
}
|
|
}
|
|
|
|
fn build_primary(pair: Pair<Rule>) -> Expr {
|
|
// primary = { number | agg_call | if_expr | paren_expr | ref_expr }
|
|
let inner = first_inner(pair);
|
|
let rule = inner.as_rule();
|
|
if rule == Rule::number {
|
|
Expr::Number(inner.as_str().parse().expect(GRAMMAR_INVARIANT))
|
|
} else if rule == Rule::agg_call {
|
|
build_agg_call(inner)
|
|
} else if rule == Rule::if_expr {
|
|
build_if_expr(inner)
|
|
} else if rule == Rule::paren_expr {
|
|
build_expr(first_inner(inner))
|
|
} else {
|
|
// ref_expr is the only remaining alternative.
|
|
Expr::Ref(identifier_to_string(first_inner(inner)))
|
|
}
|
|
}
|
|
|
|
fn build_agg_call(pair: Pair<Rule>) -> Expr {
|
|
// agg_call = { agg_func ~ "(" ~ expr ~ inline_where? ~ ")" }
|
|
let mut pairs = pair.into_inner();
|
|
let func = parse_agg_func(pairs.next().expect(GRAMMAR_INVARIANT).as_str());
|
|
let inner_expr = build_expr(pairs.next().expect(GRAMMAR_INVARIANT));
|
|
// The only pair after expr (if any) is `inline_where`, so we can
|
|
// map it directly without checking the rule variant.
|
|
let filter = pairs.next().map(build_filter);
|
|
Expr::Agg(func, Box::new(inner_expr), filter)
|
|
}
|
|
|
|
fn parse_agg_func(s: &str) -> AggFunc {
|
|
// agg_func = { ^"SUM" | ^"AVG" | ^"MIN" | ^"MAX" | ^"COUNT" }
|
|
match s.to_ascii_uppercase().as_str() {
|
|
"SUM" => AggFunc::Sum,
|
|
"AVG" => AggFunc::Avg,
|
|
"MIN" => AggFunc::Min,
|
|
"MAX" => AggFunc::Max,
|
|
// COUNT is the only remaining alternative.
|
|
_ => AggFunc::Count,
|
|
}
|
|
}
|
|
|
|
fn build_if_expr(pair: Pair<Rule>) -> Expr {
|
|
// if_expr = { ^"IF" ~ "(" ~ comparison ~ "," ~ expr ~ "," ~ expr ~ ")" }
|
|
let mut pairs = pair.into_inner();
|
|
let cond = build_comparison(pairs.next().expect(GRAMMAR_INVARIANT));
|
|
let then_e = build_expr(pairs.next().expect(GRAMMAR_INVARIANT));
|
|
let else_e = build_expr(pairs.next().expect(GRAMMAR_INVARIANT));
|
|
Expr::If(Box::new(cond), Box::new(then_e), Box::new(else_e))
|
|
}
|
|
|
|
fn build_comparison(pair: Pair<Rule>) -> Expr {
|
|
// comparison = { expr ~ cmp_op ~ expr }
|
|
let mut pairs = pair.into_inner();
|
|
let lhs = build_expr(pairs.next().expect(GRAMMAR_INVARIANT));
|
|
let op = parse_cmp_op(pairs.next().expect(GRAMMAR_INVARIANT).as_str());
|
|
let rhs = build_expr(pairs.next().expect(GRAMMAR_INVARIANT));
|
|
Expr::BinOp(op, Box::new(lhs), Box::new(rhs))
|
|
}
|
|
|
|
fn parse_cmp_op(s: &str) -> BinOp {
|
|
// cmp_op = { "!=" | "<=" | ">=" | "<" | ">" | "=" }
|
|
match s {
|
|
"=" => BinOp::Eq,
|
|
"!=" => BinOp::Ne,
|
|
"<" => BinOp::Lt,
|
|
">" => BinOp::Gt,
|
|
"<=" => BinOp::Le,
|
|
// ">=" is the only remaining alternative.
|
|
_ => BinOp::Ge,
|
|
}
|
|
}
|
|
|
|
fn build_filter(pair: Pair<Rule>) -> Filter {
|
|
// where_clause / inline_where both have shape:
|
|
// ^"WHERE" ~ identifier ~ "=" ~ filter_value
|
|
let mut pairs = pair.into_inner();
|
|
let category = identifier_to_string(pairs.next().expect(GRAMMAR_INVARIANT));
|
|
let item = filter_value_to_string(pairs.next().expect(GRAMMAR_INVARIANT));
|
|
Filter { category, item }
|
|
}
|
|
|
|
fn filter_value_to_string(pair: Pair<Rule>) -> String {
|
|
// filter_value = { string | pipe_quoted | bare_ident }
|
|
let inner = first_inner(pair);
|
|
let s = inner.as_str();
|
|
let rule = inner.as_rule();
|
|
if rule == Rule::string {
|
|
strip_string_quotes(s)
|
|
} else if rule == Rule::pipe_quoted {
|
|
unquote_pipe(s)
|
|
} else {
|
|
// bare_ident is the only remaining alternative.
|
|
s.to_string()
|
|
}
|
|
}
|
|
|
|
/// Convert an identifier pair (identifier, pipe_quoted, or bare_ident) to
|
|
/// its content string. Pipe-quoted identifiers have their delimiters
|
|
/// stripped and backslash escapes applied; bare identifiers are returned
|
|
/// verbatim.
|
|
fn identifier_to_string(pair: Pair<Rule>) -> String {
|
|
let s = pair.as_str();
|
|
if is_pipe_quoted(s) {
|
|
unquote_pipe(s)
|
|
} else {
|
|
s.to_string()
|
|
}
|
|
}
|
|
|
|
/// Report whether `s` both starts and ends with `|` and is at least two
/// bytes long, so that a lone `"|"` does not count as quoted.
fn is_pipe_quoted(s: &str) -> bool {
    if s.len() < 2 {
        return false;
    }
    s.starts_with('|') && s.ends_with('|')
}
|
|
|
|
/// Drop the first and last byte of `s`, which are expected to be the
/// surrounding double quotes of a `string` token. The expectation is
/// only checked in debug builds.
fn strip_string_quotes(s: &str) -> String {
    debug_assert!(s.len() >= 2 && s.starts_with('"') && s.ends_with('"'));
    let last = s.len() - 1;
    String::from(&s[1..last])
}
|
|
|
|
/// Strip surrounding pipes and apply backslash escapes: `\|` → `|`,
|
|
/// `\\` → `\`, `\n` → newline, and any other `\X` is preserved verbatim.
|
|
/// Matches the escape semantics documented in src/persistence/improv.pest.
|
|
fn unquote_pipe(s: &str) -> String {
|
|
debug_assert!(is_pipe_quoted(s));
|
|
let inner = &s[1..s.len() - 1];
|
|
let mut out = String::with_capacity(inner.len());
|
|
let mut chars = inner.chars();
|
|
while let Some(c) = chars.next() {
|
|
if c == '\\' {
|
|
// The grammar rule `"\\" ~ ANY` guarantees that a backslash
|
|
// inside a pipe-quoted identifier is always followed by another
|
|
// character, so `chars.next()` cannot be `None` here.
|
|
let escaped = chars.next().expect(GRAMMAR_INVARIANT);
|
|
match escaped {
|
|
'|' => out.push('|'),
|
|
'\\' => out.push('\\'),
|
|
'n' => out.push('\n'),
|
|
other => {
|
|
out.push('\\');
|
|
out.push(other);
|
|
}
|
|
}
|
|
} else {
|
|
out.push(c);
|
|
}
|
|
}
|
|
out
|
|
}
|
|
|
|
// ---- small helpers ------------------------------------------------------
|
|
|
|
fn first_inner(pair: Pair<'_, Rule>) -> Pair<'_, Rule> {
|
|
pair.into_inner().next().expect(GRAMMAR_INVARIANT)
|
|
}
|
|
|
|
/// Fold a left-associative binary-operator rule of the shape
|
|
/// `rule = { child ~ (op ~ child)* }` into a left-leaning BinOp tree.
|
|
/// The `match_op` closure is infallible — the grammar guarantees the
|
|
/// operator token is one of the expected alternatives.
|
|
fn fold_left_binop<F, M>(pair: Pair<Rule>, mut build_child: F, match_op: M) -> Expr
|
|
where
|
|
F: FnMut(Pair<Rule>) -> Expr,
|
|
M: Fn(&str) -> BinOp,
|
|
{
|
|
let mut pairs = pair.into_inner();
|
|
let mut left = build_child(pairs.next().expect(GRAMMAR_INVARIANT));
|
|
while let Some(op_pair) = pairs.next() {
|
|
let op = match_op(op_pair.as_str());
|
|
let right = build_child(pairs.next().expect(GRAMMAR_INVARIANT));
|
|
left = Expr::BinOp(op, Box::new(left), Box::new(right));
|
|
}
|
|
left
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    //! Unit tests for `parse_formula` / `parse_expr`: AST shape,
    //! operator precedence and associativity, filters, pipe-quoted
    //! identifiers, whitespace handling and error paths.

    use super::{parse_expr, parse_formula};
    use crate::{AggFunc, BinOp, Expr};

    #[test]
    fn parse_simple_subtraction() {
        let f = parse_formula("Profit = Revenue - Cost", "Foo").unwrap();
        assert_eq!(f.target, "Profit");
        assert_eq!(f.target_category, "Foo");
        assert!(matches!(f.expr, Expr::BinOp(BinOp::Sub, _, _)));
    }

    #[test]
    fn parse_where_clause() {
        let f = parse_formula("EastRev = Revenue WHERE Region = \"East\"", "Foo").unwrap();
        assert_eq!(f.target, "EastRev");
        let filter = f.filter.as_ref().unwrap();
        assert_eq!(filter.category, "Region");
        assert_eq!(filter.item, "East");
    }

    #[test]
    fn parse_sum_aggregation() {
        let f = parse_formula("Total = SUM(Revenue)", "Foo").unwrap();
        assert!(matches!(f.expr, Expr::Agg(AggFunc::Sum, _, _)));
    }

    #[test]
    fn parse_avg_aggregation() {
        let f = parse_formula("Avg = AVG(Revenue)", "Foo").unwrap();
        assert!(matches!(f.expr, Expr::Agg(AggFunc::Avg, _, _)));
    }

    #[test]
    fn parse_if_expression() {
        let f = parse_formula("Capped = IF(Revenue > 1000, 1000, Revenue)", "Foo").unwrap();
        assert!(matches!(f.expr, Expr::If(_, _, _)));
    }

    #[test]
    fn parse_numeric_literal() {
        let f = parse_formula("Fixed = 42", "Foo").unwrap();
        assert!(matches!(f.expr, Expr::Number(n) if (n - 42.0).abs() < 1e-10));
    }

    #[test]
    fn parse_chained_arithmetic() {
        // Parentheses must group the operands: Mul(Add(A, B), Sub(C, D)).
        let f = parse_formula("X = (A + B) * (C - D)", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::BinOp(BinOp::Mul, lhs, rhs)
                if matches!(**lhs, Expr::BinOp(BinOp::Add, _, _))
                    && matches!(**rhs, Expr::BinOp(BinOp::Sub, _, _))
        ));
    }

    #[test]
    fn parse_missing_equals_returns_error() {
        assert!(parse_formula("BadFormula Revenue Cost", "Cat").is_err());
    }

    // ── Aggregate functions ─────────────────────────────────────────────

    #[test]
    fn parse_min_aggregation() {
        let f = parse_formula("Lo = MIN(Revenue)", "Foo").unwrap();
        assert!(matches!(f.expr, Expr::Agg(AggFunc::Min, _, _)));
    }

    #[test]
    fn parse_max_aggregation() {
        let f = parse_formula("Hi = MAX(Revenue)", "Foo").unwrap();
        assert!(matches!(f.expr, Expr::Agg(AggFunc::Max, _, _)));
    }

    #[test]
    fn parse_count_aggregation() {
        let f = parse_formula("N = COUNT(Revenue)", "Foo").unwrap();
        assert!(matches!(f.expr, Expr::Agg(AggFunc::Count, _, _)));
    }

    // ── Aggregate with WHERE filter ─────────────────────────────────────

    #[test]
    fn parse_sum_with_top_level_where_works() {
        let f = parse_formula("EastTotal = SUM(Revenue) WHERE Region = \"East\"", "Foo").unwrap();
        assert!(matches!(f.expr, Expr::Agg(AggFunc::Sum, _, _)));
        let filter = f.filter.as_ref().unwrap();
        assert_eq!(filter.category, "Region");
        assert_eq!(filter.item, "East");
    }

    /// Regression: WHERE inside aggregate parens must parse as the
    /// aggregate's inline filter, not as a top-level WHERE clause.
    #[test]
    fn parse_sum_with_inline_where_filter() {
        let f = parse_formula("EastTotal = SUM(Revenue WHERE Region = \"East\")", "Foo").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::Agg(AggFunc::Sum, inner, Some(filter))
                if matches!(**inner, Expr::Ref(_))
                    && filter.category == "Region"
                    && filter.item == "East"
        ));
    }

    // ── Comparison operators ────────────────────────────────────────────

    #[test]
    fn parse_if_with_comparison_operators() {
        // Each case checks the operator stored in the IF condition, so a
        // wrong token-to-BinOp mapping is caught rather than only the
        // outer IF shape.
        let f = parse_formula("X = IF(A != 0, A, 1)", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::If(cond, _, _) if matches!(**cond, Expr::BinOp(BinOp::Ne, _, _))
        ));

        let f = parse_formula("X = IF(A < 10, A, 10)", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::If(cond, _, _) if matches!(**cond, Expr::BinOp(BinOp::Lt, _, _))
        ));

        let f = parse_formula("X = IF(A <= 10, A, 10)", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::If(cond, _, _) if matches!(**cond, Expr::BinOp(BinOp::Le, _, _))
        ));

        let f = parse_formula("X = IF(A > 10, 10, A)", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::If(cond, _, _) if matches!(**cond, Expr::BinOp(BinOp::Gt, _, _))
        ));

        let f = parse_formula("X = IF(A >= 10, 10, A)", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::If(cond, _, _) if matches!(**cond, Expr::BinOp(BinOp::Ge, _, _))
        ));

        let f = parse_formula("X = IF(A = B, 1, 0)", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::If(cond, _, _) if matches!(**cond, Expr::BinOp(BinOp::Eq, _, _))
        ));
    }

    // ── Quoted strings in WHERE ─────────────────────────────────────────

    #[test]
    fn parse_where_with_quoted_string_inside_expression() {
        let f = parse_formula("X = Revenue WHERE Region = \"West Coast\"", "Foo").unwrap();
        let filter = f.filter.as_ref().unwrap();
        assert_eq!(filter.item, "West Coast");
    }

    // ── Power operator ──────────────────────────────────────────────────

    #[test]
    fn parse_power_operator() {
        let f = parse_formula("Sq = X ^ 2", "Cat").unwrap();
        assert!(matches!(f.expr, Expr::BinOp(BinOp::Pow, _, _)));
    }

    // ── Unary minus ─────────────────────────────────────────────────────

    #[test]
    fn parse_unary_minus() {
        let f = parse_formula("Neg = -Revenue", "Foo").unwrap();
        assert!(matches!(f.expr, Expr::UnaryMinus(_)));
    }

    // ── Division and multiplication ─────────────────────────────────────

    #[test]
    fn parse_multiplication() {
        let f = parse_formula("Double = Revenue * 2", "Foo").unwrap();
        assert!(matches!(f.expr, Expr::BinOp(BinOp::Mul, _, _)));
    }

    #[test]
    fn parse_division() {
        let f = parse_formula("Half = Revenue / 2", "Foo").unwrap();
        assert!(matches!(f.expr, Expr::BinOp(BinOp::Div, _, _)));
    }

    // ── Parenthesized expression ────────────────────────────────────────

    #[test]
    fn parse_nested_parens() {
        let f = parse_formula("X = ((A + B))", "Cat").unwrap();
        assert!(matches!(f.expr, Expr::BinOp(BinOp::Add, _, _)));
    }

    // ── Aggregate function name used as ref (no parens) ─────────────────

    #[test]
    fn parse_aggregate_name_without_parens_is_ref() {
        // "SUM" without parens should be treated as a reference, not a function
        let f = parse_formula("X = SUM + 1", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::BinOp(BinOp::Add, lhs, _) if matches!(**lhs, Expr::Ref(_))
        ));
    }

    #[test]
    fn parse_if_without_parens_is_ref() {
        // "IF" without parens should be treated as a reference
        let f = parse_formula("X = IF + 1", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::BinOp(BinOp::Add, lhs, _) if matches!(**lhs, Expr::Ref(_))
        ));
    }

    // ── Quoted string in WHERE ──────────────────────────────────────────

    #[test]
    fn parse_quoted_string_in_where() {
        let f = parse_formula("X = Revenue WHERE Region = \"East\"", "Cat").unwrap();
        let filter = f.filter.as_ref().unwrap();
        assert_eq!(filter.item, "East");
    }

    // ── Error paths ─────────────────────────────────────────────────────

    #[test]
    fn parse_unexpected_token_error() {
        // Extra tokens after a valid expression
        assert!(parse_expr("1 + 2 3").is_err());
    }

    #[test]
    fn parse_unexpected_character_error() {
        assert!(parse_expr("@invalid").is_err());
    }

    #[test]
    fn parse_empty_expression_error() {
        assert!(parse_expr("").is_err());
    }

    // ── Multi-word identifiers must be pipe-quoted ──────────────────────

    #[test]
    fn multi_word_bare_identifier_is_rejected() {
        // Multi-word identifiers must be pipe-quoted; bare multi-word fails
        // the `bare_name`-compatible grammar rule.
        assert!(parse_formula("Total Revenue = Base Revenue + Bonus", "Foo").is_err());
    }

    // ── WHERE inside quotes in the expression ───────────────────────────

    #[test]
    fn where_inside_quotes_is_not_a_keyword() {
        // A filter value containing the literal text "WHERE" is parsed as
        // a string, not as a nested WHERE keyword.
        let f = parse_formula("X = Revenue WHERE Region = \"WHERE\"", "Foo").unwrap();
        let filter = f.filter.as_ref().unwrap();
        assert_eq!(filter.item, "WHERE");
    }

    // ── Pipe-quoted identifiers ─────────────────────────────────────────

    #[test]
    fn pipe_quoted_identifier_in_expression() {
        let f = parse_formula("|Total Revenue| = |Base Revenue| + Bonus", "Foo").unwrap();
        assert_eq!(f.target, "|Total Revenue|");
        assert!(matches!(
            &f.expr,
            Expr::BinOp(BinOp::Add, lhs, rhs)
                if matches!(**lhs, Expr::Ref(ref s) if s == "Base Revenue")
                    && matches!(**rhs, Expr::Ref(ref s) if s == "Bonus")
        ));
    }

    #[test]
    fn pipe_quoted_keyword_as_identifier() {
        // A category named "WHERE" can be referenced with pipes
        let f = parse_formula("X = |WHERE| + |SUM|", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::BinOp(BinOp::Add, lhs, rhs)
                if matches!(**lhs, Expr::Ref(ref s) if s == "WHERE")
                    && matches!(**rhs, Expr::Ref(ref s) if s == "SUM")
        ));
    }

    #[test]
    fn pipe_quoted_identifier_with_special_chars() {
        // Pipes allow characters that would normally break tokenization
        let f = parse_formula("X = |Revenue (USD)| + |Cost + Tax|", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::BinOp(BinOp::Add, lhs, rhs)
                if matches!(**lhs, Expr::Ref(ref s) if s == "Revenue (USD)")
                    && matches!(**rhs, Expr::Ref(ref s) if s == "Cost + Tax")
        ));
    }

    #[test]
    fn pipe_quoted_in_aggregate() {
        let f = parse_formula("X = SUM(|Net Revenue|)", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::Agg(AggFunc::Sum, inner, None)
                if matches!(**inner, Expr::Ref(ref s) if s == "Net Revenue")
        ));
    }

    #[test]
    fn pipe_quoted_in_where_filter_value() {
        let f = parse_formula("X = Revenue WHERE Region = |East Coast|", "Foo").unwrap();
        let filter = f.filter.as_ref().unwrap();
        assert_eq!(filter.item, "East Coast");
    }

    #[test]
    fn pipe_quoted_in_inline_where() {
        let f =
            parse_formula("X = SUM(Revenue WHERE |Region Name| = |East Coast|)", "Foo").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::Agg(AggFunc::Sum, _, Some(filter))
                if filter.category == "Region Name" && filter.item == "East Coast"
        ));
    }

    // ── Pipe-quoted escape semantics ────────────────────────────────────

    #[test]
    fn pipe_quoted_escape_literal_pipe() {
        // \| inside a pipe-quoted identifier is a literal pipe
        let f = parse_formula("X = |A\\|B|", "Cat").unwrap();
        assert!(matches!(&f.expr, Expr::Ref(s) if s == "A|B"));
    }

    #[test]
    fn pipe_quoted_escape_double_backslash() {
        // \\ inside a pipe-quoted identifier is a literal backslash
        let f = parse_formula("X = |A\\\\B|", "Cat").unwrap();
        assert!(matches!(&f.expr, Expr::Ref(s) if s == "A\\B"));
    }

    #[test]
    fn pipe_quoted_escape_newline() {
        // \n inside a pipe-quoted identifier is a literal newline
        let f = parse_formula("X = |A\\nB|", "Cat").unwrap();
        assert!(matches!(&f.expr, Expr::Ref(s) if s == "A\nB"));
    }

    #[test]
    fn pipe_quoted_unknown_escape_preserved() {
        // Any `\X` where X isn't |, \, or n is preserved verbatim as
        // backslash-plus-character. The grammar's `"\\" ~ ANY` allows
        // any following character; we just don't interpret it.
        let f = parse_formula("X = |A\\zB|", "Cat").unwrap();
        assert!(matches!(&f.expr, Expr::Ref(s) if s == "A\\zB"));
    }

    // ── Operator precedence and associativity ──────────────────────────

    #[test]
    fn mul_binds_tighter_than_add() {
        // `A + B * C` must parse as Add(A, Mul(B, C))
        let f = parse_formula("X = A + B * C", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::BinOp(BinOp::Add, lhs, rhs)
                if matches!(**lhs, Expr::Ref(_))
                    && matches!(**rhs, Expr::BinOp(BinOp::Mul, _, _))
        ));
    }

    #[test]
    fn pow_binds_tighter_than_mul() {
        // `A * B ^ C` must parse as Mul(A, Pow(B, C))
        let f = parse_formula("X = A * B ^ C", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::BinOp(BinOp::Mul, lhs, rhs)
                if matches!(**lhs, Expr::Ref(_))
                    && matches!(**rhs, Expr::BinOp(BinOp::Pow, _, _))
        ));
    }

    #[test]
    fn subtraction_is_left_associative() {
        // `A - B - C` must parse as Sub(Sub(A, B), C), not Sub(A, Sub(B, C))
        let f = parse_formula("X = A - B - C", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::BinOp(BinOp::Sub, lhs, rhs)
                if matches!(**lhs, Expr::BinOp(BinOp::Sub, _, _))
                    && matches!(**rhs, Expr::Ref(ref s) if s == "C")
        ));
    }

    #[test]
    fn division_is_left_associative() {
        // `A / B / C` must parse as Div(Div(A, B), C)
        let f = parse_formula("X = A / B / C", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::BinOp(BinOp::Div, lhs, rhs)
                if matches!(**lhs, Expr::BinOp(BinOp::Div, _, _))
                    && matches!(**rhs, Expr::Ref(ref s) if s == "C")
        ));
    }

    #[test]
    fn unary_minus_before_pow() {
        // `-A ^ B` must parse as Pow(UnaryMinus(A), B). The `-` binds
        // through `unary_minus = { "-" ~ primary }`, producing a unary
        // node that is then used as the pow_expr's base.
        let f = parse_formula("X = -A ^ B", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::BinOp(BinOp::Pow, lhs, rhs)
                if matches!(**lhs, Expr::UnaryMinus(_))
                    && matches!(**rhs, Expr::Ref(ref s) if s == "B")
        ));
    }

    // ── Number literal variants ────────────────────────────────────────

    #[test]
    fn integer_literal() {
        let f = parse_formula("X = 42", "Cat").unwrap();
        assert!(matches!(f.expr, Expr::Number(n) if (n - 42.0).abs() < 1e-10));
    }

    #[test]
    fn zero_literal() {
        let f = parse_formula("X = 0", "Cat").unwrap();
        assert!(matches!(f.expr, Expr::Number(n) if n == 0.0));
    }

    #[test]
    fn decimal_literal_without_integer_part() {
        let f = parse_formula("X = .5", "Cat").unwrap();
        assert!(matches!(f.expr, Expr::Number(n) if (n - 0.5).abs() < 1e-10));
    }

    #[test]
    fn decimal_literal_with_trailing_dot() {
        let f = parse_formula("X = 5.", "Cat").unwrap();
        assert!(matches!(f.expr, Expr::Number(n) if (n - 5.0).abs() < 1e-10));
    }

    #[test]
    fn decimal_literal_with_integer_and_fraction() {
        let f = parse_formula("X = 123.456", "Cat").unwrap();
        assert!(matches!(f.expr, Expr::Number(n) if (n - 123.456).abs() < 1e-10));
    }

    // ── Filter value variants ──────────────────────────────────────────

    #[test]
    fn where_with_bare_identifier_value() {
        // filter_value = { string | pipe_quoted | bare_ident } — exercise
        // the bare_ident branch.
        let f = parse_formula("X = Revenue WHERE Region = East", "Cat").unwrap();
        let filter = f.filter.as_ref().unwrap();
        assert_eq!(filter.category, "Region");
        assert_eq!(filter.item, "East");
    }

    // ── Nested constructs ──────────────────────────────────────────────

    #[test]
    fn nested_sum_aggregate() {
        // Nested aggregates — outer SUM wrapping an inner SUM.
        let f = parse_formula("X = SUM(SUM(Revenue))", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::Agg(AggFunc::Sum, outer_inner, None)
                if matches!(**outer_inner, Expr::Agg(AggFunc::Sum, _, None))
        ));
    }

    #[test]
    fn deeply_nested_parens() {
        // Parens should flatten away without affecting the AST.
        let f = parse_formula("X = (((((A + B)))))", "Cat").unwrap();
        assert!(matches!(f.expr, Expr::BinOp(BinOp::Add, _, _)));
    }

    #[test]
    fn nested_if_expression() {
        // IF in the then-branch of another IF.
        let f = parse_formula("X = IF(A > B, IF(C > D, 1, 2), 3)", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::If(_, then_e, else_e)
                if matches!(**then_e, Expr::If(_, _, _))
                    && matches!(**else_e, Expr::Number(n) if n == 3.0)
        ));
    }

    // ── Whitespace tolerance ───────────────────────────────────────────

    #[test]
    fn tolerates_tabs_between_tokens() {
        let f = parse_formula("X\t=\tA\t+\tB", "Cat").unwrap();
        assert!(matches!(f.expr, Expr::BinOp(BinOp::Add, _, _)));
    }

    #[test]
    fn tolerates_extra_spaces_between_tokens() {
        // Several spaces between every pair of tokens; with single
        // spaces this test would not exercise anything new.
        let f = parse_formula("X   =   A   +   B", "Cat").unwrap();
        assert!(matches!(f.expr, Expr::BinOp(BinOp::Add, _, _)));
    }

    #[test]
    fn tolerates_leading_and_trailing_whitespace() {
        // Padding on both ends is removed before parsing.
        let f = parse_formula("   X = A + B   ", "Cat").unwrap();
        assert_eq!(f.target, "X");
        assert!(matches!(f.expr, Expr::BinOp(BinOp::Add, _, _)));
    }

    // ── Case insensitivity of keywords ─────────────────────────────────

    #[test]
    fn aggregate_function_is_case_insensitive() {
        // The grammar uses ^"SUM" which is case-insensitive.
        let f = parse_formula("X = sum(Revenue)", "Cat").unwrap();
        assert!(matches!(f.expr, Expr::Agg(AggFunc::Sum, _, _)));

        let f = parse_formula("X = Sum(Revenue)", "Cat").unwrap();
        assert!(matches!(f.expr, Expr::Agg(AggFunc::Sum, _, _)));
    }

    #[test]
    fn if_keyword_is_case_insensitive() {
        let f = parse_formula("X = if(A > B, 1, 0)", "Cat").unwrap();
        assert!(matches!(f.expr, Expr::If(_, _, _)));
    }

    #[test]
    fn where_keyword_is_case_insensitive() {
        let f = parse_formula("X = Revenue where Region = \"East\"", "Cat").unwrap();
        assert!(f.filter.is_some());
    }

    // ── Target variants ────────────────────────────────────────────────

    #[test]
    fn target_with_underscore_and_hyphen() {
        let f = parse_formula("my_target-name = A", "Cat").unwrap();
        assert_eq!(f.target, "my_target-name");
    }

    /// Regression: the hand-rolled tokenizer didn't allow `-` inside bare
    /// identifiers, so references to persistence-legal names like
    /// `east-coast` were silently parsed as `east - coast` (a subtraction
    /// of two unrelated Refs). The pest grammar mirrors improv.pest's
    /// `bare_name`, which does allow `-`, so a hyphenated name now parses
    /// as a single identifier.
    #[test]
    fn hyphen_in_bare_identifier_reference() {
        let f = parse_formula("Total = east-coast + west-coast", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::BinOp(BinOp::Add, lhs, rhs)
                if matches!(**lhs, Expr::Ref(ref s) if s == "east-coast")
                    && matches!(**rhs, Expr::Ref(ref s) if s == "west-coast")
        ));
    }

    #[test]
    fn hyphen_in_bare_identifier_inside_aggregate() {
        // Same fix, but inside a SUM() to exercise the aggregate path as well.
        let f = parse_formula("X = SUM(east-coast)", "Cat").unwrap();
        assert!(matches!(
            &f.expr,
            Expr::Agg(AggFunc::Sum, inner, None)
                if matches!(**inner, Expr::Ref(ref s) if s == "east-coast")
        ));
    }

    #[test]
    fn pipe_quoted_target_preserves_pipes() {
        let f = parse_formula("|My Target| = A", "Cat").unwrap();
        assert_eq!(f.target, "|My Target|");
    }
}
|
|
|
|
// ---- grammar-driven proptests ------------------------------------------
|
|
//
|
|
// Uses pest_meta to read formula.pest at test time and walks the grammar
|
|
// AST to generate random valid formulas. Mirrors the pattern used by the
|
|
// persistence parser tests in src/persistence/mod.rs.
|
|
|
|
#[cfg(test)]
|
|
mod generator {
|
|
use pest_meta::ast::{Expr, RuleType};
|
|
use pest_meta::parser;
|
|
use proptest::prelude::*;
|
|
use std::collections::HashMap;
|
|
|
|
/// Parse formula.pest and return rules keyed by name.
|
|
fn load_grammar() -> HashMap<String, (RuleType, Expr)> {
|
|
let grammar = include_str!("formula.pest");
|
|
let pairs = parser::parse(parser::Rule::grammar_rules, grammar)
|
|
.unwrap_or_else(|e| panic!("Bad grammar: {e}"));
|
|
let rules = parser::consume_rules(pairs).unwrap_or_else(|e| panic!("{e:?}"));
|
|
rules
|
|
.into_iter()
|
|
.map(|r| (r.name.clone(), (r.ty, r.expr)))
|
|
.collect()
|
|
}
|
|
|
|
/// Recursive string generator driven by a pest `Expr`. `choices` is
|
|
/// consumed left-to-right at every decision point; the `atomic` flag
|
|
/// controls whether sequences insert a space between their children,
|
|
/// so that non-atomic grammar rules produce whitespace-separated
|
|
/// tokens that pest's implicit WHITESPACE handling accepts.
|
|
pub struct Gen<'g> {
|
|
rules: &'g HashMap<String, (RuleType, Expr)>,
|
|
choices: Vec<u8>,
|
|
pos: usize,
|
|
}
|
|
|
|
impl<'g> Gen<'g> {
|
|
pub fn new(rules: &'g HashMap<String, (RuleType, Expr)>, choices: Vec<u8>) -> Self {
|
|
Self {
|
|
rules,
|
|
choices,
|
|
pos: 0,
|
|
}
|
|
}
|
|
|
|
fn pick(&mut self) -> u8 {
|
|
let v = self.choices.get(self.pos).copied().unwrap_or(0);
|
|
self.pos += 1;
|
|
v
|
|
}
|
|
|
|
fn emit(&mut self, expr: &Expr, out: &mut String, atomic: bool) {
|
|
match expr {
|
|
Expr::Str(s) => out.push_str(s),
|
|
Expr::Insens(s) => out.push_str(s),
|
|
Expr::Ident(name) => self.emit_ident(name, out, atomic),
|
|
Expr::Seq(a, b) => {
|
|
self.emit(a, out, atomic);
|
|
if !atomic {
|
|
out.push(' ');
|
|
}
|
|
self.emit(b, out, atomic);
|
|
}
|
|
Expr::Choice(a, b) => {
|
|
// 50/50 between the two branches of each Choice.
|
|
// Nested Choices (used by rules with 3+ alternatives)
|
|
// recurse into themselves, so deeper branches are
|
|
// chosen less often than shallower ones — good
|
|
// enough for the random-generation use case.
|
|
if self.pick().is_multiple_of(2) {
|
|
self.emit(a, out, atomic);
|
|
} else {
|
|
self.emit(b, out, atomic);
|
|
}
|
|
}
|
|
Expr::Opt(inner) => {
|
|
// ~66% chance of emitting the optional branch.
|
|
if !self.pick().is_multiple_of(3) {
|
|
self.emit(inner, out, atomic);
|
|
}
|
|
}
|
|
Expr::Rep(inner) => {
|
|
let count = self.pick() % 4;
|
|
for i in 0..count {
|
|
if i > 0 && !atomic {
|
|
out.push(' ');
|
|
}
|
|
self.emit(inner, out, atomic);
|
|
}
|
|
}
|
|
Expr::RepOnce(inner) => {
|
|
// formula.pest only uses `+` inside atomic rules
|
|
// (ASCII_DIGIT+), so inner repetitions never need
|
|
// whitespace separation.
|
|
let count = 1 + self.pick() % 3;
|
|
for _ in 0..count {
|
|
self.emit(inner, out, atomic);
|
|
}
|
|
}
|
|
// Lookaheads (NegPred, PosPred) don't emit output. Any
|
|
// other Expr variant (Range, RepExact, Push, PeekSlice,
|
|
// Skip, …) is unused by formula.pest and silently
|
|
// produces nothing; if the grammar starts using one,
|
|
// generated formulas will fail to parse and we'll know.
|
|
_ => {}
|
|
}
|
|
}
|
|
|
|
fn emit_ident(&mut self, name: &str, out: &mut String, atomic: bool) {
|
|
match name {
|
|
"ANY" | "ASCII_ALPHA" => {
|
|
out.push((b'a' + self.pick() % 26) as char);
|
|
}
|
|
"ASCII_ALPHANUMERIC" => {
|
|
if self.pick().is_multiple_of(2) {
|
|
out.push((b'a' + self.pick() % 26) as char);
|
|
} else {
|
|
out.push((b'0' + self.pick() % 10) as char);
|
|
}
|
|
}
|
|
"ASCII_DIGIT" => {
|
|
out.push((b'0' + self.pick() % 10) as char);
|
|
}
|
|
"NEWLINE" => out.push('\n'),
|
|
"SOI" | "EOI" => {}
|
|
_ => {
|
|
// Every Ident in formula.pest refers to a rule that
|
|
// exists in the grammar file — if it doesn't, the
|
|
// grammar itself failed to compile and we wouldn't be
|
|
// running tests, so `expect` here is a bug marker.
|
|
let (ty, inner) = self
|
|
.rules
|
|
.get(name)
|
|
.cloned()
|
|
.expect("rule referenced by grammar exists");
|
|
let inner_atomic =
|
|
atomic || matches!(ty, RuleType::Atomic | RuleType::CompoundAtomic);
|
|
self.emit(&inner, out, inner_atomic);
|
|
}
|
|
}
|
|
}
|
|
|
|
pub fn generate(&mut self, rule_name: &str) -> String {
|
|
let mut out = String::new();
|
|
let (ty, expr) = self
|
|
.rules
|
|
.get(rule_name)
|
|
.cloned()
|
|
.expect("entry rule exists in formula.pest");
|
|
let atomic = matches!(ty, RuleType::Atomic | RuleType::CompoundAtomic);
|
|
self.emit(&expr, &mut out, atomic);
|
|
out
|
|
}
|
|
}
|
|
|
|
/// Proptest strategy: generate a random valid formula by walking the
|
|
/// grammar's `formula` rule.
|
|
pub fn formula_string() -> impl Strategy<Value = String> {
|
|
prop::collection::vec(any::<u8>(), 32..=128).prop_map(|choices| {
|
|
let rules = load_grammar();
|
|
Gen::new(&rules, choices).generate("formula")
|
|
})
|
|
}
|
|
|
|
/// Proptest strategy: generate a random valid standalone expression.
|
|
pub fn expr_string() -> impl Strategy<Value = String> {
|
|
prop::collection::vec(any::<u8>(), 32..=128).prop_map(|choices| {
|
|
let rules = load_grammar();
|
|
Gen::new(&rules, choices).generate("expr_eoi")
|
|
})
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod grammar_prop_tests {
    use super::generator;
    use super::{parse_expr, parse_formula};
    use proptest::prelude::*;

    proptest! {
        #![proptest_config(ProptestConfig::with_cases(256))]

        /// Any formula emitted by the grammar-driven generator must be
        /// accepted by `parse_formula`.
        #[test]
        fn generated_formula_parses(formula in generator::formula_string()) {
            if let Err(err) = parse_formula(&formula, "Cat") {
                prop_assert!(
                    false,
                    "Generated formula failed to parse:\n{}\nError: {}",
                    formula,
                    err
                );
            }
        }

        /// Any standalone expression emitted by the generator must be
        /// accepted by `parse_expr`.
        #[test]
        fn generated_expr_parses(expr in generator::expr_string()) {
            if let Err(err) = parse_expr(&expr) {
                prop_assert!(
                    false,
                    "Generated expression failed to parse:\n{}\nError: {}",
                    expr,
                    err
                );
            }
        }

        /// Parsing one input twice yields the same target and the same
        /// debug rendering of the expression AST. Inputs that fail to
        /// parse are discarded rather than reported here.
        #[test]
        fn parse_is_deterministic(formula in generator::formula_string()) {
            let r1 = parse_formula(&formula, "Cat");
            let r2 = parse_formula(&formula, "Cat");
            prop_assume!(r1.is_ok() && r2.is_ok());

            let (f1, f2) = (r1.unwrap(), r2.unwrap());
            prop_assert_eq!(&f1.target, &f2.target);

            let first_ast = format!("{:?}", f1.expr);
            let second_ast = format!("{:?}", f2.expr);
            prop_assert_eq!(first_ast, second_ast);
        }
    }
}
|