refactor!(formula): migrate parser to use pest

Replace the manual tokenizer and recursive descent parser with a PEG grammar using the pest library. This migration involves introducing a formal grammar in formula.pest and updating the parser implementation to utilize the generated Pest parser with a tree-walking approach to construct the AST. The change introduces a stricter requirement for identifiers: multi-word identifiers must now be enclosed in pipe quotes (e.g., |Total Revenue|) and are no longer accepted as bare words. Tests have been updated to reflect the new parsing logic, remove tokenizer-specific tests, and verify the new pipe-quoting and escape semantics. BREAKING CHANGE: Multi-word identifiers now require pipe-quoting (e.g. |Total Revenue|) and are no longer accepted as bare words. Co-Authored-By: fiddlerwoaroof/git-smart-commit (gemma-4-31B-it-UD-Q4_K_XL.gguf)
2026-04-15 04:04:57 -07:00
parent 38f83b2417
commit 3f69f88709
4 changed files with 404 additions and 446 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -756,6 +756,8 @@ name = "improvise-formula"
 version = "0.1.0-rc2"
 dependencies = [
 "anyhow",
 "pest",
 "pest_derive",
 "serde",
 ]
--- a/crates/improvise-formula/Cargo.toml
+++ b/crates/improvise-formula/Cargo.toml
@ -8,4 +8,6 @@ repository = "https://github.com/fiddlerwoaroof/improvise"
 [dependencies]
 anyhow = "1"
 pest = "2.8.6"
 pest_derive = "2.8.6"
 serde = { version = "1", features = ["derive"] }
--- a/crates/improvise-formula/src/formula.pest
+++ b/crates/improvise-formula/src/formula.pest
@ -0,0 +1,91 @@
 // Formula grammar for improvise.
 //
 // A formula has the form:   TARGET = EXPR [WHERE filter]
 // See parser.rs for the tree walker that produces a Formula AST.
 //
 // Identifier rules (bare_ident / pipe_quoted) mirror `bare_name` and
 // `pipe_quoted` in src/persistence/improv.pest: bare identifiers are
 // alphanumeric plus `_` and `-`, with no internal spaces; multi-word
 // names must be pipe-quoted.
 // Auto-skip horizontal whitespace between tokens in non-atomic rules.
 WHITESPACE = _{ " " | "\t" }
 // ---- top-level ----------------------------------------------------------
 formula = { SOI ~ target ~ "=" ~ expr ~ where_clause? ~ EOI }
 // The target keeps its raw text (including pipes, if any) — we capture
 // the span directly rather than walking into its children.
 target = { identifier }
 where_clause = { ^"WHERE" ~ identifier ~ "=" ~ filter_value }
 // ---- expressions --------------------------------------------------------
 // Used by parse_expr() — forces a standalone expression to consume the
 // whole input, so `1 + 2 3` fails instead of silently dropping " 3".
 expr_eoi = { SOI ~ expr ~ EOI }
 expr = { add_expr }
 add_expr = { mul_expr ~ (add_op ~ mul_expr)* }
 add_op   = { "+" | "-" }
 mul_expr = { pow_expr ~ (mul_op ~ pow_expr)* }
 mul_op   = { "*" | "/" }
 pow_expr = { unary ~ (pow_op ~ unary)? }
 pow_op   = { "^" }
 unary       = { unary_minus | primary }
 unary_minus = { "-" ~ primary }
 primary = {
      number
    | agg_call
    | if_expr
    | paren_expr
    | ref_expr
 }
 paren_expr = { "(" ~ expr ~ ")" }
 // Aggregates with optional inline WHERE filter inside the parens.
 agg_call     = { agg_func ~ "(" ~ expr ~ inline_where? ~ ")" }
 agg_func     = { ^"SUM" | ^"AVG" | ^"MIN" | ^"MAX" | ^"COUNT" }
 inline_where = { ^"WHERE" ~ identifier ~ "=" ~ filter_value }
 // IF(cond, then, else).  Comparison is a standalone rule because comparison
 // operators are not valid in general expressions — only inside an IF condition.
 if_expr    = { ^"IF" ~ "(" ~ comparison ~ "," ~ expr ~ "," ~ expr ~ ")" }
 comparison = { expr ~ cmp_op ~ expr }
 cmp_op     = { "!=" | "<=" | ">=" | "<" | ">" | "=" }
 // A reference to an item.  `SUM` and `IF` without parens fall through to
 // this rule because agg_call / if_expr require a "(" and otherwise fail.
 ref_expr = { identifier }
 // ---- identifiers --------------------------------------------------------
 //
 // Mirror of improv.pest's bare_name / pipe_quoted.
 identifier = ${ pipe_quoted | bare_ident }
 // Backslash escapes inside pipes: \| literal pipe, \\ backslash, \n newline.
 pipe_quoted = @{ "|" ~ ("\\" ~ ANY | !"|" ~ ANY)* ~ "|" }
 bare_ident = @{
    (ASCII_ALPHA | "_") ~ (ASCII_ALPHANUMERIC | "_" | "-")*
 }
 // ---- literal values -----------------------------------------------------
 filter_value = { string | pipe_quoted | bare_ident }
 string = @{ "\"" ~ (!"\"" ~ ANY)* ~ "\"" }
 number = @{
      ASCII_DIGIT+ ~ ("." ~ ASCII_DIGIT*)?
    | "." ~ ASCII_DIGIT+
 }
--- a/crates/improvise-formula/src/parser.rs
+++ b/crates/improvise-formula/src/parser.rs
@ -1,462 +1,321 @@
 use anyhow::{Result, anyhow};
 use pest::Parser as _;
 use pest::iterators::Pair;
 use pest_derive::Parser;
 use super::ast::{AggFunc, BinOp, Expr, Filter, Formula};
 #[derive(Parser)]
 #[grammar = "formula.pest"]
 struct FormulaParser;
 /// Parse a formula string like "Profit = Revenue - Cost"
 /// or "Tax = Revenue * 0.08 WHERE Region = \"East\""
 pub fn parse_formula(raw: &str, target_category: &str) -> Result<Formula> {
-    let raw = raw.trim();
+    let input = raw.trim();
    let formula_pair = FormulaParser::parse(Rule::formula, input)
        .map_err(|e| anyhow!("{}", e))?
        .next()
        .ok_or_else(|| anyhow!("empty parse result"))?;
    build_formula(formula_pair, input, target_category)
 }
-    // Split on first `=` to get target = expression
+/// Parse a bare expression (no target, no top-level WHERE clause).
-    let eq_pos = raw
+/// Fails if the input contains trailing tokens after a complete expression.
-        .find('=')
+pub fn parse_expr(s: &str) -> Result<Expr> {
-        .ok_or_else(|| anyhow!("Formula must contain '=': {raw}"))?;
+    let input = s.trim();
-    let target = raw[..eq_pos].trim().to_string();
+    let expr_pair = FormulaParser::parse(Rule::expr_eoi, input)
-    let rest = raw[eq_pos + 1..].trim();
+        .map_err(|e| anyhow!("{}", e))?
        .next()
        .ok_or_else(|| anyhow!("empty parse result"))?
        .into_inner()
        .next()
        .ok_or_else(|| anyhow!("missing expression in expr_eoi"))?;
    build_expr(expr_pair)
 }
-    // Check for WHERE clause at top level
+// ---- tree walkers -------------------------------------------------------
    let (expr_str, filter) = split_where(rest);
    let filter = filter.map(parse_where).transpose()?;
    let expr = parse_expr(expr_str.trim())?;
 fn build_formula(pair: Pair<Rule>, raw: &str, target_category: &str) -> Result<Formula> {
    let mut target = None;
    let mut expr = None;
    let mut filter = None;
    for inner in pair.into_inner() {
        match inner.as_rule() {
            Rule::target => target = Some(inner.as_str().trim().to_string()),
            Rule::expr => expr = Some(build_expr(inner)?),
            Rule::where_clause => filter = Some(build_filter(inner)?),
            Rule::EOI => {}
            r => return Err(anyhow!("unexpected rule in formula: {:?}", r)),
        }
    }
    let target = target.ok_or_else(|| anyhow!("missing target in formula"))?;
    let expr = expr.ok_or_else(|| anyhow!("missing expression in formula"))?;
    Ok(Formula::new(raw, target, target_category, expr, filter))
 }
-fn split_where(s: &str) -> (&str, Option<&str>) {
+fn build_expr(pair: Pair<Rule>) -> Result<Expr> {
-    // Find WHERE not inside parens or quotes
+    // expr = { add_expr }
-    let bytes = s.as_bytes();
+    build_add_expr(first_inner(pair, "expr")?)
    let mut depth = 0i32;
    let mut i = 0;
    while i < bytes.len() {
        match bytes[i] {
            b'(' => depth += 1,
            b')' => depth -= 1,
            b'"' => {
                i += 1;
                while i < bytes.len() && bytes[i] != b'"' {
                    i += 1;
                }
            }
            b'|' => {
                i += 1;
                while i < bytes.len() && bytes[i] != b'|' {
                    i += 1;
                }
            }
            _ if depth == 0 => {
                if s[i..].to_ascii_uppercase().starts_with("WHERE") {
                    let before = &s[..i];
                    let after = &s[i + 5..];
                    if before.ends_with(char::is_whitespace) || i == 0 {
                        return (before.trim(), Some(after.trim()));
                    }
                }
            }
            _ => {}
        }
        i += 1;
    }
    (s, None)
 }
-/// Strip pipe or double-quote delimiters from a value.
+fn build_add_expr(pair: Pair<Rule>) -> Result<Expr> {
-fn unquote(s: &str) -> String {
+    fold_left_binop(pair, build_mul_expr, |s| match s {
-    let s = s.trim();
+        "+" => Some(BinOp::Add),
-    if (s.starts_with('"') && s.ends_with('"')) || (s.starts_with('|') && s.ends_with('|')) {
+        "-" => Some(BinOp::Sub),
-        s[1..s.len() - 1].to_string()
+        _ => None,
    })
 }
 fn build_mul_expr(pair: Pair<Rule>) -> Result<Expr> {
    fold_left_binop(pair, build_pow_expr, |s| match s {
        "*" => Some(BinOp::Mul),
        "/" => Some(BinOp::Div),
        _ => None,
    })
 }
 fn build_pow_expr(pair: Pair<Rule>) -> Result<Expr> {
    // pow_expr = { unary ~ (pow_op ~ unary)? }
    let mut pairs = pair.into_inner();
    let base_pair = pairs
        .next()
        .ok_or_else(|| anyhow!("empty pow_expr"))?;
    let base = build_unary(base_pair)?;
    match pairs.next() {
        None => Ok(base),
        Some(op_pair) => {
            debug_assert_eq!(op_pair.as_rule(), Rule::pow_op);
            let exp_pair = pairs
                .next()
                .ok_or_else(|| anyhow!("missing exponent in pow_expr"))?;
            let exp = build_unary(exp_pair)?;
            Ok(Expr::BinOp(BinOp::Pow, Box::new(base), Box::new(exp)))
        }
    }
 }
 fn build_unary(pair: Pair<Rule>) -> Result<Expr> {
    // unary = { unary_minus | primary }
    let inner = first_inner(pair, "unary")?;
    match inner.as_rule() {
        Rule::unary_minus => {
            let prim = first_inner(inner, "unary_minus")?;
            Ok(Expr::UnaryMinus(Box::new(build_primary(prim)?)))
        }
        Rule::primary => build_primary(inner),
        r => Err(anyhow!("unexpected rule in unary: {:?}", r)),
    }
 }
 fn build_primary(pair: Pair<Rule>) -> Result<Expr> {
    // primary = { number | agg_call | if_expr | paren_expr | ref_expr }
    let inner = first_inner(pair, "primary")?;
    match inner.as_rule() {
        Rule::number => Ok(Expr::Number(inner.as_str().parse()?)),
        Rule::agg_call => build_agg_call(inner),
        Rule::if_expr => build_if_expr(inner),
        Rule::paren_expr => build_expr(first_inner(inner, "paren_expr")?),
        Rule::ref_expr => {
            let id_pair = first_inner(inner, "ref_expr")?;
            Ok(Expr::Ref(identifier_to_string(id_pair)))
        }
        r => Err(anyhow!("unexpected rule in primary: {:?}", r)),
    }
 }
 fn build_agg_call(pair: Pair<Rule>) -> Result<Expr> {
    // agg_call = { agg_func ~ "(" ~ expr ~ inline_where? ~ ")" }
    let mut pairs = pair.into_inner();
    let func_pair = pairs
        .next()
        .ok_or_else(|| anyhow!("missing agg_func"))?;
    let func = parse_agg_func(func_pair.as_str())?;
    let expr_pair = pairs
        .next()
        .ok_or_else(|| anyhow!("missing aggregate argument"))?;
    let inner_expr = build_expr(expr_pair)?;
    let filter = match pairs.next() {
        Some(p) if p.as_rule() == Rule::inline_where => Some(build_filter(p)?),
        _ => None,
    };
    Ok(Expr::Agg(func, Box::new(inner_expr), filter))
 }
 fn parse_agg_func(s: &str) -> Result<AggFunc> {
    match s.to_ascii_uppercase().as_str() {
        "SUM" => Ok(AggFunc::Sum),
        "AVG" => Ok(AggFunc::Avg),
        "MIN" => Ok(AggFunc::Min),
        "MAX" => Ok(AggFunc::Max),
        "COUNT" => Ok(AggFunc::Count),
        f => Err(anyhow!("unknown aggregate function: {}", f)),
    }
 }
 fn build_if_expr(pair: Pair<Rule>) -> Result<Expr> {
    // if_expr = { ^"IF" ~ "(" ~ comparison ~ "," ~ expr ~ "," ~ expr ~ ")" }
    let mut pairs = pair.into_inner();
    let cond_pair = pairs
        .next()
        .ok_or_else(|| anyhow!("missing IF condition"))?;
    let cond = build_comparison(cond_pair)?;
    let then_pair = pairs
        .next()
        .ok_or_else(|| anyhow!("missing IF then-branch"))?;
    let then_e = build_expr(then_pair)?;
    let else_pair = pairs
        .next()
        .ok_or_else(|| anyhow!("missing IF else-branch"))?;
    let else_e = build_expr(else_pair)?;
    Ok(Expr::If(
        Box::new(cond),
        Box::new(then_e),
        Box::new(else_e),
    ))
 }
 fn build_comparison(pair: Pair<Rule>) -> Result<Expr> {
    // comparison = { expr ~ cmp_op ~ expr }
    let mut pairs = pair.into_inner();
    let lhs_pair = pairs
        .next()
        .ok_or_else(|| anyhow!("missing comparison lhs"))?;
    let lhs = build_expr(lhs_pair)?;
    let op_pair = pairs
        .next()
        .ok_or_else(|| anyhow!("missing comparison operator"))?;
    let op = parse_cmp_op(op_pair.as_str())?;
    let rhs_pair = pairs
        .next()
        .ok_or_else(|| anyhow!("missing comparison rhs"))?;
    let rhs = build_expr(rhs_pair)?;
    Ok(Expr::BinOp(op, Box::new(lhs), Box::new(rhs)))
 }
 fn parse_cmp_op(s: &str) -> Result<BinOp> {
    match s {
        "=" => Ok(BinOp::Eq),
        "!=" => Ok(BinOp::Ne),
        "<" => Ok(BinOp::Lt),
        ">" => Ok(BinOp::Gt),
        "<=" => Ok(BinOp::Le),
        ">=" => Ok(BinOp::Ge),
        o => Err(anyhow!("unknown comparison operator: {}", o)),
    }
 }
 fn build_filter(pair: Pair<Rule>) -> Result<Filter> {
    // where_clause / inline_where both have shape:
    //   ^"WHERE" ~ identifier ~ "=" ~ filter_value
    let mut pairs = pair.into_inner();
    let cat_pair = pairs
        .next()
        .ok_or_else(|| anyhow!("missing WHERE category"))?;
    let category = identifier_to_string(cat_pair);
    let val_pair = pairs
        .next()
        .ok_or_else(|| anyhow!("missing WHERE value"))?;
    let item = filter_value_to_string(val_pair);
    Ok(Filter { category, item })
 }
 fn filter_value_to_string(pair: Pair<Rule>) -> String {
    // filter_value = { string | pipe_quoted | bare_ident }
    let inner = pair
        .into_inner()
        .next()
        .expect("filter_value must have an inner pair");
    let s = inner.as_str();
    match inner.as_rule() {
        Rule::string => strip_string_quotes(s),
        Rule::pipe_quoted => unquote_pipe(s),
        _ => s.to_string(),
    }
 }
 /// Convert an identifier pair (identifier, pipe_quoted, or bare_ident) to
 /// its content string.  Pipe-quoted identifiers have their delimiters
 /// stripped and backslash escapes applied; bare identifiers are returned
 /// verbatim.
 fn identifier_to_string(pair: Pair<Rule>) -> String {
    let s = pair.as_str();
    if is_pipe_quoted(s) {
        unquote_pipe(s)
    } else {
        s.to_string()
    }
 }
-fn parse_where(s: &str) -> Result<Filter> {
+fn is_pipe_quoted(s: &str) -> bool {
-    // Format: Category = "Item" or Category = |Item| or Category = Item
+    s.len() >= 2 && s.starts_with('|') && s.ends_with('|')
    let eq_pos = s
        .find('=')
        .ok_or_else(|| anyhow!("WHERE clause must contain '=': {s}"))?;
    let category = unquote(&s[..eq_pos]);
    let item = unquote(&s[eq_pos + 1..]);
    Ok(Filter { category, item })
 }
-/// Parse an expression using recursive descent
+fn strip_string_quotes(s: &str) -> String {
-pub fn parse_expr(s: &str) -> Result<Expr> {
+    debug_assert!(s.len() >= 2 && s.starts_with('"') && s.ends_with('"'));
-    let tokens = tokenize(s)?;
+    s[1..s.len() - 1].to_string()
    let mut pos = 0;
    let expr = parse_add_sub(&tokens, &mut pos)?;
    if pos < tokens.len() {
        return Err(anyhow!(
            "Unexpected token at position {pos}: {:?}",
            tokens[pos]
        ));
    }
    Ok(expr)
 }
-#[derive(Debug, Clone, PartialEq)]
+/// Strip surrounding pipes and apply backslash escapes: `\|` → `|`,
-enum Token {
+/// `\\` → `\`, `\n` → newline.  Matches the escape semantics documented
-    Number(f64),
+/// in src/persistence/improv.pest.
-    Ident(String),
+fn unquote_pipe(s: &str) -> String {
-    Str(String),
+    debug_assert!(is_pipe_quoted(s));
-    Plus,
+    let inner = &s[1..s.len() - 1];
-    Minus,
+    let mut out = String::with_capacity(inner.len());
-    Star,
+    let mut chars = inner.chars();
-    Slash,
+    while let Some(c) = chars.next() {
-    Caret,
+        if c == '\\' {
-    LParen,
+            match chars.next() {
-    RParen,
+                Some('|') => out.push('|'),
-    Comma,
+                Some('\\') => out.push('\\'),
-    Eq,
+                Some('n') => out.push('\n'),
-    Ne,
+                Some(other) => {
-    Lt,
+                    out.push('\\');
-    Gt,
+                    out.push(other);
    Le,
    Ge,
 }
 fn tokenize(s: &str) -> Result<Vec<Token>> {
    let mut tokens = Vec::new();
    let chars: Vec<char> = s.chars().collect();
    let mut i = 0;
    while i < chars.len() {
        match chars[i] {
            ' ' | '\t' | '\n' => i += 1,
            '+' => {
                tokens.push(Token::Plus);
                i += 1;
            }
            '-' => {
                tokens.push(Token::Minus);
                i += 1;
            }
            '*' => {
                tokens.push(Token::Star);
                i += 1;
            }
            '/' => {
                tokens.push(Token::Slash);
                i += 1;
            }
            '^' => {
                tokens.push(Token::Caret);
                i += 1;
            }
            '(' => {
                tokens.push(Token::LParen);
                i += 1;
            }
            ')' => {
                tokens.push(Token::RParen);
                i += 1;
            }
            ',' => {
                tokens.push(Token::Comma);
                i += 1;
            }
            '!' if chars.get(i + 1) == Some(&'=') => {
                tokens.push(Token::Ne);
                i += 2;
            }
            '<' if chars.get(i + 1) == Some(&'=') => {
                tokens.push(Token::Le);
                i += 2;
            }
            '>' if chars.get(i + 1) == Some(&'=') => {
                tokens.push(Token::Ge);
                i += 2;
            }
            '<' => {
                tokens.push(Token::Lt);
                i += 1;
            }
            '>' => {
                tokens.push(Token::Gt);
                i += 1;
            }
            '=' => {
                tokens.push(Token::Eq);
                i += 1;
            }
            '"' => {
                i += 1;
                let mut s = String::new();
                while i < chars.len() && chars[i] != '"' {
                    s.push(chars[i]);
                    i += 1;
                }
-                if i < chars.len() {
+                None => out.push('\\'),
                    i += 1;
                }
                tokens.push(Token::Str(s));
            }
-            '|' => {
+        } else {
-                i += 1;
+            out.push(c);
                let mut s = String::new();
                while i < chars.len() && chars[i] != '|' {
                    s.push(chars[i]);
                    i += 1;
                }
                if i < chars.len() {
                    i += 1;
                }
                tokens.push(Token::Ident(s));
            }
            c if c.is_ascii_digit() || c == '.' => {
                let mut num = String::new();
                while i < chars.len() && (chars[i].is_ascii_digit() || chars[i] == '.') {
                    num.push(chars[i]);
                    i += 1;
                }
                tokens.push(Token::Number(num.parse()?));
            }
            c if c.is_alphabetic() || c == '_' => {
                let mut ident = String::new();
                while i < chars.len()
                    && (chars[i].is_alphanumeric() || chars[i] == '_' || chars[i] == ' ')
                {
                    // Don't consume trailing spaces if next non-space is operator
                    if chars[i] == ' ' {
                        // Peek ahead past spaces to find the next word/token
                        let j = i + 1;
                        let next_nonspace = chars[j..].iter().find(|&&c| c != ' ');
                        if matches!(
                            next_nonspace,
                            Some('+')
                                | Some('-')
                                | Some('*')
                                | Some('/')
                                | Some('^')
                                | Some(')')
                                | Some(',')
                                | Some('<')
                                | Some('>')
                                | Some('=')
                                | Some('!')
                                | Some('"')
                                | None
                        ) {
                            break;
                        }
                        // Break if the identifier collected so far is a keyword
                        let trimmed = ident.trim_end().to_ascii_uppercase();
                        if matches!(
                            trimmed.as_str(),
                            "WHERE" | "SUM" | "AVG" | "MIN" | "MAX" | "COUNT" | "IF"
                        ) {
                            break;
                        }
                        // Also break if the next word is a keyword
                        let rest: String = chars[j..].iter().collect();
                        let next_word: String = rest
                            .trim_start()
                            .chars()
                            .take_while(|c| c.is_alphanumeric() || *c == '_')
                            .collect();
                        let upper = next_word.to_ascii_uppercase();
                        if matches!(
                            upper.as_str(),
                            "WHERE" | "SUM" | "AVG" | "MIN" | "MAX" | "COUNT" | "IF"
                        ) {
                            break;
                        }
                    }
                    ident.push(chars[i]);
                    i += 1;
                }
                let ident = ident.trim_end().to_string();
                tokens.push(Token::Ident(ident));
            }
            c => return Err(anyhow!("Unexpected character '{c}' in expression")),
        }
    }
-    Ok(tokens)
+    out
 }
-fn parse_add_sub(tokens: &[Token], pos: &mut usize) -> Result<Expr> {
+// ---- small helpers ------------------------------------------------------
-    let mut left = parse_mul_div(tokens, pos)?;
+
-    while *pos < tokens.len() {
+fn first_inner<'a>(pair: Pair<'a, Rule>, ctx: &str) -> Result<Pair<'a, Rule>> {
-        let op = match &tokens[*pos] {
+    pair.into_inner()
-            Token::Plus => BinOp::Add,
+        .next()
-            Token::Minus => BinOp::Sub,
+        .ok_or_else(|| anyhow!("empty rule: {}", ctx))
-            _ => break,
+}
-        };
+
-        *pos += 1;
+/// Fold a left-associative binary-operator rule of the shape
-        let right = parse_mul_div(tokens, pos)?;
+/// `rule = { child ~ (op ~ child)* }` into a left-leaning BinOp tree.
 fn fold_left_binop<F, M>(pair: Pair<Rule>, mut build_child: F, match_op: M) -> Result<Expr>
 where
    F: FnMut(Pair<Rule>) -> Result<Expr>,
    M: Fn(&str) -> Option<BinOp>,
 {
    let mut pairs = pair.into_inner();
    let first = pairs
        .next()
        .ok_or_else(|| anyhow!("empty binop rule"))?;
    let mut left = build_child(first)?;
    while let Some(op_pair) = pairs.next() {
        let op = match_op(op_pair.as_str()).ok_or_else(|| {
            anyhow!("unexpected operator token: {:?}", op_pair.as_str())
        })?;
        let right_pair = pairs
            .next()
            .ok_or_else(|| anyhow!("missing rhs for operator"))?;
        let right = build_child(right_pair)?;
        left = Expr::BinOp(op, Box::new(left), Box::new(right));
    }
    Ok(left)
 }
 fn parse_mul_div(tokens: &[Token], pos: &mut usize) -> Result<Expr> {
    let mut left = parse_pow(tokens, pos)?;
    while *pos < tokens.len() {
        let op = match &tokens[*pos] {
            Token::Star => BinOp::Mul,
            Token::Slash => BinOp::Div,
            _ => break,
        };
        *pos += 1;
        let right = parse_pow(tokens, pos)?;
        left = Expr::BinOp(op, Box::new(left), Box::new(right));
    }
    Ok(left)
 }
 fn parse_pow(tokens: &[Token], pos: &mut usize) -> Result<Expr> {
    let base = parse_unary(tokens, pos)?;
    if *pos < tokens.len() && tokens[*pos] == Token::Caret {
        *pos += 1;
        let exp = parse_unary(tokens, pos)?;
        return Ok(Expr::BinOp(BinOp::Pow, Box::new(base), Box::new(exp)));
    }
    Ok(base)
 }
 fn parse_unary(tokens: &[Token], pos: &mut usize) -> Result<Expr> {
    if *pos < tokens.len() && tokens[*pos] == Token::Minus {
        *pos += 1;
        let e = parse_primary(tokens, pos)?;
        return Ok(Expr::UnaryMinus(Box::new(e)));
    }
    parse_primary(tokens, pos)
 }
 fn parse_primary(tokens: &[Token], pos: &mut usize) -> Result<Expr> {
    if *pos >= tokens.len() {
        return Err(anyhow!("Unexpected end of expression"));
    }
    match &tokens[*pos].clone() {
        Token::Number(n) => {
            *pos += 1;
            Ok(Expr::Number(*n))
        }
        Token::Ident(name) => {
            let name = name.clone();
            *pos += 1;
            // Check for function call
            let upper = name.to_ascii_uppercase();
            match upper.as_str() {
                "SUM" | "AVG" | "MIN" | "MAX" | "COUNT" => {
                    let func = match upper.as_str() {
                        "SUM" => AggFunc::Sum,
                        "AVG" => AggFunc::Avg,
                        "MIN" => AggFunc::Min,
                        "MAX" => AggFunc::Max,
                        "COUNT" => AggFunc::Count,
                        _ => unreachable!(),
                    };
                    if *pos < tokens.len() && tokens[*pos] == Token::LParen {
                        *pos += 1;
                        let inner = parse_add_sub(tokens, pos)?;
                        // Optional WHERE filter
                        let filter = if *pos < tokens.len() {
                            if let Token::Ident(kw) = &tokens[*pos] {
                                if kw.eq_ignore_ascii_case("WHERE") {
                                    *pos += 1;
                                    let cat = match &tokens[*pos] {
                                        Token::Ident(s) => {
                                            let s = s.clone();
                                            *pos += 1;
                                            s
                                        }
                                        t => {
                                            return Err(anyhow!(
                                                "Expected category name, got {t:?}"
                                            ));
                                        }
                                    };
                                    // expect =
                                    if *pos < tokens.len() && tokens[*pos] == Token::Eq {
                                        *pos += 1;
                                    }
                                    let item = match &tokens[*pos] {
                                        Token::Str(s) | Token::Ident(s) => {
                                            let s = s.clone();
                                            *pos += 1;
                                            s
                                        }
                                        t => return Err(anyhow!("Expected item name, got {t:?}")),
                                    };
                                    Some(Filter {
                                        category: cat,
                                        item,
                                    })
                                } else {
                                    None
                                }
                            } else {
                                None
                            }
                        } else {
                            None
                        };
                        // expect )
                        if *pos < tokens.len() && tokens[*pos] == Token::RParen {
                            *pos += 1;
                        } else {
                            return Err(anyhow!("Expected ')' to close aggregate function"));
                        }
                        return Ok(Expr::Agg(func, Box::new(inner), filter));
                    }
                    Ok(Expr::Ref(name))
                }
                "IF" => {
                    if *pos < tokens.len() && tokens[*pos] == Token::LParen {
                        *pos += 1;
                        let cond = parse_comparison(tokens, pos)?;
                        if *pos < tokens.len() && tokens[*pos] == Token::Comma {
                            *pos += 1;
                        }
                        let then = parse_add_sub(tokens, pos)?;
                        if *pos < tokens.len() && tokens[*pos] == Token::Comma {
                            *pos += 1;
                        }
                        let else_ = parse_add_sub(tokens, pos)?;
                        if *pos < tokens.len() && tokens[*pos] == Token::RParen {
                            *pos += 1;
                        } else {
                            return Err(anyhow!("Expected ')' to close IF(...)"));
                        }
                        return Ok(Expr::If(Box::new(cond), Box::new(then), Box::new(else_)));
                    }
                    Ok(Expr::Ref(name))
                }
                _ => Ok(Expr::Ref(name)),
            }
        }
        Token::LParen => {
            *pos += 1;
            let e = parse_add_sub(tokens, pos)?;
            if *pos < tokens.len() && tokens[*pos] == Token::RParen {
                *pos += 1;
            }
            Ok(e)
        }
        t => Err(anyhow!("Unexpected token in expression: {t:?}")),
    }
 }
 fn parse_comparison(tokens: &[Token], pos: &mut usize) -> Result<Expr> {
    let left = parse_add_sub(tokens, pos)?;
    if *pos >= tokens.len() {
        return Ok(left);
    }
    let op = match &tokens[*pos] {
        Token::Eq => BinOp::Eq,
        Token::Ne => BinOp::Ne,
        Token::Lt => BinOp::Lt,
        Token::Gt => BinOp::Gt,
        Token::Le => BinOp::Le,
        Token::Ge => BinOp::Ge,
        _ => return Ok(left),
    };
    *pos += 1;
    let right = parse_add_sub(tokens, pos)?;
    Ok(Expr::BinOp(op, Box::new(left), Box::new(right)))
 }
 #[cfg(test)]
 mod tests {
    use super::parse_formula;
@ -544,8 +403,8 @@ mod tests {
        assert_eq!(filter.item, "East");
    }
-    /// Regression: WHERE inside aggregate parens must tokenize correctly.
+    /// Regression: WHERE inside aggregate parens must parse as the
-    /// The tokenizer must not merge "Revenue WHERE" into a single identifier.
+    /// aggregate's inline filter, not as a top-level WHERE clause.
    #[test]
    fn parse_sum_with_inline_where_filter() {
        let f = parse_formula("EastTotal = SUM(Revenue WHERE Region = \"East\")", "Foo").unwrap();
@ -562,7 +421,6 @@ mod tests {
    #[test]
    fn parse_if_with_comparison_operators() {
        // Test each comparison operator in an IF expression
        let f = parse_formula("X = IF(A != 0, A, 1)", "Cat").unwrap();
        assert!(matches!(f.expr, Expr::If(_, _, _)));
@ -583,7 +441,6 @@ mod tests {
    #[test]
    fn parse_where_with_quoted_string_inside_expression() {
        // WHERE inside a formula string with quotes
        let f = parse_formula("X = Revenue WHERE Region = \"West Coast\"", "Foo").unwrap();
        let filter = f.filter.as_ref().unwrap();
        assert_eq!(filter.item, "West Coast");
@ -650,11 +507,10 @@ mod tests {
        }
    }
-    // ── Quoted string in tokenizer ──────────────────────────────────────
+    // ── Quoted string in WHERE ──────────────────────────────────────────
    #[test]
    fn parse_quoted_string_in_where() {
        // Quoted strings work in top-level WHERE clauses
        let f = parse_formula("X = Revenue WHERE Region = \"East\"", "Cat").unwrap();
        let filter = f.filter.as_ref().unwrap();
        assert_eq!(filter.item, "East");
@ -681,27 +537,21 @@ mod tests {
        assert!(parse_expr("").is_err());
    }
    // ── Multi-word identifiers must be pipe-quoted ──────────────────────
    #[test]
-    fn tokenizer_breaks_at_where_keyword() {
+    fn multi_word_bare_identifier_is_rejected() {
-        use super::tokenize;
+        // Multi-word identifiers must be pipe-quoted; bare multi-word fails
-        let tokens = tokenize("Revenue WHERE Region").unwrap();
+        // the `bare_name`-compatible grammar rule.
-        // Should produce 3 tokens: Ident("Revenue"), Ident("WHERE"), Ident("Region")
+        assert!(parse_formula("Total Revenue = Base Revenue + Bonus", "Foo").is_err());
        assert_eq!(tokens.len(), 3, "Expected 3 tokens, got: {tokens:?}");
    }
-    // ── Multi-word identifiers ──────────────────────────────────────────
+    // ── WHERE inside quotes in the expression ───────────────────────────
    #[test]
-    fn parse_multi_word_identifier() {
+    fn where_inside_quotes_is_not_a_keyword() {
-        let f = parse_formula("Total Revenue = Base Revenue + Bonus", "Foo").unwrap();
+        // A filter value containing the literal text "WHERE" is parsed as
-        assert_eq!(f.target, "Total Revenue");
+        // a string, not as a nested WHERE keyword.
    }
    // ── WHERE inside quotes in split_where ──────────────────────────────
    #[test]
    fn split_where_ignores_where_inside_quotes() {
        // WHERE inside quotes should not be treated as a keyword
        let f = parse_formula("X = Revenue WHERE Region = \"WHERE\"", "Foo").unwrap();
        let filter = f.filter.as_ref().unwrap();
        assert_eq!(filter.item, "WHERE");
@ -773,4 +623,17 @@ mod tests {
            panic!("Expected SUM with WHERE filter, got: {:?}", f.expr);
        }
    }
    // ── Pipe-quoted escape semantics ────────────────────────────────────
    #[test]
    fn pipe_quoted_escape_literal_pipe() {
        // \| inside a pipe-quoted identifier is a literal pipe
        let f = parse_formula("X = |A\\|B|", "Cat").unwrap();
        if let Expr::Ref(ref s) = f.expr {
            assert_eq!(s, "A|B");
        } else {
            panic!("Expected Ref, got: {:?}", f.expr);
        }
    }
 }