refactor!(formula): migrate parser to use pest

Replace the manual tokenizer and recursive descent parser with a PEG
grammar using the pest library.

This migration involves introducing a formal grammar in formula.pest and
updating the parser implementation to utilize the generated Pest parser
with a tree-walking approach to construct the AST.

The change introduces a stricter requirement for identifiers: multi-word
identifiers must now be enclosed in pipe quotes (e.g., |Total Revenue|) and
are no longer accepted as bare words.

Tests have been updated to reflect the new parsing logic, remove
tokenizer-specific tests, and verify the new pipe-quoting and escape
semantics.

BREAKING CHANGE: Multi-word identifiers now require pipe-quoting (e.g. |Total Revenue|) and
are no longer accepted as bare words.
Co-Authored-By: fiddlerwoaroof/git-smart-commit (gemma-4-31B-it-UD-Q4_K_XL.gguf)
This commit is contained in:
Edward Langley
2026-04-15 04:04:57 -07:00
parent 38f83b2417
commit 3f69f88709
4 changed files with 404 additions and 446 deletions

2
Cargo.lock generated
View File

@ -756,6 +756,8 @@ name = "improvise-formula"
version = "0.1.0-rc2" version = "0.1.0-rc2"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"pest",
"pest_derive",
"serde", "serde",
] ]

View File

@ -8,4 +8,6 @@ repository = "https://github.com/fiddlerwoaroof/improvise"
[dependencies] [dependencies]
anyhow = "1" anyhow = "1"
pest = "2.8.6"
pest_derive = "2.8.6"
serde = { version = "1", features = ["derive"] } serde = { version = "1", features = ["derive"] }

View File

@ -0,0 +1,91 @@
// Formula grammar for improvise.
//
// A formula has the form: TARGET = EXPR [WHERE filter]
// See parser.rs for the tree walker that produces a Formula AST.
//
// Identifier rules (bare_ident / pipe_quoted) mirror `bare_name` and
// `pipe_quoted` in src/persistence/improv.pest: bare identifiers are
// alphanumeric plus `_` and `-`, with no internal spaces; multi-word
// names must be pipe-quoted.
// Auto-skip horizontal whitespace between tokens in non-atomic rules.
WHITESPACE = _{ " " | "\t" }
// ---- top-level ----------------------------------------------------------
formula = { SOI ~ target ~ "=" ~ expr ~ where_clause? ~ EOI }
// The target keeps its raw text (including pipes, if any) — we capture
// the span directly rather than walking into its children.
target = { identifier }
where_clause = { ^"WHERE" ~ identifier ~ "=" ~ filter_value }
// ---- expressions --------------------------------------------------------
// Used by parse_expr() — forces a standalone expression to consume the
// whole input, so `1 + 2 3` fails instead of silently dropping " 3".
expr_eoi = { SOI ~ expr ~ EOI }
expr = { add_expr }
add_expr = { mul_expr ~ (add_op ~ mul_expr)* }
add_op = { "+" | "-" }
mul_expr = { pow_expr ~ (mul_op ~ pow_expr)* }
mul_op = { "*" | "/" }
pow_expr = { unary ~ (pow_op ~ unary)? }
pow_op = { "^" }
unary = { unary_minus | primary }
unary_minus = { "-" ~ primary }
primary = {
number
| agg_call
| if_expr
| paren_expr
| ref_expr
}
paren_expr = { "(" ~ expr ~ ")" }
// Aggregates with optional inline WHERE filter inside the parens.
agg_call = { agg_func ~ "(" ~ expr ~ inline_where? ~ ")" }
agg_func = { ^"SUM" | ^"AVG" | ^"MIN" | ^"MAX" | ^"COUNT" }
inline_where = { ^"WHERE" ~ identifier ~ "=" ~ filter_value }
// IF(cond, then, else). Comparison is a standalone rule because comparison
// operators are not valid in general expressions — only inside an IF condition.
if_expr = { ^"IF" ~ "(" ~ comparison ~ "," ~ expr ~ "," ~ expr ~ ")" }
comparison = { expr ~ cmp_op ~ expr }
cmp_op = { "!=" | "<=" | ">=" | "<" | ">" | "=" }
// A reference to an item. `SUM` and `IF` without parens fall through to
// this rule because agg_call / if_expr require a "(" and otherwise fail.
ref_expr = { identifier }
// ---- identifiers --------------------------------------------------------
//
// Mirror of improv.pest's bare_name / pipe_quoted.
identifier = ${ pipe_quoted | bare_ident }
// Backslash escapes inside pipes: \| literal pipe, \\ backslash, \n newline.
pipe_quoted = @{ "|" ~ ("\\" ~ ANY | !"|" ~ ANY)* ~ "|" }
bare_ident = @{
(ASCII_ALPHA | "_") ~ (ASCII_ALPHANUMERIC | "_" | "-")*
}
// ---- literal values -----------------------------------------------------
filter_value = { string | pipe_quoted | bare_ident }
string = @{ "\"" ~ (!"\"" ~ ANY)* ~ "\"" }
number = @{
ASCII_DIGIT+ ~ ("." ~ ASCII_DIGIT*)?
| "." ~ ASCII_DIGIT+
}

View File

@ -1,462 +1,321 @@
use anyhow::{Result, anyhow}; use anyhow::{Result, anyhow};
use pest::Parser as _;
use pest::iterators::Pair;
use pest_derive::Parser;
use super::ast::{AggFunc, BinOp, Expr, Filter, Formula}; use super::ast::{AggFunc, BinOp, Expr, Filter, Formula};
#[derive(Parser)]
#[grammar = "formula.pest"]
struct FormulaParser;
/// Parse a formula string like "Profit = Revenue - Cost" /// Parse a formula string like "Profit = Revenue - Cost"
/// or "Tax = Revenue * 0.08 WHERE Region = \"East\"" /// or "Tax = Revenue * 0.08 WHERE Region = \"East\""
pub fn parse_formula(raw: &str, target_category: &str) -> Result<Formula> { pub fn parse_formula(raw: &str, target_category: &str) -> Result<Formula> {
let raw = raw.trim(); let input = raw.trim();
let formula_pair = FormulaParser::parse(Rule::formula, input)
.map_err(|e| anyhow!("{}", e))?
.next()
.ok_or_else(|| anyhow!("empty parse result"))?;
build_formula(formula_pair, input, target_category)
}
// Split on first `=` to get target = expression /// Parse a bare expression (no target, no top-level WHERE clause).
let eq_pos = raw /// Fails if the input contains trailing tokens after a complete expression.
.find('=') pub fn parse_expr(s: &str) -> Result<Expr> {
.ok_or_else(|| anyhow!("Formula must contain '=': {raw}"))?; let input = s.trim();
let target = raw[..eq_pos].trim().to_string(); let expr_pair = FormulaParser::parse(Rule::expr_eoi, input)
let rest = raw[eq_pos + 1..].trim(); .map_err(|e| anyhow!("{}", e))?
.next()
.ok_or_else(|| anyhow!("empty parse result"))?
.into_inner()
.next()
.ok_or_else(|| anyhow!("missing expression in expr_eoi"))?;
build_expr(expr_pair)
}
// Check for WHERE clause at top level // ---- tree walkers -------------------------------------------------------
let (expr_str, filter) = split_where(rest);
let filter = filter.map(parse_where).transpose()?;
let expr = parse_expr(expr_str.trim())?;
fn build_formula(pair: Pair<Rule>, raw: &str, target_category: &str) -> Result<Formula> {
let mut target = None;
let mut expr = None;
let mut filter = None;
for inner in pair.into_inner() {
match inner.as_rule() {
Rule::target => target = Some(inner.as_str().trim().to_string()),
Rule::expr => expr = Some(build_expr(inner)?),
Rule::where_clause => filter = Some(build_filter(inner)?),
Rule::EOI => {}
r => return Err(anyhow!("unexpected rule in formula: {:?}", r)),
}
}
let target = target.ok_or_else(|| anyhow!("missing target in formula"))?;
let expr = expr.ok_or_else(|| anyhow!("missing expression in formula"))?;
Ok(Formula::new(raw, target, target_category, expr, filter)) Ok(Formula::new(raw, target, target_category, expr, filter))
} }
fn split_where(s: &str) -> (&str, Option<&str>) { fn build_expr(pair: Pair<Rule>) -> Result<Expr> {
// Find WHERE not inside parens or quotes // expr = { add_expr }
let bytes = s.as_bytes(); build_add_expr(first_inner(pair, "expr")?)
let mut depth = 0i32;
let mut i = 0;
while i < bytes.len() {
match bytes[i] {
b'(' => depth += 1,
b')' => depth -= 1,
b'"' => {
i += 1;
while i < bytes.len() && bytes[i] != b'"' {
i += 1;
}
}
b'|' => {
i += 1;
while i < bytes.len() && bytes[i] != b'|' {
i += 1;
}
}
_ if depth == 0 => {
if s[i..].to_ascii_uppercase().starts_with("WHERE") {
let before = &s[..i];
let after = &s[i + 5..];
if before.ends_with(char::is_whitespace) || i == 0 {
return (before.trim(), Some(after.trim()));
}
}
}
_ => {}
}
i += 1;
}
(s, None)
} }
/// Strip pipe or double-quote delimiters from a value. fn build_add_expr(pair: Pair<Rule>) -> Result<Expr> {
fn unquote(s: &str) -> String { fold_left_binop(pair, build_mul_expr, |s| match s {
let s = s.trim(); "+" => Some(BinOp::Add),
if (s.starts_with('"') && s.ends_with('"')) || (s.starts_with('|') && s.ends_with('|')) { "-" => Some(BinOp::Sub),
s[1..s.len() - 1].to_string() _ => None,
})
}
fn build_mul_expr(pair: Pair<Rule>) -> Result<Expr> {
fold_left_binop(pair, build_pow_expr, |s| match s {
"*" => Some(BinOp::Mul),
"/" => Some(BinOp::Div),
_ => None,
})
}
fn build_pow_expr(pair: Pair<Rule>) -> Result<Expr> {
// pow_expr = { unary ~ (pow_op ~ unary)? }
let mut pairs = pair.into_inner();
let base_pair = pairs
.next()
.ok_or_else(|| anyhow!("empty pow_expr"))?;
let base = build_unary(base_pair)?;
match pairs.next() {
None => Ok(base),
Some(op_pair) => {
debug_assert_eq!(op_pair.as_rule(), Rule::pow_op);
let exp_pair = pairs
.next()
.ok_or_else(|| anyhow!("missing exponent in pow_expr"))?;
let exp = build_unary(exp_pair)?;
Ok(Expr::BinOp(BinOp::Pow, Box::new(base), Box::new(exp)))
}
}
}
fn build_unary(pair: Pair<Rule>) -> Result<Expr> {
// unary = { unary_minus | primary }
let inner = first_inner(pair, "unary")?;
match inner.as_rule() {
Rule::unary_minus => {
let prim = first_inner(inner, "unary_minus")?;
Ok(Expr::UnaryMinus(Box::new(build_primary(prim)?)))
}
Rule::primary => build_primary(inner),
r => Err(anyhow!("unexpected rule in unary: {:?}", r)),
}
}
fn build_primary(pair: Pair<Rule>) -> Result<Expr> {
// primary = { number | agg_call | if_expr | paren_expr | ref_expr }
let inner = first_inner(pair, "primary")?;
match inner.as_rule() {
Rule::number => Ok(Expr::Number(inner.as_str().parse()?)),
Rule::agg_call => build_agg_call(inner),
Rule::if_expr => build_if_expr(inner),
Rule::paren_expr => build_expr(first_inner(inner, "paren_expr")?),
Rule::ref_expr => {
let id_pair = first_inner(inner, "ref_expr")?;
Ok(Expr::Ref(identifier_to_string(id_pair)))
}
r => Err(anyhow!("unexpected rule in primary: {:?}", r)),
}
}
fn build_agg_call(pair: Pair<Rule>) -> Result<Expr> {
// agg_call = { agg_func ~ "(" ~ expr ~ inline_where? ~ ")" }
let mut pairs = pair.into_inner();
let func_pair = pairs
.next()
.ok_or_else(|| anyhow!("missing agg_func"))?;
let func = parse_agg_func(func_pair.as_str())?;
let expr_pair = pairs
.next()
.ok_or_else(|| anyhow!("missing aggregate argument"))?;
let inner_expr = build_expr(expr_pair)?;
let filter = match pairs.next() {
Some(p) if p.as_rule() == Rule::inline_where => Some(build_filter(p)?),
_ => None,
};
Ok(Expr::Agg(func, Box::new(inner_expr), filter))
}
fn parse_agg_func(s: &str) -> Result<AggFunc> {
match s.to_ascii_uppercase().as_str() {
"SUM" => Ok(AggFunc::Sum),
"AVG" => Ok(AggFunc::Avg),
"MIN" => Ok(AggFunc::Min),
"MAX" => Ok(AggFunc::Max),
"COUNT" => Ok(AggFunc::Count),
f => Err(anyhow!("unknown aggregate function: {}", f)),
}
}
fn build_if_expr(pair: Pair<Rule>) -> Result<Expr> {
// if_expr = { ^"IF" ~ "(" ~ comparison ~ "," ~ expr ~ "," ~ expr ~ ")" }
let mut pairs = pair.into_inner();
let cond_pair = pairs
.next()
.ok_or_else(|| anyhow!("missing IF condition"))?;
let cond = build_comparison(cond_pair)?;
let then_pair = pairs
.next()
.ok_or_else(|| anyhow!("missing IF then-branch"))?;
let then_e = build_expr(then_pair)?;
let else_pair = pairs
.next()
.ok_or_else(|| anyhow!("missing IF else-branch"))?;
let else_e = build_expr(else_pair)?;
Ok(Expr::If(
Box::new(cond),
Box::new(then_e),
Box::new(else_e),
))
}
fn build_comparison(pair: Pair<Rule>) -> Result<Expr> {
// comparison = { expr ~ cmp_op ~ expr }
let mut pairs = pair.into_inner();
let lhs_pair = pairs
.next()
.ok_or_else(|| anyhow!("missing comparison lhs"))?;
let lhs = build_expr(lhs_pair)?;
let op_pair = pairs
.next()
.ok_or_else(|| anyhow!("missing comparison operator"))?;
let op = parse_cmp_op(op_pair.as_str())?;
let rhs_pair = pairs
.next()
.ok_or_else(|| anyhow!("missing comparison rhs"))?;
let rhs = build_expr(rhs_pair)?;
Ok(Expr::BinOp(op, Box::new(lhs), Box::new(rhs)))
}
fn parse_cmp_op(s: &str) -> Result<BinOp> {
match s {
"=" => Ok(BinOp::Eq),
"!=" => Ok(BinOp::Ne),
"<" => Ok(BinOp::Lt),
">" => Ok(BinOp::Gt),
"<=" => Ok(BinOp::Le),
">=" => Ok(BinOp::Ge),
o => Err(anyhow!("unknown comparison operator: {}", o)),
}
}
fn build_filter(pair: Pair<Rule>) -> Result<Filter> {
// where_clause / inline_where both have shape:
// ^"WHERE" ~ identifier ~ "=" ~ filter_value
let mut pairs = pair.into_inner();
let cat_pair = pairs
.next()
.ok_or_else(|| anyhow!("missing WHERE category"))?;
let category = identifier_to_string(cat_pair);
let val_pair = pairs
.next()
.ok_or_else(|| anyhow!("missing WHERE value"))?;
let item = filter_value_to_string(val_pair);
Ok(Filter { category, item })
}
fn filter_value_to_string(pair: Pair<Rule>) -> String {
// filter_value = { string | pipe_quoted | bare_ident }
let inner = pair
.into_inner()
.next()
.expect("filter_value must have an inner pair");
let s = inner.as_str();
match inner.as_rule() {
Rule::string => strip_string_quotes(s),
Rule::pipe_quoted => unquote_pipe(s),
_ => s.to_string(),
}
}
/// Convert an identifier pair (identifier, pipe_quoted, or bare_ident) to
/// its content string. Pipe-quoted identifiers have their delimiters
/// stripped and backslash escapes applied; bare identifiers are returned
/// verbatim.
fn identifier_to_string(pair: Pair<Rule>) -> String {
let s = pair.as_str();
if is_pipe_quoted(s) {
unquote_pipe(s)
} else { } else {
s.to_string() s.to_string()
} }
} }
fn parse_where(s: &str) -> Result<Filter> { fn is_pipe_quoted(s: &str) -> bool {
// Format: Category = "Item" or Category = |Item| or Category = Item s.len() >= 2 && s.starts_with('|') && s.ends_with('|')
let eq_pos = s
.find('=')
.ok_or_else(|| anyhow!("WHERE clause must contain '=': {s}"))?;
let category = unquote(&s[..eq_pos]);
let item = unquote(&s[eq_pos + 1..]);
Ok(Filter { category, item })
} }
/// Parse an expression using recursive descent fn strip_string_quotes(s: &str) -> String {
pub fn parse_expr(s: &str) -> Result<Expr> { debug_assert!(s.len() >= 2 && s.starts_with('"') && s.ends_with('"'));
let tokens = tokenize(s)?; s[1..s.len() - 1].to_string()
let mut pos = 0;
let expr = parse_add_sub(&tokens, &mut pos)?;
if pos < tokens.len() {
return Err(anyhow!(
"Unexpected token at position {pos}: {:?}",
tokens[pos]
));
}
Ok(expr)
} }
#[derive(Debug, Clone, PartialEq)] /// Strip surrounding pipes and apply backslash escapes: `\|` → `|`,
enum Token { /// `\\` → `\`, `\n` → newline. Matches the escape semantics documented
Number(f64), /// in src/persistence/improv.pest.
Ident(String), fn unquote_pipe(s: &str) -> String {
Str(String), debug_assert!(is_pipe_quoted(s));
Plus, let inner = &s[1..s.len() - 1];
Minus, let mut out = String::with_capacity(inner.len());
Star, let mut chars = inner.chars();
Slash, while let Some(c) = chars.next() {
Caret, if c == '\\' {
LParen, match chars.next() {
RParen, Some('|') => out.push('|'),
Comma, Some('\\') => out.push('\\'),
Eq, Some('n') => out.push('\n'),
Ne, Some(other) => {
Lt, out.push('\\');
Gt, out.push(other);
Le,
Ge,
}
fn tokenize(s: &str) -> Result<Vec<Token>> {
let mut tokens = Vec::new();
let chars: Vec<char> = s.chars().collect();
let mut i = 0;
while i < chars.len() {
match chars[i] {
' ' | '\t' | '\n' => i += 1,
'+' => {
tokens.push(Token::Plus);
i += 1;
}
'-' => {
tokens.push(Token::Minus);
i += 1;
}
'*' => {
tokens.push(Token::Star);
i += 1;
}
'/' => {
tokens.push(Token::Slash);
i += 1;
}
'^' => {
tokens.push(Token::Caret);
i += 1;
}
'(' => {
tokens.push(Token::LParen);
i += 1;
}
')' => {
tokens.push(Token::RParen);
i += 1;
}
',' => {
tokens.push(Token::Comma);
i += 1;
}
'!' if chars.get(i + 1) == Some(&'=') => {
tokens.push(Token::Ne);
i += 2;
}
'<' if chars.get(i + 1) == Some(&'=') => {
tokens.push(Token::Le);
i += 2;
}
'>' if chars.get(i + 1) == Some(&'=') => {
tokens.push(Token::Ge);
i += 2;
}
'<' => {
tokens.push(Token::Lt);
i += 1;
}
'>' => {
tokens.push(Token::Gt);
i += 1;
}
'=' => {
tokens.push(Token::Eq);
i += 1;
}
'"' => {
i += 1;
let mut s = String::new();
while i < chars.len() && chars[i] != '"' {
s.push(chars[i]);
i += 1;
} }
if i < chars.len() { None => out.push('\\'),
i += 1;
}
tokens.push(Token::Str(s));
} }
'|' => { } else {
i += 1; out.push(c);
let mut s = String::new();
while i < chars.len() && chars[i] != '|' {
s.push(chars[i]);
i += 1;
}
if i < chars.len() {
i += 1;
}
tokens.push(Token::Ident(s));
}
c if c.is_ascii_digit() || c == '.' => {
let mut num = String::new();
while i < chars.len() && (chars[i].is_ascii_digit() || chars[i] == '.') {
num.push(chars[i]);
i += 1;
}
tokens.push(Token::Number(num.parse()?));
}
c if c.is_alphabetic() || c == '_' => {
let mut ident = String::new();
while i < chars.len()
&& (chars[i].is_alphanumeric() || chars[i] == '_' || chars[i] == ' ')
{
// Don't consume trailing spaces if next non-space is operator
if chars[i] == ' ' {
// Peek ahead past spaces to find the next word/token
let j = i + 1;
let next_nonspace = chars[j..].iter().find(|&&c| c != ' ');
if matches!(
next_nonspace,
Some('+')
| Some('-')
| Some('*')
| Some('/')
| Some('^')
| Some(')')
| Some(',')
| Some('<')
| Some('>')
| Some('=')
| Some('!')
| Some('"')
| None
) {
break;
}
// Break if the identifier collected so far is a keyword
let trimmed = ident.trim_end().to_ascii_uppercase();
if matches!(
trimmed.as_str(),
"WHERE" | "SUM" | "AVG" | "MIN" | "MAX" | "COUNT" | "IF"
) {
break;
}
// Also break if the next word is a keyword
let rest: String = chars[j..].iter().collect();
let next_word: String = rest
.trim_start()
.chars()
.take_while(|c| c.is_alphanumeric() || *c == '_')
.collect();
let upper = next_word.to_ascii_uppercase();
if matches!(
upper.as_str(),
"WHERE" | "SUM" | "AVG" | "MIN" | "MAX" | "COUNT" | "IF"
) {
break;
}
}
ident.push(chars[i]);
i += 1;
}
let ident = ident.trim_end().to_string();
tokens.push(Token::Ident(ident));
}
c => return Err(anyhow!("Unexpected character '{c}' in expression")),
} }
} }
Ok(tokens) out
} }
fn parse_add_sub(tokens: &[Token], pos: &mut usize) -> Result<Expr> { // ---- small helpers ------------------------------------------------------
let mut left = parse_mul_div(tokens, pos)?;
while *pos < tokens.len() { fn first_inner<'a>(pair: Pair<'a, Rule>, ctx: &str) -> Result<Pair<'a, Rule>> {
let op = match &tokens[*pos] { pair.into_inner()
Token::Plus => BinOp::Add, .next()
Token::Minus => BinOp::Sub, .ok_or_else(|| anyhow!("empty rule: {}", ctx))
_ => break, }
};
*pos += 1; /// Fold a left-associative binary-operator rule of the shape
let right = parse_mul_div(tokens, pos)?; /// `rule = { child ~ (op ~ child)* }` into a left-leaning BinOp tree.
fn fold_left_binop<F, M>(pair: Pair<Rule>, mut build_child: F, match_op: M) -> Result<Expr>
where
F: FnMut(Pair<Rule>) -> Result<Expr>,
M: Fn(&str) -> Option<BinOp>,
{
let mut pairs = pair.into_inner();
let first = pairs
.next()
.ok_or_else(|| anyhow!("empty binop rule"))?;
let mut left = build_child(first)?;
while let Some(op_pair) = pairs.next() {
let op = match_op(op_pair.as_str()).ok_or_else(|| {
anyhow!("unexpected operator token: {:?}", op_pair.as_str())
})?;
let right_pair = pairs
.next()
.ok_or_else(|| anyhow!("missing rhs for operator"))?;
let right = build_child(right_pair)?;
left = Expr::BinOp(op, Box::new(left), Box::new(right)); left = Expr::BinOp(op, Box::new(left), Box::new(right));
} }
Ok(left) Ok(left)
} }
fn parse_mul_div(tokens: &[Token], pos: &mut usize) -> Result<Expr> {
let mut left = parse_pow(tokens, pos)?;
while *pos < tokens.len() {
let op = match &tokens[*pos] {
Token::Star => BinOp::Mul,
Token::Slash => BinOp::Div,
_ => break,
};
*pos += 1;
let right = parse_pow(tokens, pos)?;
left = Expr::BinOp(op, Box::new(left), Box::new(right));
}
Ok(left)
}
fn parse_pow(tokens: &[Token], pos: &mut usize) -> Result<Expr> {
let base = parse_unary(tokens, pos)?;
if *pos < tokens.len() && tokens[*pos] == Token::Caret {
*pos += 1;
let exp = parse_unary(tokens, pos)?;
return Ok(Expr::BinOp(BinOp::Pow, Box::new(base), Box::new(exp)));
}
Ok(base)
}
fn parse_unary(tokens: &[Token], pos: &mut usize) -> Result<Expr> {
if *pos < tokens.len() && tokens[*pos] == Token::Minus {
*pos += 1;
let e = parse_primary(tokens, pos)?;
return Ok(Expr::UnaryMinus(Box::new(e)));
}
parse_primary(tokens, pos)
}
fn parse_primary(tokens: &[Token], pos: &mut usize) -> Result<Expr> {
if *pos >= tokens.len() {
return Err(anyhow!("Unexpected end of expression"));
}
match &tokens[*pos].clone() {
Token::Number(n) => {
*pos += 1;
Ok(Expr::Number(*n))
}
Token::Ident(name) => {
let name = name.clone();
*pos += 1;
// Check for function call
let upper = name.to_ascii_uppercase();
match upper.as_str() {
"SUM" | "AVG" | "MIN" | "MAX" | "COUNT" => {
let func = match upper.as_str() {
"SUM" => AggFunc::Sum,
"AVG" => AggFunc::Avg,
"MIN" => AggFunc::Min,
"MAX" => AggFunc::Max,
"COUNT" => AggFunc::Count,
_ => unreachable!(),
};
if *pos < tokens.len() && tokens[*pos] == Token::LParen {
*pos += 1;
let inner = parse_add_sub(tokens, pos)?;
// Optional WHERE filter
let filter = if *pos < tokens.len() {
if let Token::Ident(kw) = &tokens[*pos] {
if kw.eq_ignore_ascii_case("WHERE") {
*pos += 1;
let cat = match &tokens[*pos] {
Token::Ident(s) => {
let s = s.clone();
*pos += 1;
s
}
t => {
return Err(anyhow!(
"Expected category name, got {t:?}"
));
}
};
// expect =
if *pos < tokens.len() && tokens[*pos] == Token::Eq {
*pos += 1;
}
let item = match &tokens[*pos] {
Token::Str(s) | Token::Ident(s) => {
let s = s.clone();
*pos += 1;
s
}
t => return Err(anyhow!("Expected item name, got {t:?}")),
};
Some(Filter {
category: cat,
item,
})
} else {
None
}
} else {
None
}
} else {
None
};
// expect )
if *pos < tokens.len() && tokens[*pos] == Token::RParen {
*pos += 1;
} else {
return Err(anyhow!("Expected ')' to close aggregate function"));
}
return Ok(Expr::Agg(func, Box::new(inner), filter));
}
Ok(Expr::Ref(name))
}
"IF" => {
if *pos < tokens.len() && tokens[*pos] == Token::LParen {
*pos += 1;
let cond = parse_comparison(tokens, pos)?;
if *pos < tokens.len() && tokens[*pos] == Token::Comma {
*pos += 1;
}
let then = parse_add_sub(tokens, pos)?;
if *pos < tokens.len() && tokens[*pos] == Token::Comma {
*pos += 1;
}
let else_ = parse_add_sub(tokens, pos)?;
if *pos < tokens.len() && tokens[*pos] == Token::RParen {
*pos += 1;
} else {
return Err(anyhow!("Expected ')' to close IF(...)"));
}
return Ok(Expr::If(Box::new(cond), Box::new(then), Box::new(else_)));
}
Ok(Expr::Ref(name))
}
_ => Ok(Expr::Ref(name)),
}
}
Token::LParen => {
*pos += 1;
let e = parse_add_sub(tokens, pos)?;
if *pos < tokens.len() && tokens[*pos] == Token::RParen {
*pos += 1;
}
Ok(e)
}
t => Err(anyhow!("Unexpected token in expression: {t:?}")),
}
}
fn parse_comparison(tokens: &[Token], pos: &mut usize) -> Result<Expr> {
let left = parse_add_sub(tokens, pos)?;
if *pos >= tokens.len() {
return Ok(left);
}
let op = match &tokens[*pos] {
Token::Eq => BinOp::Eq,
Token::Ne => BinOp::Ne,
Token::Lt => BinOp::Lt,
Token::Gt => BinOp::Gt,
Token::Le => BinOp::Le,
Token::Ge => BinOp::Ge,
_ => return Ok(left),
};
*pos += 1;
let right = parse_add_sub(tokens, pos)?;
Ok(Expr::BinOp(op, Box::new(left), Box::new(right)))
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::parse_formula; use super::parse_formula;
@ -544,8 +403,8 @@ mod tests {
assert_eq!(filter.item, "East"); assert_eq!(filter.item, "East");
} }
/// Regression: WHERE inside aggregate parens must tokenize correctly. /// Regression: WHERE inside aggregate parens must parse as the
/// The tokenizer must not merge "Revenue WHERE" into a single identifier. /// aggregate's inline filter, not as a top-level WHERE clause.
#[test] #[test]
fn parse_sum_with_inline_where_filter() { fn parse_sum_with_inline_where_filter() {
let f = parse_formula("EastTotal = SUM(Revenue WHERE Region = \"East\")", "Foo").unwrap(); let f = parse_formula("EastTotal = SUM(Revenue WHERE Region = \"East\")", "Foo").unwrap();
@ -562,7 +421,6 @@ mod tests {
#[test] #[test]
fn parse_if_with_comparison_operators() { fn parse_if_with_comparison_operators() {
// Test each comparison operator in an IF expression
let f = parse_formula("X = IF(A != 0, A, 1)", "Cat").unwrap(); let f = parse_formula("X = IF(A != 0, A, 1)", "Cat").unwrap();
assert!(matches!(f.expr, Expr::If(_, _, _))); assert!(matches!(f.expr, Expr::If(_, _, _)));
@ -583,7 +441,6 @@ mod tests {
#[test] #[test]
fn parse_where_with_quoted_string_inside_expression() { fn parse_where_with_quoted_string_inside_expression() {
// WHERE inside a formula string with quotes
let f = parse_formula("X = Revenue WHERE Region = \"West Coast\"", "Foo").unwrap(); let f = parse_formula("X = Revenue WHERE Region = \"West Coast\"", "Foo").unwrap();
let filter = f.filter.as_ref().unwrap(); let filter = f.filter.as_ref().unwrap();
assert_eq!(filter.item, "West Coast"); assert_eq!(filter.item, "West Coast");
@ -650,11 +507,10 @@ mod tests {
} }
} }
// ── Quoted string in tokenizer ────────────────────────────────────── // ── Quoted string in WHERE ──────────────────────────────────────────
#[test] #[test]
fn parse_quoted_string_in_where() { fn parse_quoted_string_in_where() {
// Quoted strings work in top-level WHERE clauses
let f = parse_formula("X = Revenue WHERE Region = \"East\"", "Cat").unwrap(); let f = parse_formula("X = Revenue WHERE Region = \"East\"", "Cat").unwrap();
let filter = f.filter.as_ref().unwrap(); let filter = f.filter.as_ref().unwrap();
assert_eq!(filter.item, "East"); assert_eq!(filter.item, "East");
@ -681,27 +537,21 @@ mod tests {
assert!(parse_expr("").is_err()); assert!(parse_expr("").is_err());
} }
// ── Multi-word identifiers must be pipe-quoted ──────────────────────
#[test] #[test]
fn tokenizer_breaks_at_where_keyword() { fn multi_word_bare_identifier_is_rejected() {
use super::tokenize; // Multi-word identifiers must be pipe-quoted; bare multi-word fails
let tokens = tokenize("Revenue WHERE Region").unwrap(); // the `bare_name`-compatible grammar rule.
// Should produce 3 tokens: Ident("Revenue"), Ident("WHERE"), Ident("Region") assert!(parse_formula("Total Revenue = Base Revenue + Bonus", "Foo").is_err());
assert_eq!(tokens.len(), 3, "Expected 3 tokens, got: {tokens:?}");
} }
// ── Multi-word identifiers ────────────────────────────────────────── // ── WHERE inside quotes in the expression ───────────────────────────
#[test] #[test]
fn parse_multi_word_identifier() { fn where_inside_quotes_is_not_a_keyword() {
let f = parse_formula("Total Revenue = Base Revenue + Bonus", "Foo").unwrap(); // A filter value containing the literal text "WHERE" is parsed as
assert_eq!(f.target, "Total Revenue"); // a string, not as a nested WHERE keyword.
}
// ── WHERE inside quotes in split_where ──────────────────────────────
#[test]
fn split_where_ignores_where_inside_quotes() {
// WHERE inside quotes should not be treated as a keyword
let f = parse_formula("X = Revenue WHERE Region = \"WHERE\"", "Foo").unwrap(); let f = parse_formula("X = Revenue WHERE Region = \"WHERE\"", "Foo").unwrap();
let filter = f.filter.as_ref().unwrap(); let filter = f.filter.as_ref().unwrap();
assert_eq!(filter.item, "WHERE"); assert_eq!(filter.item, "WHERE");
@ -773,4 +623,17 @@ mod tests {
panic!("Expected SUM with WHERE filter, got: {:?}", f.expr); panic!("Expected SUM with WHERE filter, got: {:?}", f.expr);
} }
} }
// ── Pipe-quoted escape semantics ────────────────────────────────────
#[test]
fn pipe_quoted_escape_literal_pipe() {
// \| inside a pipe-quoted identifier is a literal pipe
let f = parse_formula("X = |A\\|B|", "Cat").unwrap();
if let Expr::Ref(ref s) = f.expr {
assert_eq!(s, "A|B");
} else {
panic!("Expected Ref, got: {:?}", f.expr);
}
}
} }