refactor!(formula): migrate parser to use pest
Replace the manual tokenizer and recursive descent parser with a PEG grammar using the pest library. This migration involves introducing a formal grammar in formula.pest and updating the parser implementation to utilize the generated Pest parser with a tree-walking approach to construct the AST. The change introduces a stricter requirement for identifiers: multi-word identifiers must now be enclosed in pipe quotes (e.g., |Total Revenue|) and are no longer accepted as bare words. Tests have been updated to reflect the new parsing logic, remove tokenizer-specific tests, and verify the new pipe-quoting and escape semantics. BREAKING CHANGE: Multi-word identifiers now require pipe-quoting (e.g. |Total Revenue|) and are no longer accepted as bare words. Co-Authored-By: fiddlerwoaroof/git-smart-commit (gemma-4-31B-it-UD-Q4_K_XL.gguf)
This commit is contained in:
@ -8,4 +8,6 @@ repository = "https://github.com/fiddlerwoaroof/improvise"
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1"
|
||||
pest = "2.8.6"
|
||||
pest_derive = "2.8.6"
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
|
||||
91
crates/improvise-formula/src/formula.pest
Normal file
91
crates/improvise-formula/src/formula.pest
Normal file
@ -0,0 +1,91 @@
|
||||
// Formula grammar for improvise.
|
||||
//
|
||||
// A formula has the form: TARGET = EXPR [WHERE filter]
|
||||
// See parser.rs for the tree walker that produces a Formula AST.
|
||||
//
|
||||
// Identifier rules (bare_ident / pipe_quoted) mirror `bare_name` and
|
||||
// `pipe_quoted` in src/persistence/improv.pest: bare identifiers are
|
||||
// alphanumeric plus `_` and `-`, with no internal spaces; multi-word
|
||||
// names must be pipe-quoted.
|
||||
|
||||
// Auto-skip horizontal whitespace between tokens in non-atomic rules.
|
||||
WHITESPACE = _{ " " | "\t" }
|
||||
|
||||
// ---- top-level ----------------------------------------------------------
|
||||
|
||||
formula = { SOI ~ target ~ "=" ~ expr ~ where_clause? ~ EOI }
|
||||
|
||||
// The target keeps its raw text (including pipes, if any) — we capture
|
||||
// the span directly rather than walking into its children.
|
||||
target = { identifier }
|
||||
|
||||
where_clause = { ^"WHERE" ~ identifier ~ "=" ~ filter_value }
|
||||
|
||||
// ---- expressions --------------------------------------------------------
|
||||
|
||||
// Used by parse_expr() — forces a standalone expression to consume the
|
||||
// whole input, so `1 + 2 3` fails instead of silently dropping " 3".
|
||||
expr_eoi = { SOI ~ expr ~ EOI }
|
||||
|
||||
expr = { add_expr }
|
||||
|
||||
add_expr = { mul_expr ~ (add_op ~ mul_expr)* }
|
||||
add_op = { "+" | "-" }
|
||||
|
||||
mul_expr = { pow_expr ~ (mul_op ~ pow_expr)* }
|
||||
mul_op = { "*" | "/" }
|
||||
|
||||
pow_expr = { unary ~ (pow_op ~ unary)? }
|
||||
pow_op = { "^" }
|
||||
|
||||
unary = { unary_minus | primary }
|
||||
unary_minus = { "-" ~ primary }
|
||||
|
||||
primary = {
|
||||
number
|
||||
| agg_call
|
||||
| if_expr
|
||||
| paren_expr
|
||||
| ref_expr
|
||||
}
|
||||
|
||||
paren_expr = { "(" ~ expr ~ ")" }
|
||||
|
||||
// Aggregates with optional inline WHERE filter inside the parens.
|
||||
agg_call = { agg_func ~ "(" ~ expr ~ inline_where? ~ ")" }
|
||||
agg_func = { ^"SUM" | ^"AVG" | ^"MIN" | ^"MAX" | ^"COUNT" }
|
||||
inline_where = { ^"WHERE" ~ identifier ~ "=" ~ filter_value }
|
||||
|
||||
// IF(cond, then, else). Comparison is a standalone rule because comparison
|
||||
// operators are not valid in general expressions — only inside an IF condition.
|
||||
if_expr = { ^"IF" ~ "(" ~ comparison ~ "," ~ expr ~ "," ~ expr ~ ")" }
|
||||
comparison = { expr ~ cmp_op ~ expr }
|
||||
cmp_op = { "!=" | "<=" | ">=" | "<" | ">" | "=" }
|
||||
|
||||
// A reference to an item. `SUM` and `IF` without parens fall through to
|
||||
// this rule because agg_call / if_expr require a "(" and otherwise fail.
|
||||
ref_expr = { identifier }
|
||||
|
||||
// ---- identifiers --------------------------------------------------------
|
||||
//
|
||||
// Mirror of improv.pest's bare_name / pipe_quoted.
|
||||
|
||||
identifier = ${ pipe_quoted | bare_ident }
|
||||
|
||||
// Backslash escapes inside pipes: \| literal pipe, \\ backslash, \n newline.
|
||||
pipe_quoted = @{ "|" ~ ("\\" ~ ANY | !"|" ~ ANY)* ~ "|" }
|
||||
|
||||
bare_ident = @{
|
||||
(ASCII_ALPHA | "_") ~ (ASCII_ALPHANUMERIC | "_" | "-")*
|
||||
}
|
||||
|
||||
// ---- literal values -----------------------------------------------------
|
||||
|
||||
filter_value = { string | pipe_quoted | bare_ident }
|
||||
|
||||
string = @{ "\"" ~ (!"\"" ~ ANY)* ~ "\"" }
|
||||
|
||||
number = @{
|
||||
ASCII_DIGIT+ ~ ("." ~ ASCII_DIGIT*)?
|
||||
| "." ~ ASCII_DIGIT+
|
||||
}
|
||||
@ -1,462 +1,321 @@
|
||||
use anyhow::{Result, anyhow};
|
||||
use pest::Parser as _;
|
||||
use pest::iterators::Pair;
|
||||
use pest_derive::Parser;
|
||||
|
||||
use super::ast::{AggFunc, BinOp, Expr, Filter, Formula};
|
||||
|
||||
#[derive(Parser)]
|
||||
#[grammar = "formula.pest"]
|
||||
struct FormulaParser;
|
||||
|
||||
/// Parse a formula string like "Profit = Revenue - Cost"
|
||||
/// or "Tax = Revenue * 0.08 WHERE Region = \"East\""
|
||||
pub fn parse_formula(raw: &str, target_category: &str) -> Result<Formula> {
|
||||
let raw = raw.trim();
|
||||
let input = raw.trim();
|
||||
let formula_pair = FormulaParser::parse(Rule::formula, input)
|
||||
.map_err(|e| anyhow!("{}", e))?
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("empty parse result"))?;
|
||||
build_formula(formula_pair, input, target_category)
|
||||
}
|
||||
|
||||
// Split on first `=` to get target = expression
|
||||
let eq_pos = raw
|
||||
.find('=')
|
||||
.ok_or_else(|| anyhow!("Formula must contain '=': {raw}"))?;
|
||||
let target = raw[..eq_pos].trim().to_string();
|
||||
let rest = raw[eq_pos + 1..].trim();
|
||||
/// Parse a bare expression (no target, no top-level WHERE clause).
|
||||
/// Fails if the input contains trailing tokens after a complete expression.
|
||||
pub fn parse_expr(s: &str) -> Result<Expr> {
|
||||
let input = s.trim();
|
||||
let expr_pair = FormulaParser::parse(Rule::expr_eoi, input)
|
||||
.map_err(|e| anyhow!("{}", e))?
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("empty parse result"))?
|
||||
.into_inner()
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("missing expression in expr_eoi"))?;
|
||||
build_expr(expr_pair)
|
||||
}
|
||||
|
||||
// Check for WHERE clause at top level
|
||||
let (expr_str, filter) = split_where(rest);
|
||||
let filter = filter.map(parse_where).transpose()?;
|
||||
|
||||
let expr = parse_expr(expr_str.trim())?;
|
||||
// ---- tree walkers -------------------------------------------------------
|
||||
|
||||
fn build_formula(pair: Pair<Rule>, raw: &str, target_category: &str) -> Result<Formula> {
|
||||
let mut target = None;
|
||||
let mut expr = None;
|
||||
let mut filter = None;
|
||||
for inner in pair.into_inner() {
|
||||
match inner.as_rule() {
|
||||
Rule::target => target = Some(inner.as_str().trim().to_string()),
|
||||
Rule::expr => expr = Some(build_expr(inner)?),
|
||||
Rule::where_clause => filter = Some(build_filter(inner)?),
|
||||
Rule::EOI => {}
|
||||
r => return Err(anyhow!("unexpected rule in formula: {:?}", r)),
|
||||
}
|
||||
}
|
||||
let target = target.ok_or_else(|| anyhow!("missing target in formula"))?;
|
||||
let expr = expr.ok_or_else(|| anyhow!("missing expression in formula"))?;
|
||||
Ok(Formula::new(raw, target, target_category, expr, filter))
|
||||
}
|
||||
|
||||
fn split_where(s: &str) -> (&str, Option<&str>) {
|
||||
// Find WHERE not inside parens or quotes
|
||||
let bytes = s.as_bytes();
|
||||
let mut depth = 0i32;
|
||||
let mut i = 0;
|
||||
while i < bytes.len() {
|
||||
match bytes[i] {
|
||||
b'(' => depth += 1,
|
||||
b')' => depth -= 1,
|
||||
b'"' => {
|
||||
i += 1;
|
||||
while i < bytes.len() && bytes[i] != b'"' {
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
b'|' => {
|
||||
i += 1;
|
||||
while i < bytes.len() && bytes[i] != b'|' {
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
_ if depth == 0 => {
|
||||
if s[i..].to_ascii_uppercase().starts_with("WHERE") {
|
||||
let before = &s[..i];
|
||||
let after = &s[i + 5..];
|
||||
if before.ends_with(char::is_whitespace) || i == 0 {
|
||||
return (before.trim(), Some(after.trim()));
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
i += 1;
|
||||
}
|
||||
(s, None)
|
||||
fn build_expr(pair: Pair<Rule>) -> Result<Expr> {
|
||||
// expr = { add_expr }
|
||||
build_add_expr(first_inner(pair, "expr")?)
|
||||
}
|
||||
|
||||
/// Strip pipe or double-quote delimiters from a value.
|
||||
fn unquote(s: &str) -> String {
|
||||
let s = s.trim();
|
||||
if (s.starts_with('"') && s.ends_with('"')) || (s.starts_with('|') && s.ends_with('|')) {
|
||||
s[1..s.len() - 1].to_string()
|
||||
fn build_add_expr(pair: Pair<Rule>) -> Result<Expr> {
|
||||
fold_left_binop(pair, build_mul_expr, |s| match s {
|
||||
"+" => Some(BinOp::Add),
|
||||
"-" => Some(BinOp::Sub),
|
||||
_ => None,
|
||||
})
|
||||
}
|
||||
|
||||
fn build_mul_expr(pair: Pair<Rule>) -> Result<Expr> {
|
||||
fold_left_binop(pair, build_pow_expr, |s| match s {
|
||||
"*" => Some(BinOp::Mul),
|
||||
"/" => Some(BinOp::Div),
|
||||
_ => None,
|
||||
})
|
||||
}
|
||||
|
||||
fn build_pow_expr(pair: Pair<Rule>) -> Result<Expr> {
|
||||
// pow_expr = { unary ~ (pow_op ~ unary)? }
|
||||
let mut pairs = pair.into_inner();
|
||||
let base_pair = pairs
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("empty pow_expr"))?;
|
||||
let base = build_unary(base_pair)?;
|
||||
match pairs.next() {
|
||||
None => Ok(base),
|
||||
Some(op_pair) => {
|
||||
debug_assert_eq!(op_pair.as_rule(), Rule::pow_op);
|
||||
let exp_pair = pairs
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("missing exponent in pow_expr"))?;
|
||||
let exp = build_unary(exp_pair)?;
|
||||
Ok(Expr::BinOp(BinOp::Pow, Box::new(base), Box::new(exp)))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn build_unary(pair: Pair<Rule>) -> Result<Expr> {
|
||||
// unary = { unary_minus | primary }
|
||||
let inner = first_inner(pair, "unary")?;
|
||||
match inner.as_rule() {
|
||||
Rule::unary_minus => {
|
||||
let prim = first_inner(inner, "unary_minus")?;
|
||||
Ok(Expr::UnaryMinus(Box::new(build_primary(prim)?)))
|
||||
}
|
||||
Rule::primary => build_primary(inner),
|
||||
r => Err(anyhow!("unexpected rule in unary: {:?}", r)),
|
||||
}
|
||||
}
|
||||
|
||||
fn build_primary(pair: Pair<Rule>) -> Result<Expr> {
|
||||
// primary = { number | agg_call | if_expr | paren_expr | ref_expr }
|
||||
let inner = first_inner(pair, "primary")?;
|
||||
match inner.as_rule() {
|
||||
Rule::number => Ok(Expr::Number(inner.as_str().parse()?)),
|
||||
Rule::agg_call => build_agg_call(inner),
|
||||
Rule::if_expr => build_if_expr(inner),
|
||||
Rule::paren_expr => build_expr(first_inner(inner, "paren_expr")?),
|
||||
Rule::ref_expr => {
|
||||
let id_pair = first_inner(inner, "ref_expr")?;
|
||||
Ok(Expr::Ref(identifier_to_string(id_pair)))
|
||||
}
|
||||
r => Err(anyhow!("unexpected rule in primary: {:?}", r)),
|
||||
}
|
||||
}
|
||||
|
||||
fn build_agg_call(pair: Pair<Rule>) -> Result<Expr> {
|
||||
// agg_call = { agg_func ~ "(" ~ expr ~ inline_where? ~ ")" }
|
||||
let mut pairs = pair.into_inner();
|
||||
let func_pair = pairs
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("missing agg_func"))?;
|
||||
let func = parse_agg_func(func_pair.as_str())?;
|
||||
let expr_pair = pairs
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("missing aggregate argument"))?;
|
||||
let inner_expr = build_expr(expr_pair)?;
|
||||
let filter = match pairs.next() {
|
||||
Some(p) if p.as_rule() == Rule::inline_where => Some(build_filter(p)?),
|
||||
_ => None,
|
||||
};
|
||||
Ok(Expr::Agg(func, Box::new(inner_expr), filter))
|
||||
}
|
||||
|
||||
fn parse_agg_func(s: &str) -> Result<AggFunc> {
|
||||
match s.to_ascii_uppercase().as_str() {
|
||||
"SUM" => Ok(AggFunc::Sum),
|
||||
"AVG" => Ok(AggFunc::Avg),
|
||||
"MIN" => Ok(AggFunc::Min),
|
||||
"MAX" => Ok(AggFunc::Max),
|
||||
"COUNT" => Ok(AggFunc::Count),
|
||||
f => Err(anyhow!("unknown aggregate function: {}", f)),
|
||||
}
|
||||
}
|
||||
|
||||
fn build_if_expr(pair: Pair<Rule>) -> Result<Expr> {
|
||||
// if_expr = { ^"IF" ~ "(" ~ comparison ~ "," ~ expr ~ "," ~ expr ~ ")" }
|
||||
let mut pairs = pair.into_inner();
|
||||
let cond_pair = pairs
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("missing IF condition"))?;
|
||||
let cond = build_comparison(cond_pair)?;
|
||||
let then_pair = pairs
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("missing IF then-branch"))?;
|
||||
let then_e = build_expr(then_pair)?;
|
||||
let else_pair = pairs
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("missing IF else-branch"))?;
|
||||
let else_e = build_expr(else_pair)?;
|
||||
Ok(Expr::If(
|
||||
Box::new(cond),
|
||||
Box::new(then_e),
|
||||
Box::new(else_e),
|
||||
))
|
||||
}
|
||||
|
||||
fn build_comparison(pair: Pair<Rule>) -> Result<Expr> {
|
||||
// comparison = { expr ~ cmp_op ~ expr }
|
||||
let mut pairs = pair.into_inner();
|
||||
let lhs_pair = pairs
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("missing comparison lhs"))?;
|
||||
let lhs = build_expr(lhs_pair)?;
|
||||
let op_pair = pairs
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("missing comparison operator"))?;
|
||||
let op = parse_cmp_op(op_pair.as_str())?;
|
||||
let rhs_pair = pairs
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("missing comparison rhs"))?;
|
||||
let rhs = build_expr(rhs_pair)?;
|
||||
Ok(Expr::BinOp(op, Box::new(lhs), Box::new(rhs)))
|
||||
}
|
||||
|
||||
fn parse_cmp_op(s: &str) -> Result<BinOp> {
|
||||
match s {
|
||||
"=" => Ok(BinOp::Eq),
|
||||
"!=" => Ok(BinOp::Ne),
|
||||
"<" => Ok(BinOp::Lt),
|
||||
">" => Ok(BinOp::Gt),
|
||||
"<=" => Ok(BinOp::Le),
|
||||
">=" => Ok(BinOp::Ge),
|
||||
o => Err(anyhow!("unknown comparison operator: {}", o)),
|
||||
}
|
||||
}
|
||||
|
||||
fn build_filter(pair: Pair<Rule>) -> Result<Filter> {
|
||||
// where_clause / inline_where both have shape:
|
||||
// ^"WHERE" ~ identifier ~ "=" ~ filter_value
|
||||
let mut pairs = pair.into_inner();
|
||||
let cat_pair = pairs
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("missing WHERE category"))?;
|
||||
let category = identifier_to_string(cat_pair);
|
||||
let val_pair = pairs
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("missing WHERE value"))?;
|
||||
let item = filter_value_to_string(val_pair);
|
||||
Ok(Filter { category, item })
|
||||
}
|
||||
|
||||
fn filter_value_to_string(pair: Pair<Rule>) -> String {
|
||||
// filter_value = { string | pipe_quoted | bare_ident }
|
||||
let inner = pair
|
||||
.into_inner()
|
||||
.next()
|
||||
.expect("filter_value must have an inner pair");
|
||||
let s = inner.as_str();
|
||||
match inner.as_rule() {
|
||||
Rule::string => strip_string_quotes(s),
|
||||
Rule::pipe_quoted => unquote_pipe(s),
|
||||
_ => s.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert an identifier pair (identifier, pipe_quoted, or bare_ident) to
|
||||
/// its content string. Pipe-quoted identifiers have their delimiters
|
||||
/// stripped and backslash escapes applied; bare identifiers are returned
|
||||
/// verbatim.
|
||||
fn identifier_to_string(pair: Pair<Rule>) -> String {
|
||||
let s = pair.as_str();
|
||||
if is_pipe_quoted(s) {
|
||||
unquote_pipe(s)
|
||||
} else {
|
||||
s.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_where(s: &str) -> Result<Filter> {
|
||||
// Format: Category = "Item" or Category = |Item| or Category = Item
|
||||
let eq_pos = s
|
||||
.find('=')
|
||||
.ok_or_else(|| anyhow!("WHERE clause must contain '=': {s}"))?;
|
||||
let category = unquote(&s[..eq_pos]);
|
||||
let item = unquote(&s[eq_pos + 1..]);
|
||||
Ok(Filter { category, item })
|
||||
fn is_pipe_quoted(s: &str) -> bool {
|
||||
s.len() >= 2 && s.starts_with('|') && s.ends_with('|')
|
||||
}
|
||||
|
||||
/// Parse an expression using recursive descent
|
||||
pub fn parse_expr(s: &str) -> Result<Expr> {
|
||||
let tokens = tokenize(s)?;
|
||||
let mut pos = 0;
|
||||
let expr = parse_add_sub(&tokens, &mut pos)?;
|
||||
if pos < tokens.len() {
|
||||
return Err(anyhow!(
|
||||
"Unexpected token at position {pos}: {:?}",
|
||||
tokens[pos]
|
||||
));
|
||||
}
|
||||
Ok(expr)
|
||||
fn strip_string_quotes(s: &str) -> String {
|
||||
debug_assert!(s.len() >= 2 && s.starts_with('"') && s.ends_with('"'));
|
||||
s[1..s.len() - 1].to_string()
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
enum Token {
|
||||
Number(f64),
|
||||
Ident(String),
|
||||
Str(String),
|
||||
Plus,
|
||||
Minus,
|
||||
Star,
|
||||
Slash,
|
||||
Caret,
|
||||
LParen,
|
||||
RParen,
|
||||
Comma,
|
||||
Eq,
|
||||
Ne,
|
||||
Lt,
|
||||
Gt,
|
||||
Le,
|
||||
Ge,
|
||||
}
|
||||
|
||||
fn tokenize(s: &str) -> Result<Vec<Token>> {
|
||||
let mut tokens = Vec::new();
|
||||
let chars: Vec<char> = s.chars().collect();
|
||||
let mut i = 0;
|
||||
|
||||
while i < chars.len() {
|
||||
match chars[i] {
|
||||
' ' | '\t' | '\n' => i += 1,
|
||||
'+' => {
|
||||
tokens.push(Token::Plus);
|
||||
i += 1;
|
||||
}
|
||||
'-' => {
|
||||
tokens.push(Token::Minus);
|
||||
i += 1;
|
||||
}
|
||||
'*' => {
|
||||
tokens.push(Token::Star);
|
||||
i += 1;
|
||||
}
|
||||
'/' => {
|
||||
tokens.push(Token::Slash);
|
||||
i += 1;
|
||||
}
|
||||
'^' => {
|
||||
tokens.push(Token::Caret);
|
||||
i += 1;
|
||||
}
|
||||
'(' => {
|
||||
tokens.push(Token::LParen);
|
||||
i += 1;
|
||||
}
|
||||
')' => {
|
||||
tokens.push(Token::RParen);
|
||||
i += 1;
|
||||
}
|
||||
',' => {
|
||||
tokens.push(Token::Comma);
|
||||
i += 1;
|
||||
}
|
||||
'!' if chars.get(i + 1) == Some(&'=') => {
|
||||
tokens.push(Token::Ne);
|
||||
i += 2;
|
||||
}
|
||||
'<' if chars.get(i + 1) == Some(&'=') => {
|
||||
tokens.push(Token::Le);
|
||||
i += 2;
|
||||
}
|
||||
'>' if chars.get(i + 1) == Some(&'=') => {
|
||||
tokens.push(Token::Ge);
|
||||
i += 2;
|
||||
}
|
||||
'<' => {
|
||||
tokens.push(Token::Lt);
|
||||
i += 1;
|
||||
}
|
||||
'>' => {
|
||||
tokens.push(Token::Gt);
|
||||
i += 1;
|
||||
}
|
||||
'=' => {
|
||||
tokens.push(Token::Eq);
|
||||
i += 1;
|
||||
}
|
||||
'"' => {
|
||||
i += 1;
|
||||
let mut s = String::new();
|
||||
while i < chars.len() && chars[i] != '"' {
|
||||
s.push(chars[i]);
|
||||
i += 1;
|
||||
/// Strip surrounding pipes and apply backslash escapes: `\|` → `|`,
|
||||
/// `\\` → `\`, `\n` → newline. Matches the escape semantics documented
|
||||
/// in src/persistence/improv.pest.
|
||||
fn unquote_pipe(s: &str) -> String {
|
||||
debug_assert!(is_pipe_quoted(s));
|
||||
let inner = &s[1..s.len() - 1];
|
||||
let mut out = String::with_capacity(inner.len());
|
||||
let mut chars = inner.chars();
|
||||
while let Some(c) = chars.next() {
|
||||
if c == '\\' {
|
||||
match chars.next() {
|
||||
Some('|') => out.push('|'),
|
||||
Some('\\') => out.push('\\'),
|
||||
Some('n') => out.push('\n'),
|
||||
Some(other) => {
|
||||
out.push('\\');
|
||||
out.push(other);
|
||||
}
|
||||
if i < chars.len() {
|
||||
i += 1;
|
||||
}
|
||||
tokens.push(Token::Str(s));
|
||||
None => out.push('\\'),
|
||||
}
|
||||
'|' => {
|
||||
i += 1;
|
||||
let mut s = String::new();
|
||||
while i < chars.len() && chars[i] != '|' {
|
||||
s.push(chars[i]);
|
||||
i += 1;
|
||||
}
|
||||
if i < chars.len() {
|
||||
i += 1;
|
||||
}
|
||||
tokens.push(Token::Ident(s));
|
||||
}
|
||||
c if c.is_ascii_digit() || c == '.' => {
|
||||
let mut num = String::new();
|
||||
while i < chars.len() && (chars[i].is_ascii_digit() || chars[i] == '.') {
|
||||
num.push(chars[i]);
|
||||
i += 1;
|
||||
}
|
||||
tokens.push(Token::Number(num.parse()?));
|
||||
}
|
||||
c if c.is_alphabetic() || c == '_' => {
|
||||
let mut ident = String::new();
|
||||
while i < chars.len()
|
||||
&& (chars[i].is_alphanumeric() || chars[i] == '_' || chars[i] == ' ')
|
||||
{
|
||||
// Don't consume trailing spaces if next non-space is operator
|
||||
if chars[i] == ' ' {
|
||||
// Peek ahead past spaces to find the next word/token
|
||||
let j = i + 1;
|
||||
let next_nonspace = chars[j..].iter().find(|&&c| c != ' ');
|
||||
if matches!(
|
||||
next_nonspace,
|
||||
Some('+')
|
||||
| Some('-')
|
||||
| Some('*')
|
||||
| Some('/')
|
||||
| Some('^')
|
||||
| Some(')')
|
||||
| Some(',')
|
||||
| Some('<')
|
||||
| Some('>')
|
||||
| Some('=')
|
||||
| Some('!')
|
||||
| Some('"')
|
||||
| None
|
||||
) {
|
||||
break;
|
||||
}
|
||||
// Break if the identifier collected so far is a keyword
|
||||
let trimmed = ident.trim_end().to_ascii_uppercase();
|
||||
if matches!(
|
||||
trimmed.as_str(),
|
||||
"WHERE" | "SUM" | "AVG" | "MIN" | "MAX" | "COUNT" | "IF"
|
||||
) {
|
||||
break;
|
||||
}
|
||||
// Also break if the next word is a keyword
|
||||
let rest: String = chars[j..].iter().collect();
|
||||
let next_word: String = rest
|
||||
.trim_start()
|
||||
.chars()
|
||||
.take_while(|c| c.is_alphanumeric() || *c == '_')
|
||||
.collect();
|
||||
let upper = next_word.to_ascii_uppercase();
|
||||
if matches!(
|
||||
upper.as_str(),
|
||||
"WHERE" | "SUM" | "AVG" | "MIN" | "MAX" | "COUNT" | "IF"
|
||||
) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
ident.push(chars[i]);
|
||||
i += 1;
|
||||
}
|
||||
let ident = ident.trim_end().to_string();
|
||||
tokens.push(Token::Ident(ident));
|
||||
}
|
||||
c => return Err(anyhow!("Unexpected character '{c}' in expression")),
|
||||
} else {
|
||||
out.push(c);
|
||||
}
|
||||
}
|
||||
Ok(tokens)
|
||||
out
|
||||
}
|
||||
|
||||
fn parse_add_sub(tokens: &[Token], pos: &mut usize) -> Result<Expr> {
|
||||
let mut left = parse_mul_div(tokens, pos)?;
|
||||
while *pos < tokens.len() {
|
||||
let op = match &tokens[*pos] {
|
||||
Token::Plus => BinOp::Add,
|
||||
Token::Minus => BinOp::Sub,
|
||||
_ => break,
|
||||
};
|
||||
*pos += 1;
|
||||
let right = parse_mul_div(tokens, pos)?;
|
||||
// ---- small helpers ------------------------------------------------------
|
||||
|
||||
fn first_inner<'a>(pair: Pair<'a, Rule>, ctx: &str) -> Result<Pair<'a, Rule>> {
|
||||
pair.into_inner()
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("empty rule: {}", ctx))
|
||||
}
|
||||
|
||||
/// Fold a left-associative binary-operator rule of the shape
|
||||
/// `rule = { child ~ (op ~ child)* }` into a left-leaning BinOp tree.
|
||||
fn fold_left_binop<F, M>(pair: Pair<Rule>, mut build_child: F, match_op: M) -> Result<Expr>
|
||||
where
|
||||
F: FnMut(Pair<Rule>) -> Result<Expr>,
|
||||
M: Fn(&str) -> Option<BinOp>,
|
||||
{
|
||||
let mut pairs = pair.into_inner();
|
||||
let first = pairs
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("empty binop rule"))?;
|
||||
let mut left = build_child(first)?;
|
||||
while let Some(op_pair) = pairs.next() {
|
||||
let op = match_op(op_pair.as_str()).ok_or_else(|| {
|
||||
anyhow!("unexpected operator token: {:?}", op_pair.as_str())
|
||||
})?;
|
||||
let right_pair = pairs
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("missing rhs for operator"))?;
|
||||
let right = build_child(right_pair)?;
|
||||
left = Expr::BinOp(op, Box::new(left), Box::new(right));
|
||||
}
|
||||
Ok(left)
|
||||
}
|
||||
|
||||
fn parse_mul_div(tokens: &[Token], pos: &mut usize) -> Result<Expr> {
|
||||
let mut left = parse_pow(tokens, pos)?;
|
||||
while *pos < tokens.len() {
|
||||
let op = match &tokens[*pos] {
|
||||
Token::Star => BinOp::Mul,
|
||||
Token::Slash => BinOp::Div,
|
||||
_ => break,
|
||||
};
|
||||
*pos += 1;
|
||||
let right = parse_pow(tokens, pos)?;
|
||||
left = Expr::BinOp(op, Box::new(left), Box::new(right));
|
||||
}
|
||||
Ok(left)
|
||||
}
|
||||
|
||||
fn parse_pow(tokens: &[Token], pos: &mut usize) -> Result<Expr> {
|
||||
let base = parse_unary(tokens, pos)?;
|
||||
if *pos < tokens.len() && tokens[*pos] == Token::Caret {
|
||||
*pos += 1;
|
||||
let exp = parse_unary(tokens, pos)?;
|
||||
return Ok(Expr::BinOp(BinOp::Pow, Box::new(base), Box::new(exp)));
|
||||
}
|
||||
Ok(base)
|
||||
}
|
||||
|
||||
fn parse_unary(tokens: &[Token], pos: &mut usize) -> Result<Expr> {
|
||||
if *pos < tokens.len() && tokens[*pos] == Token::Minus {
|
||||
*pos += 1;
|
||||
let e = parse_primary(tokens, pos)?;
|
||||
return Ok(Expr::UnaryMinus(Box::new(e)));
|
||||
}
|
||||
parse_primary(tokens, pos)
|
||||
}
|
||||
|
||||
fn parse_primary(tokens: &[Token], pos: &mut usize) -> Result<Expr> {
|
||||
if *pos >= tokens.len() {
|
||||
return Err(anyhow!("Unexpected end of expression"));
|
||||
}
|
||||
match &tokens[*pos].clone() {
|
||||
Token::Number(n) => {
|
||||
*pos += 1;
|
||||
Ok(Expr::Number(*n))
|
||||
}
|
||||
Token::Ident(name) => {
|
||||
let name = name.clone();
|
||||
*pos += 1;
|
||||
// Check for function call
|
||||
let upper = name.to_ascii_uppercase();
|
||||
match upper.as_str() {
|
||||
"SUM" | "AVG" | "MIN" | "MAX" | "COUNT" => {
|
||||
let func = match upper.as_str() {
|
||||
"SUM" => AggFunc::Sum,
|
||||
"AVG" => AggFunc::Avg,
|
||||
"MIN" => AggFunc::Min,
|
||||
"MAX" => AggFunc::Max,
|
||||
"COUNT" => AggFunc::Count,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
if *pos < tokens.len() && tokens[*pos] == Token::LParen {
|
||||
*pos += 1;
|
||||
let inner = parse_add_sub(tokens, pos)?;
|
||||
// Optional WHERE filter
|
||||
let filter = if *pos < tokens.len() {
|
||||
if let Token::Ident(kw) = &tokens[*pos] {
|
||||
if kw.eq_ignore_ascii_case("WHERE") {
|
||||
*pos += 1;
|
||||
let cat = match &tokens[*pos] {
|
||||
Token::Ident(s) => {
|
||||
let s = s.clone();
|
||||
*pos += 1;
|
||||
s
|
||||
}
|
||||
t => {
|
||||
return Err(anyhow!(
|
||||
"Expected category name, got {t:?}"
|
||||
));
|
||||
}
|
||||
};
|
||||
// expect =
|
||||
if *pos < tokens.len() && tokens[*pos] == Token::Eq {
|
||||
*pos += 1;
|
||||
}
|
||||
let item = match &tokens[*pos] {
|
||||
Token::Str(s) | Token::Ident(s) => {
|
||||
let s = s.clone();
|
||||
*pos += 1;
|
||||
s
|
||||
}
|
||||
t => return Err(anyhow!("Expected item name, got {t:?}")),
|
||||
};
|
||||
Some(Filter {
|
||||
category: cat,
|
||||
item,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
// expect )
|
||||
if *pos < tokens.len() && tokens[*pos] == Token::RParen {
|
||||
*pos += 1;
|
||||
} else {
|
||||
return Err(anyhow!("Expected ')' to close aggregate function"));
|
||||
}
|
||||
return Ok(Expr::Agg(func, Box::new(inner), filter));
|
||||
}
|
||||
Ok(Expr::Ref(name))
|
||||
}
|
||||
"IF" => {
|
||||
if *pos < tokens.len() && tokens[*pos] == Token::LParen {
|
||||
*pos += 1;
|
||||
let cond = parse_comparison(tokens, pos)?;
|
||||
if *pos < tokens.len() && tokens[*pos] == Token::Comma {
|
||||
*pos += 1;
|
||||
}
|
||||
let then = parse_add_sub(tokens, pos)?;
|
||||
if *pos < tokens.len() && tokens[*pos] == Token::Comma {
|
||||
*pos += 1;
|
||||
}
|
||||
let else_ = parse_add_sub(tokens, pos)?;
|
||||
if *pos < tokens.len() && tokens[*pos] == Token::RParen {
|
||||
*pos += 1;
|
||||
} else {
|
||||
return Err(anyhow!("Expected ')' to close IF(...)"));
|
||||
}
|
||||
return Ok(Expr::If(Box::new(cond), Box::new(then), Box::new(else_)));
|
||||
}
|
||||
Ok(Expr::Ref(name))
|
||||
}
|
||||
_ => Ok(Expr::Ref(name)),
|
||||
}
|
||||
}
|
||||
Token::LParen => {
|
||||
*pos += 1;
|
||||
let e = parse_add_sub(tokens, pos)?;
|
||||
if *pos < tokens.len() && tokens[*pos] == Token::RParen {
|
||||
*pos += 1;
|
||||
}
|
||||
Ok(e)
|
||||
}
|
||||
t => Err(anyhow!("Unexpected token in expression: {t:?}")),
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_comparison(tokens: &[Token], pos: &mut usize) -> Result<Expr> {
|
||||
let left = parse_add_sub(tokens, pos)?;
|
||||
if *pos >= tokens.len() {
|
||||
return Ok(left);
|
||||
}
|
||||
let op = match &tokens[*pos] {
|
||||
Token::Eq => BinOp::Eq,
|
||||
Token::Ne => BinOp::Ne,
|
||||
Token::Lt => BinOp::Lt,
|
||||
Token::Gt => BinOp::Gt,
|
||||
Token::Le => BinOp::Le,
|
||||
Token::Ge => BinOp::Ge,
|
||||
_ => return Ok(left),
|
||||
};
|
||||
*pos += 1;
|
||||
let right = parse_add_sub(tokens, pos)?;
|
||||
Ok(Expr::BinOp(op, Box::new(left), Box::new(right)))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::parse_formula;
|
||||
@ -544,8 +403,8 @@ mod tests {
|
||||
assert_eq!(filter.item, "East");
|
||||
}
|
||||
|
||||
/// Regression: WHERE inside aggregate parens must tokenize correctly.
|
||||
/// The tokenizer must not merge "Revenue WHERE" into a single identifier.
|
||||
/// Regression: WHERE inside aggregate parens must parse as the
|
||||
/// aggregate's inline filter, not as a top-level WHERE clause.
|
||||
#[test]
|
||||
fn parse_sum_with_inline_where_filter() {
|
||||
let f = parse_formula("EastTotal = SUM(Revenue WHERE Region = \"East\")", "Foo").unwrap();
|
||||
@ -562,7 +421,6 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn parse_if_with_comparison_operators() {
|
||||
// Test each comparison operator in an IF expression
|
||||
let f = parse_formula("X = IF(A != 0, A, 1)", "Cat").unwrap();
|
||||
assert!(matches!(f.expr, Expr::If(_, _, _)));
|
||||
|
||||
@ -583,7 +441,6 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn parse_where_with_quoted_string_inside_expression() {
|
||||
// WHERE inside a formula string with quotes
|
||||
let f = parse_formula("X = Revenue WHERE Region = \"West Coast\"", "Foo").unwrap();
|
||||
let filter = f.filter.as_ref().unwrap();
|
||||
assert_eq!(filter.item, "West Coast");
|
||||
@ -650,11 +507,10 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
// ── Quoted string in tokenizer ──────────────────────────────────────
|
||||
// ── Quoted string in WHERE ──────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn parse_quoted_string_in_where() {
|
||||
// Quoted strings work in top-level WHERE clauses
|
||||
let f = parse_formula("X = Revenue WHERE Region = \"East\"", "Cat").unwrap();
|
||||
let filter = f.filter.as_ref().unwrap();
|
||||
assert_eq!(filter.item, "East");
|
||||
@ -681,27 +537,21 @@ mod tests {
|
||||
assert!(parse_expr("").is_err());
|
||||
}
|
||||
|
||||
// ── Multi-word identifiers must be pipe-quoted ──────────────────────
|
||||
|
||||
#[test]
|
||||
fn tokenizer_breaks_at_where_keyword() {
|
||||
use super::tokenize;
|
||||
let tokens = tokenize("Revenue WHERE Region").unwrap();
|
||||
// Should produce 3 tokens: Ident("Revenue"), Ident("WHERE"), Ident("Region")
|
||||
assert_eq!(tokens.len(), 3, "Expected 3 tokens, got: {tokens:?}");
|
||||
fn multi_word_bare_identifier_is_rejected() {
|
||||
// Multi-word identifiers must be pipe-quoted; bare multi-word fails
|
||||
// the `bare_name`-compatible grammar rule.
|
||||
assert!(parse_formula("Total Revenue = Base Revenue + Bonus", "Foo").is_err());
|
||||
}
|
||||
|
||||
// ── Multi-word identifiers ──────────────────────────────────────────
|
||||
// ── WHERE inside quotes in the expression ───────────────────────────
|
||||
|
||||
#[test]
|
||||
fn parse_multi_word_identifier() {
|
||||
let f = parse_formula("Total Revenue = Base Revenue + Bonus", "Foo").unwrap();
|
||||
assert_eq!(f.target, "Total Revenue");
|
||||
}
|
||||
|
||||
// ── WHERE inside quotes in split_where ──────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn split_where_ignores_where_inside_quotes() {
|
||||
// WHERE inside quotes should not be treated as a keyword
|
||||
fn where_inside_quotes_is_not_a_keyword() {
|
||||
// A filter value containing the literal text "WHERE" is parsed as
|
||||
// a string, not as a nested WHERE keyword.
|
||||
let f = parse_formula("X = Revenue WHERE Region = \"WHERE\"", "Foo").unwrap();
|
||||
let filter = f.filter.as_ref().unwrap();
|
||||
assert_eq!(filter.item, "WHERE");
|
||||
@ -773,4 +623,17 @@ mod tests {
|
||||
panic!("Expected SUM with WHERE filter, got: {:?}", f.expr);
|
||||
}
|
||||
}
|
||||
|
||||
// ── Pipe-quoted escape semantics ────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn pipe_quoted_escape_literal_pipe() {
|
||||
// \| inside a pipe-quoted identifier is a literal pipe
|
||||
let f = parse_formula("X = |A\\|B|", "Cat").unwrap();
|
||||
if let Expr::Ref(ref s) = f.expr {
|
||||
assert_eq!(s, "A|B");
|
||||
} else {
|
||||
panic!("Expected Ref, got: {:?}", f.expr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user