feat(formula): improve tokenizer to correctly handle keywords and delimiters

The tokenizer was greedily consuming spaces and potentially merging
identifiers with subsequent keywords. This change improves the tokenizer
by:
- Peeking ahead past spaces to find the next word/token.
- Breaking the identifier if the next word is a known keyword (WHERE, SUM,
  AVG, MIN, MAX, COUNT, IF).
- Adding support for more delimiter characters (<, >, =, !, ").

This fixes a regression where "Revenue WHERE" was treated as a single
identifier instead of an identifier followed by a WHERE clause.

Includes a new regression test for inline WHERE filters in aggregate
functions.

Co-Authored-By: fiddlerwoaroof/git-smart-commit (unsloth/gemma-4-26B-A4B-it-GGUF:UD-Q5_K_XL)
This commit is contained in:
Edward Langley
2026-04-08 23:30:45 -07:00
parent a83a4f604f
commit d14ec443c2

View File

@ -191,7 +191,7 @@ fn tokenize(s: &str) -> Result<Vec<Token>> {
{
// Don't consume trailing spaces if next non-space is operator
if chars[i] == ' ' {
// Peek ahead
// Peek ahead past spaces to find the next word/token
let j = i + 1;
let next_nonspace = chars[j..].iter().find(|&&c| c != ' ');
if matches!(
@ -203,10 +203,29 @@ fn tokenize(s: &str) -> Result<Vec<Token>> {
| Some('^')
| Some(')')
| Some(',')
| Some('<')
| Some('>')
| Some('=')
| Some('!')
| Some('"')
| None
) {
break;
}
// Also break if the next word is a keyword
let rest: String = chars[j..].iter().collect();
let next_word: String = rest
.trim_start()
.chars()
.take_while(|c| c.is_alphanumeric() || *c == '_')
.collect();
let upper = next_word.to_ascii_uppercase();
if matches!(
upper.as_str(),
"WHERE" | "SUM" | "AVG" | "MIN" | "MAX" | "COUNT" | "IF"
) {
break;
}
}
ident.push(chars[i]);
i += 1;
@ -503,6 +522,24 @@ mod tests {
assert_eq!(filter.item, "East");
}
/// Regression test: a `WHERE` clause written inside an aggregate's
/// parentheses has to be tokenized as its own keyword instead of being
/// folded into the identifier before it (i.e. "Revenue WHERE" must not
/// become one identifier).
#[test]
fn parse_sum_with_inline_where_filter() {
    let source = "EastTotal = SUM(Revenue WHERE Region = \"East\")";
    let formula = parse_formula(source, "Measure").unwrap();

    match &formula.expr {
        Expr::Agg(AggFunc::Sum, operand, Some(clause)) => {
            // The aggregated operand should be a plain reference ...
            assert!(matches!(**operand, Expr::Ref(_)));
            // ... and the inline filter should carry the category/item pair.
            assert_eq!(clause.category, "Region");
            assert_eq!(clause.item, "East");
        }
        _ => panic!(
            "Expected SUM with inline WHERE filter, got: {:?}",
            formula.expr
        ),
    }
}
// ── Comparison operators ────────────────────────────────────────────
#[test]