feat(formula): improve tokenizer to correctly handle keywords and delimiters
The tokenizer was greedily consuming spaces and potentially merging identifiers with subsequent keywords. This change improves the tokenizer by: - Peeking ahead past spaces to find the next word/token. - Breaking the identifier if the next word is a known keyword (WHERE, SUM, AVG, MIN, MAX, COUNT, IF). - Adding support for more delimiter characters (<, >, =, !, "). This fixes a regression where "Revenue WHERE" was treated as a single identifier instead of an identifier followed by a WHERE clause. Includes a new regression test for inline WHERE filters in aggregate functions. Co-Authored-By: fiddlerwoaroof/git-smart-commit (unsloth/gemma-4-26B-A4B-it-GGUF:UD-Q5_K_XL)
This commit is contained in:
@ -191,7 +191,7 @@ fn tokenize(s: &str) -> Result<Vec<Token>> {
|
||||
{
|
||||
// Don't consume trailing spaces if next non-space is operator
|
||||
if chars[i] == ' ' {
|
||||
// Peek ahead
|
||||
// Peek ahead past spaces to find the next word/token
|
||||
let j = i + 1;
|
||||
let next_nonspace = chars[j..].iter().find(|&&c| c != ' ');
|
||||
if matches!(
|
||||
@ -203,10 +203,29 @@ fn tokenize(s: &str) -> Result<Vec<Token>> {
|
||||
| Some('^')
|
||||
| Some(')')
|
||||
| Some(',')
|
||||
| Some('<')
|
||||
| Some('>')
|
||||
| Some('=')
|
||||
| Some('!')
|
||||
| Some('"')
|
||||
| None
|
||||
) {
|
||||
break;
|
||||
}
|
||||
// Also break if the next word is a keyword
|
||||
let rest: String = chars[j..].iter().collect();
|
||||
let next_word: String = rest
|
||||
.trim_start()
|
||||
.chars()
|
||||
.take_while(|c| c.is_alphanumeric() || *c == '_')
|
||||
.collect();
|
||||
let upper = next_word.to_ascii_uppercase();
|
||||
if matches!(
|
||||
upper.as_str(),
|
||||
"WHERE" | "SUM" | "AVG" | "MIN" | "MAX" | "COUNT" | "IF"
|
||||
) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
ident.push(chars[i]);
|
||||
i += 1;
|
||||
@ -503,6 +522,24 @@ mod tests {
|
||||
assert_eq!(filter.item, "East");
|
||||
}
|
||||
|
||||
/// Regression: WHERE inside aggregate parens must tokenize correctly.
|
||||
/// The tokenizer must not merge "Revenue WHERE" into a single identifier.
|
||||
#[test]
|
||||
fn parse_sum_with_inline_where_filter() {
|
||||
let f = parse_formula(
|
||||
"EastTotal = SUM(Revenue WHERE Region = \"East\")",
|
||||
"Measure",
|
||||
)
|
||||
.unwrap();
|
||||
if let Expr::Agg(AggFunc::Sum, inner, Some(filter)) = &f.expr {
|
||||
assert!(matches!(**inner, Expr::Ref(_)));
|
||||
assert_eq!(filter.category, "Region");
|
||||
assert_eq!(filter.item, "East");
|
||||
} else {
|
||||
panic!("Expected SUM with inline WHERE filter, got: {:?}", f.expr);
|
||||
}
|
||||
}
|
||||
|
||||
// ── Comparison operators ────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
|
||||
Reference in New Issue
Block a user