feat(formula): improve tokenizer to correctly handle keywords and delimiters
The tokenizer was greedily consuming spaces and potentially merging identifiers with subsequent keywords. This change improves the tokenizer by: - Peeking ahead past spaces to find the next word/token. - Breaking the identifier if the next word is a known keyword (WHERE, SUM, AVG, MIN, MAX, COUNT, IF). - Adding support for more delimiter characters (<, >, =, !, "). This fixes a regression where "Revenue WHERE" was treated as a single identifier instead of an identifier followed by a WHERE clause. Includes a new regression test for inline WHERE filters in aggregate functions. Co-Authored-By: fiddlerwoaroof/git-smart-commit (unsloth/gemma-4-26B-A4B-it-GGUF:UD-Q5_K_XL)
This commit is contained in:
@ -191,7 +191,7 @@ fn tokenize(s: &str) -> Result<Vec<Token>> {
|
|||||||
{
|
{
|
||||||
// Don't consume trailing spaces if next non-space is operator
|
// Don't consume trailing spaces if next non-space is operator
|
||||||
if chars[i] == ' ' {
|
if chars[i] == ' ' {
|
||||||
// Peek ahead
|
// Peek ahead past spaces to find the next word/token
|
||||||
let j = i + 1;
|
let j = i + 1;
|
||||||
let next_nonspace = chars[j..].iter().find(|&&c| c != ' ');
|
let next_nonspace = chars[j..].iter().find(|&&c| c != ' ');
|
||||||
if matches!(
|
if matches!(
|
||||||
@ -203,10 +203,29 @@ fn tokenize(s: &str) -> Result<Vec<Token>> {
|
|||||||
| Some('^')
|
| Some('^')
|
||||||
| Some(')')
|
| Some(')')
|
||||||
| Some(',')
|
| Some(',')
|
||||||
|
| Some('<')
|
||||||
|
| Some('>')
|
||||||
|
| Some('=')
|
||||||
|
| Some('!')
|
||||||
|
| Some('"')
|
||||||
| None
|
| None
|
||||||
) {
|
) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
// Also break if the next word is a keyword
|
||||||
|
let rest: String = chars[j..].iter().collect();
|
||||||
|
let next_word: String = rest
|
||||||
|
.trim_start()
|
||||||
|
.chars()
|
||||||
|
.take_while(|c| c.is_alphanumeric() || *c == '_')
|
||||||
|
.collect();
|
||||||
|
let upper = next_word.to_ascii_uppercase();
|
||||||
|
if matches!(
|
||||||
|
upper.as_str(),
|
||||||
|
"WHERE" | "SUM" | "AVG" | "MIN" | "MAX" | "COUNT" | "IF"
|
||||||
|
) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
ident.push(chars[i]);
|
ident.push(chars[i]);
|
||||||
i += 1;
|
i += 1;
|
||||||
@ -503,6 +522,24 @@ mod tests {
|
|||||||
assert_eq!(filter.item, "East");
|
assert_eq!(filter.item, "East");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Regression: WHERE inside aggregate parens must tokenize correctly.
|
||||||
|
/// The tokenizer must not merge "Revenue WHERE" into a single identifier.
|
||||||
|
#[test]
|
||||||
|
fn parse_sum_with_inline_where_filter() {
|
||||||
|
let f = parse_formula(
|
||||||
|
"EastTotal = SUM(Revenue WHERE Region = \"East\")",
|
||||||
|
"Measure",
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
if let Expr::Agg(AggFunc::Sum, inner, Some(filter)) = &f.expr {
|
||||||
|
assert!(matches!(**inner, Expr::Ref(_)));
|
||||||
|
assert_eq!(filter.category, "Region");
|
||||||
|
assert_eq!(filter.item, "East");
|
||||||
|
} else {
|
||||||
|
panic!("Expected SUM with inline WHERE filter, got: {:?}", f.expr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ── Comparison operators ────────────────────────────────────────────
|
// ── Comparison operators ────────────────────────────────────────────
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|||||||
Reference in New Issue
Block a user