fix(formula): break tokenizer identifiers when current word is a keyword
The tokenizer already broke multi-word identifiers when the NEXT word was a keyword, but not when the identifier collected SO FAR was itself a keyword. As a result, "WHERE Region" was merged into a single token when tokenizing "SUM(Revenue WHERE Region = East)". The tokenizer now also checks whether the identifier built up to the current space is a keyword (WHERE, SUM, AVG, MIN, MAX, COUNT, IF), so "Revenue WHERE Region" correctly yields the separate tokens "Revenue", "WHERE", and "Region".

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@ -212,6 +212,14 @@ fn tokenize(s: &str) -> Result<Vec<Token>> {
|
||||
) {
|
||||
break;
|
||||
}
|
||||
// Break if the identifier collected so far is a keyword
|
||||
let trimmed = ident.trim_end().to_ascii_uppercase();
|
||||
if matches!(
|
||||
trimmed.as_str(),
|
||||
"WHERE" | "SUM" | "AVG" | "MIN" | "MAX" | "COUNT" | "IF"
|
||||
) {
|
||||
break;
|
||||
}
|
||||
// Also break if the next word is a keyword
|
||||
let rest: String = chars[j..].iter().collect();
|
||||
let next_word: String = rest
|
||||
@ -500,15 +508,6 @@ mod tests {
|
||||
|
||||
// ── Aggregate with WHERE filter ─────────────────────────────────────
|
||||
|
||||
/// NOTE: WHERE inside aggregate parens is broken when the inner expression
|
||||
/// is a bare identifier. The tokenizer treats "Revenue WHERE" as a single
|
||||
/// multi-word identifier because it greedily consumes spaces followed by
|
||||
/// non-operator characters. The WHERE-inside-aggregate syntax only works
|
||||
/// if the inner expression is a number, parenthesized, or otherwise
|
||||
/// terminated before the WHERE keyword.
|
||||
///
|
||||
/// Top-level WHERE (outside parens) works fine because split_where handles
|
||||
/// it before tokenization.
|
||||
#[test]
|
||||
fn parse_sum_with_top_level_where_works() {
|
||||
let f = parse_formula(
|
||||
@ -667,6 +666,14 @@ mod tests {
|
||||
assert!(parse_expr("").is_err());
|
||||
}
|
||||
|
||||
/// Regression test: tokenizing "Revenue WHERE Region" must yield three
/// tokens instead of merging words across the WHERE keyword.
///
/// Only the token count is asserted; the intended tokens are
/// Ident("Revenue"), Ident("WHERE"), Ident("Region").
#[test]
fn tokenizer_breaks_at_where_keyword() {
    let input = "Revenue WHERE Region";
    let tokens = super::tokenize(input).unwrap();

    let count = tokens.len();
    assert_eq!(count, 3, "Expected 3 tokens, got: {tokens:?}");
}
|
||||
|
||||
// ── Multi-word identifiers ──────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
|
||||
Reference in New Issue
Block a user