fix(formula): break tokenizer identifiers when current word is a keyword

The tokenizer already broke multi-word identifiers when the NEXT word
was a keyword, but not when the identifier collected SO FAR was a
keyword. This meant "WHERE Region" was merged into one token when
tokenizing "SUM(Revenue WHERE Region = East)".

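Now, at each space, the tokenizer also checks whether the identifier
collected so far is itself a keyword (WHERE, SUM, AVG, MIN, MAX, COUNT, IF;
compared case-insensitively) and ends the identifier there. As a result,
"Revenue WHERE Region" correctly produces the separate tokens "Revenue",
"WHERE", "Region".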

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Edward Langley
2026-04-08 23:41:26 -07:00
parent 4c8ba6400b
commit e67f4d5a92

View File

@ -212,6 +212,14 @@ fn tokenize(s: &str) -> Result<Vec<Token>> {
) { ) {
break; break;
} }
// Break if the identifier collected so far is a keyword
let trimmed = ident.trim_end().to_ascii_uppercase();
if matches!(
trimmed.as_str(),
"WHERE" | "SUM" | "AVG" | "MIN" | "MAX" | "COUNT" | "IF"
) {
break;
}
// Also break if the next word is a keyword // Also break if the next word is a keyword
let rest: String = chars[j..].iter().collect(); let rest: String = chars[j..].iter().collect();
let next_word: String = rest let next_word: String = rest
@ -500,15 +508,6 @@ mod tests {
// ── Aggregate with WHERE filter ───────────────────────────────────── // ── Aggregate with WHERE filter ─────────────────────────────────────
/// NOTE: WHERE inside aggregate parens is broken when the inner expression
/// is a bare identifier. The tokenizer treats "Revenue WHERE" as a single
/// multi-word identifier because it greedily consumes spaces followed by
/// non-operator characters. The WHERE-inside-aggregate syntax only works
/// if the inner expression is a number, parenthesized, or otherwise
/// terminated before the WHERE keyword.
///
/// Top-level WHERE (outside parens) works fine because split_where handles
/// it before tokenization.
#[test] #[test]
fn parse_sum_with_top_level_where_works() { fn parse_sum_with_top_level_where_works() {
let f = parse_formula( let f = parse_formula(
@ -667,6 +666,14 @@ mod tests {
assert!(parse_expr("").is_err()); assert!(parse_expr("").is_err());
} }
/// Regression test: a keyword appearing between two words must terminate the
/// multi-word identifier, so `Revenue WHERE Region` splits into three tokens.
#[test]
fn tokenizer_breaks_at_where_keyword() {
    use super::tokenize;

    let input = "Revenue WHERE Region";
    let tokens = tokenize(input).unwrap();
    let count = tokens.len();

    // One token per word: "Revenue", "WHERE", "Region".
    assert_eq!(count, 3, "Expected 3 tokens, got: {tokens:?}");
}
// ── Multi-word identifiers ────────────────────────────────────────── // ── Multi-word identifiers ──────────────────────────────────────────
#[test] #[test]