fix(formula): break tokenizer identifiers when current word is a keyword
The tokenizer already broke multi-word identifiers when the NEXT word was a keyword, but not when the identifier collected SO FAR was itself a keyword. As a result, "WHERE Region" was merged into a single token when tokenizing "SUM(Revenue WHERE Region = East)".

Now the tokenizer also checks whether the identifier built up to the current space IS a keyword (WHERE, SUM, AVG, MIN, MAX, COUNT, IF), which correctly produces separate tokens for "Revenue", "WHERE", and "Region".

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@ -212,6 +212,14 @@ fn tokenize(s: &str) -> Result<Vec<Token>> {
|
|||||||
) {
|
) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
// Break if the identifier collected so far is a keyword
|
||||||
|
let trimmed = ident.trim_end().to_ascii_uppercase();
|
||||||
|
if matches!(
|
||||||
|
trimmed.as_str(),
|
||||||
|
"WHERE" | "SUM" | "AVG" | "MIN" | "MAX" | "COUNT" | "IF"
|
||||||
|
) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
// Also break if the next word is a keyword
|
// Also break if the next word is a keyword
|
||||||
let rest: String = chars[j..].iter().collect();
|
let rest: String = chars[j..].iter().collect();
|
||||||
let next_word: String = rest
|
let next_word: String = rest
|
||||||
@ -500,15 +508,6 @@ mod tests {
|
|||||||
|
|
||||||
// ── Aggregate with WHERE filter ─────────────────────────────────────
|
// ── Aggregate with WHERE filter ─────────────────────────────────────
|
||||||
|
|
||||||
/// NOTE: WHERE inside aggregate parens is broken when the inner expression
|
|
||||||
/// is a bare identifier. The tokenizer treats "Revenue WHERE" as a single
|
|
||||||
/// multi-word identifier because it greedily consumes spaces followed by
|
|
||||||
/// non-operator characters. The WHERE-inside-aggregate syntax only works
|
|
||||||
/// if the inner expression is a number, parenthesized, or otherwise
|
|
||||||
/// terminated before the WHERE keyword.
|
|
||||||
///
|
|
||||||
/// Top-level WHERE (outside parens) works fine because split_where handles
|
|
||||||
/// it before tokenization.
|
|
||||||
#[test]
|
#[test]
|
||||||
fn parse_sum_with_top_level_where_works() {
|
fn parse_sum_with_top_level_where_works() {
|
||||||
let f = parse_formula(
|
let f = parse_formula(
|
||||||
@ -667,6 +666,14 @@ mod tests {
|
|||||||
assert!(parse_expr("").is_err());
|
assert!(parse_expr("").is_err());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn tokenizer_breaks_at_where_keyword() {
|
||||||
|
use super::tokenize;
|
||||||
|
let tokens = tokenize("Revenue WHERE Region").unwrap();
|
||||||
|
// Should produce 3 tokens: Ident("Revenue"), Ident("WHERE"), Ident("Region")
|
||||||
|
assert_eq!(tokens.len(), 3, "Expected 3 tokens, got: {tokens:?}");
|
||||||
|
}
|
||||||
|
|
||||||
// ── Multi-word identifiers ──────────────────────────────────────────
|
// ── Multi-word identifiers ──────────────────────────────────────────
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|||||||
Reference in New Issue
Block a user