From d14ec443c27c8250c9e11a50cfcb093ba8da4a36 Mon Sep 17 00:00:00 2001 From: Edward Langley Date: Wed, 8 Apr 2026 23:30:45 -0700 Subject: [PATCH] feat(formula): improve tokenizer to correctly handle keywords and delimiters The tokenizer was greedily consuming spaces and potentially merging identifiers with subsequent keywords. This change improves the tokenizer by: - Peeking ahead past spaces to find the next word/token. - Breaking the identifier if the next word is a known keyword (WHERE, SUM, AVG, MIN, MAX, COUNT, IF). - Adding support for more delimiter characters (<, >, =, !, "). This fixes a regression where "Revenue WHERE" was treated as a single identifier instead of an identifier followed by a WHERE clause. Includes a new regression test for inline WHERE filters in aggregate functions. Co-Authored-By: fiddlerwoaroof/git-smart-commit (unsloth/gemma-4-26B-A4B-it-GGUF:UD-Q5_K_XL) --- src/formula/parser.rs | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/src/formula/parser.rs b/src/formula/parser.rs index a58781c..4fd39d8 100644 --- a/src/formula/parser.rs +++ b/src/formula/parser.rs @@ -191,7 +191,7 @@ fn tokenize(s: &str) -> Result> { { // Don't consume trailing spaces if next non-space is operator if chars[i] == ' ' { - // Peek ahead + // Peek ahead past spaces to find the next word/token let j = i + 1; let next_nonspace = chars[j..].iter().find(|&&c| c != ' '); if matches!( @@ -203,10 +203,29 @@ fn tokenize(s: &str) -> Result> { | Some('^') | Some(')') | Some(',') + | Some('<') + | Some('>') + | Some('=') + | Some('!') + | Some('"') | None ) { break; } + // Also break if the next word is a keyword + let rest: String = chars[j..].iter().collect(); + let next_word: String = rest + .trim_start() + .chars() + .take_while(|c| c.is_alphanumeric() || *c == '_') + .collect(); + let upper = next_word.to_ascii_uppercase(); + if matches!( + upper.as_str(), + "WHERE" | "SUM" | "AVG" | "MIN" | "MAX" | "COUNT" | "IF" + ) { + break; + } } ident.push(chars[i]); i += 1; @@ -503,6 +522,24 @@ mod tests { assert_eq!(filter.item, "East"); } + /// Regression: WHERE inside aggregate parens must tokenize correctly. + /// The tokenizer must not merge "Revenue WHERE" into a single identifier. + #[test] + fn parse_sum_with_inline_where_filter() { + let f = parse_formula( + "EastTotal = SUM(Revenue WHERE Region = \"East\")", + "Measure", + ) + .unwrap(); + if let Expr::Agg(AggFunc::Sum, inner, Some(filter)) = &f.expr { + assert!(matches!(**inner, Expr::Ref(_))); + assert_eq!(filter.category, "Region"); + assert_eq!(filter.item, "East"); + } else { + panic!("Expected SUM with inline WHERE filter, got: {:?}", f.expr); + } + } + // ── Comparison operators ──────────────────────────────────────────── #[test]