feat(formula): improve tokenizer to correctly handle keywords and delimiters

The tokenizer was greedily consuming spaces and potentially merging identifiers with subsequent keywords.

This change improves the tokenizer by:
- Peeking ahead past spaces to find the next word/token.
- Breaking the identifier if the next word is a known keyword (WHERE, SUM, AVG, MIN, MAX, COUNT, IF).
- Adding support for more delimiter characters (<, >, =, !, ").

This fixes a regression where "Revenue WHERE" was treated as a single identifier instead of an identifier followed by a WHERE clause.

Includes a new regression test for inline WHERE filters in aggregate functions.

Co-Authored-By: fiddlerwoaroof/git-smart-commit (unsloth/gemma-4-26B-A4B-it-GGUF:UD-Q5_K_XL)
2026-04-08 23:30:45 -07:00
parent a83a4f604f
commit d14ec443c2
1 changed file with 38 additions and 1 deletion
--- a/src/formula/parser.rs
+++ b/src/formula/parser.rs
@@ -191,7 +191,7 @@ fn tokenize(s: &str) -> Result<Vec<Token>> {
                {
                    // Don't consume trailing spaces if next non-space is operator
                    if chars[i] == ' ' {
-                        // Peek ahead
+                        // Peek ahead past spaces to find the next word/token
                        let j = i + 1;
                        let next_nonspace = chars[j..].iter().find(|&&c| c != ' ');
                        if matches!(
@@ -203,10 +203,29 @@ fn tokenize(s: &str) -> Result<Vec<Token>> {
                                | Some('^')
                                | Some(')')
                                | Some(',')
+                                | Some('<')
+                                | Some('>')
+                                | Some('=')
+                                | Some('!')
+                                | Some('"')
                                | None
                        ) {
                            break;
                        }
+                        // Also break if the next word is a keyword
+                        let rest: String = chars[j..].iter().collect();
+                        let next_word: String = rest
+                            .trim_start()
+                            .chars()
+                            .take_while(|c| c.is_alphanumeric() || *c == '_')
+                            .collect();
+                        let upper = next_word.to_ascii_uppercase();
+                        if matches!(
+                            upper.as_str(),
+                            "WHERE" | "SUM" | "AVG" | "MIN" | "MAX" | "COUNT" | "IF"
+                        ) {
+                            break;
+                        }
                    }
                    ident.push(chars[i]);
                    i += 1;
@@ -503,6 +522,24 @@ mod tests {
        assert_eq!(filter.item, "East");
    }

+    /// Regression: a `WHERE` clause written inside an aggregate's parentheses must
+    /// parse as that aggregate's filter; "Revenue WHERE" must not merge into one identifier.
+    #[test]
+    fn parse_sum_with_inline_where_filter() {
+        let f = parse_formula(
+            "EastTotal = SUM(Revenue WHERE Region = \"East\")",
+            "Measure",
+        )
+        .unwrap();
+        if let Expr::Agg(AggFunc::Sum, inner, Some(filter)) = &f.expr {
+            assert!(matches!(**inner, Expr::Ref(_))); // aggregated operand is a plain reference
+            assert_eq!(filter.category, "Region"); // left-hand side of the WHERE comparison
+            assert_eq!(filter.item, "East"); // quoted right-hand side, without the quotes
+        } else {
+            panic!("Expected SUM with inline WHERE filter, got: {:?}", f.expr);
+        }
+    }
+
    // ── Comparison operators ────────────────────────────────────────────

    #[test]