feat(formula): improve tokenizer to correctly handle keywords and delimiters

The tokenizer was greedily consuming spaces and potentially merging identifiers with subsequent keywords.

This change improves the tokenizer by:
- Peeking ahead past spaces to find the next word/token.
- Breaking the identifier if the next word is a known keyword (WHERE, SUM, AVG, MIN, MAX, COUNT, IF).
- Adding support for more delimiter characters (<, >, =, !, ").

This fixes a regression where "Revenue WHERE" was treated as a single identifier instead of an identifier followed by a WHERE clause.

Includes a new regression test for inline WHERE filters in aggregate functions.

Co-Authored-By: fiddlerwoaroof/git-smart-commit (unsloth/gemma-4-26B-A4B-it-GGUF:UD-Q5_K_XL)
2026-04-08 23:30:45 -07:00
parent a83a4f604f
commit d14ec443c2
1 changed files with 38 additions and 1 deletions
@@ -191,7 +191,7 @@ fn tokenize(s: &str) -> Result<Vec<Token>> {
                {
                    // Don't consume trailing spaces if next non-space is operator
                    if chars[i] == ' ' {
-                        // Peek ahead
+                        // Peek ahead past spaces to find the next word/token
                        let j = i + 1;
                        let next_nonspace = chars[j..].iter().find(|&&c| c != ' ');
                        if matches!(
@@ -203,10 +203,29 @@ fn tokenize(s: &str) -> Result<Vec<Token>> {
                                | Some('^')
                                | Some(')')
                                | Some(',')
                                | Some('<')
                                | Some('>')
                                | Some('=')
                                | Some('!')
                                | Some('"')
                                | None
                        ) {
                            break;
                        }
                        // Also break if the next word is a keyword
                        let rest: String = chars[j..].iter().collect();
                        let next_word: String = rest
                            .trim_start()
                            .chars()
                            .take_while(|c| c.is_alphanumeric() || *c == '_')
                            .collect();
                        let upper = next_word.to_ascii_uppercase();
                        if matches!(
                            upper.as_str(),
                            "WHERE" | "SUM" | "AVG" | "MIN" | "MAX" | "COUNT" | "IF"
                        ) {
                            break;
                        }
                    }
                    ident.push(chars[i]);
                    i += 1;
@@ -503,6 +522,24 @@ mod tests {
        assert_eq!(filter.item, "East");
    }
    /// Regression test: an inline `WHERE` clause written inside the
    /// parentheses of an aggregate call must be parsed as that aggregate's
    /// filter. The identifier before it ("Revenue") must end at the keyword
    /// rather than absorbing it into a single "Revenue WHERE" name.
    #[test]
    fn parse_sum_with_inline_where_filter() {
        let source = "EastTotal = SUM(Revenue WHERE Region = \"East\")";
        let formula = parse_formula(source, "Measure").unwrap();

        match &formula.expr {
            Expr::Agg(AggFunc::Sum, operand, Some(condition)) => {
                // The aggregated operand must be a plain reference.
                assert!(matches!(**operand, Expr::Ref(_)));
                // The filter must carry both sides of `Region = "East"`.
                assert_eq!(condition.category, "Region");
                assert_eq!(condition.item, "East");
            }
            other => panic!("Expected SUM with inline WHERE filter, got: {:?}", other),
        }
    }
    // ── Comparison operators ────────────────────────────────────────────
    #[test]