diff --git a/context/design-principles.md b/context/design-principles.md index 2bfe0b5..137b790 100644 --- a/context/design-principles.md +++ b/context/design-principles.md @@ -115,17 +115,28 @@ Formulas are parsed into a typed AST (`Expr` enum) at entry time. If the syntax is invalid, the user gets an error immediately. The evaluator only sees well-formed trees — it does not need to handle malformed input. -### Formula Tokenizer: Multi-Word Identifiers and Keywords +### Formula Tokenizer: Identifiers and Quoting -The formula tokenizer supports multi-word identifiers (e.g., `Total Revenue`) -by allowing spaces within identifier tokens when followed by non-operator -characters. However, keywords (`WHERE`, `SUM`, `AVG`, `MIN`, `MAX`, `COUNT`, -`IF`) act as token boundaries — the tokenizer breaks an identifier when: -1. The identifier collected **so far** is a keyword (e.g., `WHERE ` stops at `WHERE`). -2. The **next word** after a space is a keyword (e.g., `Revenue WHERE` stops at `Revenue`). +**Bare identifiers** support multi-word names (e.g., `Total Revenue`) by +allowing spaces when followed by non-operator, non-keyword characters. Keywords +(`WHERE`, `SUM`, `AVG`, `MIN`, `MAX`, `COUNT`, `IF`) act as token boundaries. -This ensures `SUM(Revenue WHERE Region = "East")` tokenizes correctly as -separate tokens while `Total Revenue` remains a single identifier. +**Pipe-quoted identifiers** (`|...|`) allow any characters — including spaces, +keywords, and operators — inside the delimiters. Use pipes when a category or +item name collides with a keyword or contains special characters: + +``` +|WHERE| — category named "WHERE" +|Revenue (USD)| — name with parens +|Cost + Tax| — name with operator chars +SUM(|Net Revenue| WHERE |Region Name| = |East Coast|) +``` + +Pipes produce `Token::Ident` (same as bare identifiers), so they work +everywhere an identifier is expected: expressions, aggregate arguments, WHERE +clause category names and filter values. 
/// Strip one matching pair of pipe (`|...|`) or double-quote (`"..."`)
/// delimiters from a value.
///
/// Surrounding whitespace is trimmed first. If the trimmed text both starts
/// and ends with the same delimiter character (and those are two distinct
/// characters), the text between them is returned; otherwise the trimmed text
/// is returned unchanged. Only the outermost pair is removed, so delimiters
/// inside the value are preserved.
///
/// A lone delimiter such as `"` or `|` is returned as-is rather than being
/// treated as an empty quoted value: slicing `s[1..s.len() - 1]` on a
/// one-character string would panic, and this function is fed user-typed
/// WHERE-clause fragments.
fn unquote(s: &str) -> String {
    let trimmed = s.trim();
    // `strip_prefix` followed by `strip_suffix` on the remainder only succeeds
    // when the opening and closing delimiters are separate characters, which
    // rules out the single-character case without manual length arithmetic.
    let strip_pair = |delim: char| {
        trimmed
            .strip_prefix(delim)
            .and_then(|rest| rest.strip_suffix(delim))
    };
    strip_pair('"')
        .or_else(|| strip_pair('|'))
        .unwrap_or(trimmed)
        .to_string()
}
=> { let mut num = String::new(); while i < chars.len() && (chars[i].is_ascii_digit() || chars[i] == '.') { @@ -695,4 +722,78 @@ mod tests { let filter = f.filter.as_ref().unwrap(); assert_eq!(filter.item, "WHERE"); } + + // ── Pipe-quoted identifiers ───────────────────────────────────────── + + #[test] + fn pipe_quoted_identifier_in_expression() { + let f = parse_formula("|Total Revenue| = |Base Revenue| + Bonus", "Measure").unwrap(); + assert_eq!(f.target, "|Total Revenue|"); + if let Expr::BinOp(BinOp::Add, lhs, rhs) = &f.expr { + assert!(matches!(**lhs, Expr::Ref(ref s) if s == "Base Revenue")); + assert!(matches!(**rhs, Expr::Ref(ref s) if s == "Bonus")); + } else { + panic!("Expected Add, got: {:?}", f.expr); + } + } + + #[test] + fn pipe_quoted_keyword_as_identifier() { + // A category named "WHERE" can be referenced with pipes + let f = parse_formula("X = |WHERE| + |SUM|", "Cat").unwrap(); + if let Expr::BinOp(BinOp::Add, lhs, rhs) = &f.expr { + assert!(matches!(**lhs, Expr::Ref(ref s) if s == "WHERE")); + assert!(matches!(**rhs, Expr::Ref(ref s) if s == "SUM")); + } else { + panic!("Expected Add, got: {:?}", f.expr); + } + } + + #[test] + fn pipe_quoted_identifier_with_special_chars() { + // Pipes allow characters that would normally break tokenization + let f = parse_formula("X = |Revenue (USD)| + |Cost + Tax|", "Cat").unwrap(); + if let Expr::BinOp(BinOp::Add, lhs, rhs) = &f.expr { + assert!(matches!(**lhs, Expr::Ref(ref s) if s == "Revenue (USD)")); + assert!(matches!(**rhs, Expr::Ref(ref s) if s == "Cost + Tax")); + } else { + panic!("Expected Add, got: {:?}", f.expr); + } + } + + #[test] + fn pipe_quoted_in_aggregate() { + let f = parse_formula("X = SUM(|Net Revenue|)", "Cat").unwrap(); + if let Expr::Agg(AggFunc::Sum, inner, None) = &f.expr { + assert!(matches!(**inner, Expr::Ref(ref s) if s == "Net Revenue")); + } else { + panic!("Expected SUM aggregate, got: {:?}", f.expr); + } + } + + #[test] + fn pipe_quoted_in_where_filter_value() { + let f = 
parse_formula( + "X = Revenue WHERE Region = |East Coast|", + "Measure", + ) + .unwrap(); + let filter = f.filter.as_ref().unwrap(); + assert_eq!(filter.item, "East Coast"); + } + + #[test] + fn pipe_quoted_in_inline_where() { + let f = parse_formula( + "X = SUM(Revenue WHERE |Region Name| = |East Coast|)", + "Measure", + ) + .unwrap(); + if let Expr::Agg(AggFunc::Sum, _, Some(filter)) = &f.expr { + assert_eq!(filter.category, "Region Name"); + assert_eq!(filter.item, "East Coast"); + } else { + panic!("Expected SUM with WHERE filter, got: {:?}", f.expr); + } + } }