feat(formula): support pipe-quoted identifiers |...|

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Edward Langley
2026-04-08 23:55:06 -07:00
parent 35ed6a13bf
commit fb8b6ca053
2 changed files with 125 additions and 13 deletions

View File

@ -115,17 +115,28 @@ Formulas are parsed into a typed AST (`Expr` enum) at entry time. If the syntax
is invalid, the user gets an error immediately. The evaluator only sees
well-formed trees — it does not need to handle malformed input.
### Formula Tokenizer: Multi-Word Identifiers and Keywords
### Formula Tokenizer: Identifiers and Quoting
The formula tokenizer supports multi-word identifiers (e.g., `Total Revenue`)
by allowing spaces within identifier tokens when followed by non-operator
characters. However, keywords (`WHERE`, `SUM`, `AVG`, `MIN`, `MAX`, `COUNT`,
`IF`) act as token boundaries — the tokenizer breaks an identifier when:
1. The identifier collected **so far** is a keyword (e.g., `WHERE ` stops at `WHERE`).
2. The **next word** after a space is a keyword (e.g., `Revenue WHERE` stops at `Revenue`).
**Bare identifiers** support multi-word names (e.g., `Total Revenue`): a space
is kept inside the identifier as long as what follows it is neither an operator
character nor a keyword. Keywords (`WHERE`, `SUM`, `AVG`, `MIN`, `MAX`,
`COUNT`, `IF`) act as token boundaries.
This ensures `SUM(Revenue WHERE Region = "East")` tokenizes correctly as
separate tokens while `Total Revenue` remains a single identifier.
**Pipe-quoted identifiers** (`|...|`) allow any characters — including spaces,
keywords, and operators — inside the delimiters. Use pipes when a category or
item name collides with a keyword or contains special characters:
```
|WHERE| — category named "WHERE"
|Revenue (USD)| — name with parens
|Cost + Tax| — name with operator chars
SUM(|Net Revenue| WHERE |Region Name| = |East Coast|)
```
Pipes produce `Token::Ident` (same as bare identifiers), so they work
everywhere an identifier is expected: expressions, aggregate arguments, WHERE
clause category names and filter values. Double-quoted strings (`"..."`)
remain `Token::Str` and are used only for WHERE filter values in the
`split_where` pre-parse step.
---

View File

@ -38,6 +38,12 @@ fn split_where(s: &str) -> (&str, Option<&str>) {
i += 1;
}
}
b'|' => {
i += 1;
while i < bytes.len() && bytes[i] != b'|' {
i += 1;
}
}
_ if depth == 0 => {
if s[i..].to_ascii_uppercase().starts_with("WHERE") {
let before = &s[..i];
@ -54,14 +60,23 @@ fn split_where(s: &str) -> (&str, Option<&str>) {
(s, None)
}
/// Strip one pair of matching pipe (`|...|`) or double-quote (`"..."`)
/// delimiters from a value.
///
/// Surrounding whitespace is trimmed first. The delimiters are removed only
/// when the same delimiter both opens and closes the trimmed text; anything
/// else (bare text, mismatched or unterminated quotes) is returned trimmed
/// but otherwise unchanged.
///
/// A lone `|` or `"` is a single character acting as both "start" and "end",
/// so it is not a quoted pair. It is returned as-is rather than sliced, which
/// previously panicked with an out-of-order range.
fn unquote(s: &str) -> String {
    let s = s.trim();
    ['"', '|']
        .iter()
        // `strip_suffix` runs on what is left after the prefix was removed,
        // so a one-character input can never count as a delimiter pair.
        .find_map(|&quote| s.strip_prefix(quote)?.strip_suffix(quote))
        .unwrap_or(s)
        .to_string()
}
/// Parse the text of a WHERE clause into a [`Filter`].
///
/// Accepted formats: `Category = "Item"`, `Category = |Item|`, or
/// `Category = Item`. Either side may be pipe- or double-quoted; delimiters
/// are removed via `unquote`.
///
/// # Errors
///
/// Returns an error when the clause contains no `=` at all.
fn parse_where(s: &str) -> Result<Filter> {
    // Find the first '=' that is NOT inside a "..." or |...| span, so a quoted
    // name such as `|A=B|` is not split in the middle.
    let mut open_quote: Option<char> = None;
    let eq_pos = s
        .char_indices()
        .find_map(|(idx, c)| match (open_quote, c) {
            (Some(q), _) if c == q => {
                open_quote = None;
                None
            }
            (Some(_), _) => None,
            (None, '"' | '|') => {
                open_quote = Some(c);
                None
            }
            (None, '=') => Some(idx),
            (None, _) => None,
        })
        // Unbalanced quotes can hide every '='; fall back to the first '='
        // so such input keeps parsing the way it did before.
        .or_else(|| s.find('='))
        .ok_or_else(|| anyhow!("WHERE clause must contain '=': {s}"))?;
    let category = unquote(&s[..eq_pos]);
    let item = unquote(&s[eq_pos + 1..]);
    Ok(Filter { category, item })
}
@ -176,6 +191,18 @@ fn tokenize(s: &str) -> Result<Vec<Token>> {
}
tokens.push(Token::Str(s));
}
'|' => {
i += 1;
let mut s = String::new();
while i < chars.len() && chars[i] != '|' {
s.push(chars[i]);
i += 1;
}
if i < chars.len() {
i += 1;
}
tokens.push(Token::Ident(s));
}
c if c.is_ascii_digit() || c == '.' => {
let mut num = String::new();
while i < chars.len() && (chars[i].is_ascii_digit() || chars[i] == '.') {
@ -695,4 +722,78 @@ mod tests {
let filter = f.filter.as_ref().unwrap();
assert_eq!(filter.item, "WHERE");
}
// ── Pipe-quoted identifiers ─────────────────────────────────────────
#[test]
fn pipe_quoted_identifier_in_expression() {
    // A pipe-quoted and a bare identifier are mixed on the right-hand side.
    // Both references come back without pipes, while the target is asserted
    // with its pipes still present.
    let parsed = parse_formula("|Total Revenue| = |Base Revenue| + Bonus", "Measure").unwrap();
    assert_eq!(parsed.target, "|Total Revenue|");
    match &parsed.expr {
        Expr::BinOp(BinOp::Add, left, right) => {
            assert!(matches!(**left, Expr::Ref(ref name) if name == "Base Revenue"));
            assert!(matches!(**right, Expr::Ref(ref name) if name == "Bonus"));
        }
        _ => panic!("Expected Add, got: {:?}", parsed.expr),
    }
}
#[test]
fn pipe_quoted_keyword_as_identifier() {
    // Keywords wrapped in pipes are read as plain references, so names such
    // as "WHERE" and "SUM" can appear as operands.
    let parsed = parse_formula("X = |WHERE| + |SUM|", "Cat").unwrap();
    match &parsed.expr {
        Expr::BinOp(BinOp::Add, left, right) => {
            assert!(matches!(**left, Expr::Ref(ref name) if name == "WHERE"));
            assert!(matches!(**right, Expr::Ref(ref name) if name == "SUM"));
        }
        _ => panic!("Expected Add, got: {:?}", parsed.expr),
    }
}
#[test]
fn pipe_quoted_identifier_with_special_chars() {
    // Parentheses and operator characters inside pipes belong to the name
    // instead of being tokenized on their own.
    let parsed = parse_formula("X = |Revenue (USD)| + |Cost + Tax|", "Cat").unwrap();
    match &parsed.expr {
        Expr::BinOp(BinOp::Add, left, right) => {
            assert!(matches!(**left, Expr::Ref(ref name) if name == "Revenue (USD)"));
            assert!(matches!(**right, Expr::Ref(ref name) if name == "Cost + Tax"));
        }
        _ => panic!("Expected Add, got: {:?}", parsed.expr),
    }
}
#[test]
fn pipe_quoted_in_aggregate() {
    // A pipe-quoted name is accepted as the argument of an aggregate call
    // that has no filter.
    let parsed = parse_formula("X = SUM(|Net Revenue|)", "Cat").unwrap();
    match &parsed.expr {
        Expr::Agg(AggFunc::Sum, arg, None) => {
            assert!(matches!(**arg, Expr::Ref(ref name) if name == "Net Revenue"));
        }
        _ => panic!("Expected SUM aggregate, got: {:?}", parsed.expr),
    }
}
#[test]
fn pipe_quoted_in_where_filter_value() {
    // The filter value after a trailing WHERE may be pipe-quoted; the stored
    // item has the pipes removed.
    let source = "X = Revenue WHERE Region = |East Coast|";
    let parsed = parse_formula(source, "Measure").unwrap();
    let where_filter = parsed.filter.as_ref().unwrap();
    assert_eq!(where_filter.item, "East Coast");
}
#[test]
fn pipe_quoted_in_inline_where() {
    // Inside an aggregate's WHERE, both the category and the value may be
    // pipe-quoted; each is stored without its pipes.
    let source = "X = SUM(Revenue WHERE |Region Name| = |East Coast|)";
    let parsed = parse_formula(source, "Measure").unwrap();
    match &parsed.expr {
        Expr::Agg(AggFunc::Sum, _, Some(where_filter)) => {
            assert_eq!(where_filter.category, "Region Name");
            assert_eq!(where_filter.item, "East Coast");
        }
        _ => panic!("Expected SUM with WHERE filter, got: {:?}", parsed.expr),
    }
}
}