From d34e8eb3132e28dce1346b84c7e0717ad31d8d60 Mon Sep 17 00:00:00 2001 From: Edward Langley Date: Thu, 9 Apr 2026 02:53:13 -0700 Subject: [PATCH] feat: replace ad-hoc .improv parser with pest grammar MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add improv.pest PEG grammar as the single source of truth for the .improv file format (v2025-04-09) - Replace hand-written line scanner with pest-derived parser that walks the grammar's parse tree - Add grammar-walking test generator that reads improv.pest at test time via pest_meta and produces random valid files from the AST - Fix 6 parser bugs: newlines in text, commas in names, brackets in names, float precision, view name ambiguity, group brackets - New format: version line, Initial View header, pipe quoting (|...|), Views→Formulas→Categories→Data section order, comma-separated items - Bare names restricted to [A-Za-z_][A-Za-z0-9_-]*, everything else pipe-quoted with \| \\ \n escapes - Remove all unwrap() calls from production code, propagate errors with Result throughout parse_md - Extract shared escape_pipe/unescape_pipe/pipe_quote helpers, deduplicate hidden/collapsed formatting, add w!() macro for infallible writeln Co-Authored-By: Claude Opus 4.6 (1M context) Executed-By: spot --- src/persistence/improv.pest | 124 ++++ src/persistence/mod.rs | 1064 +++++++++++++++++++---------------- 2 files changed, 714 insertions(+), 474 deletions(-) create mode 100644 src/persistence/improv.pest diff --git a/src/persistence/improv.pest b/src/persistence/improv.pest new file mode 100644 index 0000000..2782ea9 --- /dev/null +++ b/src/persistence/improv.pest @@ -0,0 +1,124 @@ +// ── .improv file grammar (v2025-04-09) ─────────────────────────────────────── +// +// Line-oriented, markdown-flavoured format for multi-dimensional models. +// Sections may appear in any order. +// +// Names: bare alphanumeric or pipe-quoted |like this|. 
+// Inside pipes, backslash escapes: \| for literal pipe, \\ for backslash, +// \n for newline. +// Values: pipe-quoted |text| or bare numbers. + +file = { + SOI ~ + blank_lines ~ + version_line ~ + model_name ~ + initial_view? ~ + section* ~ + EOI +} + +version_line = { "v" ~ rest_of_line ~ NEWLINE ~ blank_lines } +model_name = { "# " ~ rest_of_line ~ NEWLINE ~ blank_lines } +initial_view = { "Initial View: " ~ rest_of_line ~ NEWLINE ~ blank_lines } + +section = _{ + category_section + | formulas_section + | data_section + | view_section +} + +// ── Category ───────────────────────────────────────────────────────────────── + +category_section = { + "## Category: " ~ rest_of_line ~ NEWLINE ~ blank_lines ~ + category_entry* +} + +category_entry = _{ group_hierarchy | grouped_item | item_list } + +// Comma-separated bare items (no group): `- Food, Gas, Total` +item_list = { + "- " ~ name ~ ("," ~ " "* ~ name)* ~ NEWLINE ~ blank_lines +} + +// Single item with group bracket: `- Jan[Q1]` +grouped_item = { + "- " ~ name ~ "[" ~ name ~ "]" ~ NEWLINE ~ blank_lines +} + +group_hierarchy = { + "> " ~ name ~ "[" ~ name ~ "]" ~ NEWLINE ~ blank_lines +} + +// ── Formulas ───────────────────────────────────────────────────────────────── + +formulas_section = { + "## Formulas" ~ NEWLINE ~ blank_lines ~ + formula_line* +} + +formula_line = { + "- " ~ rest_of_line ~ NEWLINE ~ blank_lines +} + +// ── Data ───────────────────────────────────────────────────────────────────── + +data_section = { + "## Data" ~ NEWLINE ~ blank_lines ~ + data_line* +} + +data_line = { + coord_list ~ " = " ~ cell_value ~ NEWLINE ~ blank_lines +} + +coord_list = { coord ~ (", " ~ coord)* } +coord = { name ~ "=" ~ name } + +cell_value = _{ number | pipe_quoted | bare_value } + +number = @{ + "-"? ~ ASCII_DIGIT+ ~ ("." ~ ASCII_DIGIT+)? ~ (("e" | "E") ~ ("+" | "-")? ~ ASCII_DIGIT+)? 
+} + +bare_value = @{ (!NEWLINE ~ ANY)+ } + +// ── View ───────────────────────────────────────────────────────────────────── + +view_section = { + "## View: " ~ rest_of_line ~ NEWLINE ~ blank_lines ~ + view_entry* +} + +view_entry = _{ format_line | hidden_line | collapsed_line | axis_line } + +axis_line = { + name ~ ": " ~ axis_kind ~ (", " ~ name)? ~ NEWLINE ~ blank_lines +} + +axis_kind = @{ "row" | "column" | "page" | "none" } + +format_line = { "format: " ~ rest_of_line ~ NEWLINE ~ blank_lines } +hidden_line = { "hidden: " ~ name ~ "/" ~ name ~ NEWLINE ~ blank_lines } +collapsed_line = { "collapsed: " ~ name ~ "/" ~ name ~ NEWLINE ~ blank_lines } + +// ── Names ──────────────────────────────────────────────────────────────────── +// +// A name is either pipe-quoted or a bare identifier. +// Pipe-quoted: |Income, Gross| — backslash escapes inside: +// \| = literal pipe, \\ = literal backslash, \n = newline +// Bare: [A-Za-z_][A-Za-z0-9_-]* only; any other name must be pipe-quoted. + +name = _{ pipe_quoted | bare_name } + +pipe_quoted = { "|" ~ pipe_inner ~ "|" } +pipe_inner = @{ ("\\" ~ ANY | !"|" ~ ANY)* } + +bare_name = @{ ('A'..'Z' | 'a'..'z' | "_") ~ ('A'..'Z' | 'a'..'z' | '0'..'9' | "_" | "-")* } + +// ── Shared ─────────────────────────────────────────────────────────────────── + +rest_of_line = @{ (!NEWLINE ~ ANY)* } +blank_lines = _{ NEWLINE* } diff --git a/src/persistence/mod.rs b/src/persistence/mod.rs index dc3b2a4..9add417 100644 --- a/src/persistence/mod.rs +++ b/src/persistence/mod.rs @@ -2,6 +2,8 @@ use anyhow::{Context, Result}; use flate2::read::GzDecoder; use flate2::write::GzEncoder; use flate2::Compression; +use pest::Parser; +use pest_derive::Parser; use std::io::{BufReader, BufWriter, Read, Write}; use std::path::Path; @@ -11,15 +13,113 @@ use crate::model::cell::{CellKey, CellValue}; use crate::model::Model; use crate::view::{Axis, GridLayout}; +#[derive(Parser)] +#[grammar = "persistence/improv.pest"] +struct ImprovParser; + +// ── Pipe quoting (shared between format 
and parse) ─────────────────────────── + +/// Check whether a name is a valid bare identifier: `[A-Za-z_][A-Za-z0-9_-]*` +fn is_bare_name(name: &str) -> bool { + let mut chars = name.chars(); + match chars.next() { + Some(c) if c.is_ascii_alphabetic() || c == '_' => {} + _ => return false, + } + chars.all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '-') +} + +/// Escape a string for use inside pipe delimiters: `\|`, `\\`, `\n`. +fn escape_pipe(s: &str) -> String { + let mut out = String::with_capacity(s.len()); + for c in s.chars() { + match c { + '|' => out.push_str("\\|"), + '\\' => out.push_str("\\\\"), + '\n' => out.push_str("\\n"), + c => out.push(c), + } + } + out +} + +/// Unescape a pipe-quoted body: `\|` → `|`, `\\` → `\`, `\n` → newline. +fn unescape_pipe(s: &str) -> String { + let mut out = String::with_capacity(s.len()); + let mut chars = s.chars(); + while let Some(c) = chars.next() { + if c == '\\' { + match chars.next() { + Some('|') => out.push('|'), + Some('\\') => out.push('\\'), + Some('n') => out.push('\n'), + Some(other) => { + out.push('\\'); + out.push(other); + } + None => out.push('\\'), + } + } else { + out.push(c); + } + } + out +} + +/// CL-style `|...|` pipe quoting unless the name is a valid bare identifier. +fn quote_name(name: &str) -> String { + if is_bare_name(name) { + name.to_string() + } else { + format!("|{}|", escape_pipe(name)) + } +} + +/// Pipe-quote unconditionally (for text cell values that must be distinguished +/// from numbers). 
+fn pipe_quote(s: &str) -> String { + format!("|{}|", escape_pipe(s)) +} + +// ── Number formatting ──────────────────────────────────────────────────────── + +fn format_number(n: f64) -> String { + if n.is_infinite() { + return if n.is_sign_positive() { + "inf".to_string() + } else { + "-inf".to_string() + }; + } + if n.is_nan() { + return "nan".to_string(); + } + if n.fract() == 0.0 && n.abs() < 1e15 { + format!("{}", n as i64) + } else { + let display = format!("{n}"); + if display.parse::<f64>() == Ok(n) { + display + } else { + format!("{n:?}") + } + } +} + +// ── File I/O ───────────────────────────────────────────────────────────────── + +fn is_gzip(path: &Path) -> bool { + path.to_str().is_some_and(|s| s.ends_with(".gz")) +} + pub fn save(model: &Model, path: &Path) -> Result<()> { let text = format_md(model); - - if path.to_str().map(|s| s.ends_with(".gz")).unwrap_or(false) { + if is_gzip(path) { let file = std::fs::File::create(path) .with_context(|| format!("Cannot create {}", path.display()))?; - let mut encoder = GzEncoder::new(BufWriter::new(file), Compression::default()); - encoder.write_all(text.as_bytes())?; - encoder.finish()?; + let mut enc = GzEncoder::new(BufWriter::new(file), Compression::default()); + enc.write_all(text.as_bytes())?; + enc.finish()?; } else { std::fs::write(path, &text).with_context(|| format!("Cannot write {}", path.display()))?; } @@ -29,18 +129,15 @@ pub fn save(model: &Model, path: &Path) -> Result<()> { pub fn load(path: &Path) -> Result<Model> { let file = std::fs::File::open(path).with_context(|| format!("Cannot open {}", path.display()))?; - - let text = if path.to_str().map(|s| s.ends_with(".gz")).unwrap_or(false) { - let mut decoder = GzDecoder::new(BufReader::new(file)); + let text = if is_gzip(path) { let mut s = String::new(); - decoder.read_to_string(&mut s)?; + GzDecoder::new(BufReader::new(file)).read_to_string(&mut s)?; s } else { let mut s = String::new(); BufReader::new(file).read_to_string(&mut s)?; s }; - if 
text.trim_start().starts_with('{') { serde_json::from_str(&text).context("Failed to deserialize model") } else { @@ -56,149 +153,97 @@ pub fn autosave_path(path: &Path) -> std::path::PathBuf { } -/// Format a number with enough precision for lossless round-trip. -fn format_number(n: f64) -> String { - if n.fract() == 0.0 && n.abs() < 1e15 { - format!("{}", n as i64) - } else { - // Use enough decimal digits to round-trip any f64. - // Rust's {:?} (Debug) uses full precision, but looks odd. - // Instead, try the default Display first; if it round-trips, use it. - let display = format!("{n}"); - if display.parse::() == Ok(n) { - display - } else { - // Fall back to repr-style full precision - format!("{n:?}") - } - } -} - -/// Characters that require pipe-quoting in a name. -const NAME_SPECIAL: &[char] = &['=', ',', '|', '[', ']', '/', ':', '#']; - -/// Format a name using CL-style `|...|` pipe quoting if it contains special -/// characters. Inside a quoted name, `\|` is a literal pipe and `\\` is a -/// literal backslash. -fn quote_name(name: &str) -> String { - if name.is_empty() || name.chars().any(|c| NAME_SPECIAL.contains(&c)) || name != name.trim() { - let mut out = String::with_capacity(name.len() + 2); - out.push('|'); - for c in name.chars() { - match c { - '|' => out.push_str("\\|"), - '\\' => out.push_str("\\\\"), - c => out.push(c), - } - } - out.push('|'); - out - } else { - name.to_string() - } -} - - /// Serialize a model to the markdown `.improv` format. pub fn format_md(model: &Model) -> String { - use std::fmt::Write; + // writeln! to a String is infallible; this macro avoids .unwrap() noise. + macro_rules! 
w { + ($dst:expr, $($arg:tt)*) => { { use std::fmt::Write; writeln!($dst, $($arg)*).ok(); } } + } + let mut out = String::new(); - writeln!(out, "v2025-04-09").unwrap(); - writeln!(out, "# {}", model.name).unwrap(); - writeln!(out, "Initial View: {}", model.active_view).unwrap(); + w!(out, "v2025-04-09"); + w!(out, "# {}", model.name); + w!(out, "Initial View: {}", model.active_view); - // Categories + // ── Views (first: typically small, orients the reader) ─────────── + for (_view_name, view) in &model.views { + w!(out, "\n## View: {}", view.name); + for (cat, axis) in &view.category_axes { + let qcat = quote_name(cat); + if *axis == Axis::Page { + if let Some(sel) = view.page_selections.get(cat) { + w!(out, "{qcat}: page, {}", quote_name(sel)); + continue; + } + } + let axis_str = match axis { + Axis::Row => "row", + Axis::Column => "column", + Axis::Page => "page", + Axis::None => "none", + }; + w!(out, "{qcat}: {axis_str}"); + } + if !view.number_format.is_empty() { + w!(out, "format: {}", view.number_format); + } + for (prefix, map) in [("hidden", &view.hidden_items), ("collapsed", &view.collapsed_groups)] + { + let mut pairs: Vec<_> = map + .iter() + .flat_map(|(cat, items)| items.iter().map(move |item| (cat.as_str(), item.as_str()))) + .collect(); + pairs.sort(); + for (cat, item) in pairs { + w!(out, "{prefix}: {}/{}", quote_name(cat), quote_name(item)); + } + } + } + + // ── Formulas ───────────────────────────────────────────────────── + if !model.formulas().is_empty() { + w!(out, "\n## Formulas"); + for f in model.formulas() { + w!(out, "- {} [{}]", f.raw, f.target_category); + } + } + + // ── Categories (items comma-separated on one line) ─────────────── for cat in model.categories.values() { - writeln!(out, "\n## Category: {}", cat.name).unwrap(); + w!(out, "\n## Category: {}", cat.name); + let mut bare: Vec = Vec::new(); + let mut grouped: Vec = Vec::new(); for item in cat.items.values() { match &item.group { - Some(g) => writeln!(out, "- {}[{}]", 
quote_name(&item.name), quote_name(g)) - .unwrap(), - None => writeln!(out, "- {}", quote_name(&item.name)).unwrap(), + Some(g) => grouped.push(format!("{}[{}]", quote_name(&item.name), quote_name(g))), + None => bare.push(quote_name(&item.name)), } } - // Group hierarchy: lines starting with `>` for groups that have a parent + if !bare.is_empty() { + w!(out, "- {}", bare.join(", ")); + } + for g_item in &grouped { + w!(out, "- {g_item}"); + } for g in &cat.groups { if let Some(parent) = &g.parent { - writeln!(out, "> {}[{}]", quote_name(&g.name), quote_name(parent)).unwrap(); + w!(out, "> {}[{}]", quote_name(&g.name), quote_name(parent)); } } } - // Formulas - if !model.formulas().is_empty() { - writeln!(out, "\n## Formulas").unwrap(); - for f in model.formulas() { - writeln!(out, "- {} [{}]", f.raw, f.target_category).unwrap(); - } - } - - // Data — sorted by coordinate string for deterministic diffs + // ── Data (last: typically the largest section) ──────────────────── let mut cells: Vec<_> = model.data.iter_cells().collect(); cells.sort_by_key(|(k, _)| coord_str(k)); if !cells.is_empty() { - writeln!(out, "\n## Data").unwrap(); + w!(out, "\n## Data"); for (key, value) in cells { let val_str = match value { CellValue::Number(n) => format_number(*n), - // Always pipe-quote text values to distinguish from numbers - CellValue::Text(s) | CellValue::Error(s) => { - let mut out = String::with_capacity(s.len() + 2); - out.push('|'); - for c in s.chars() { - match c { - '|' => out.push_str("\\|"), - '\\' => out.push_str("\\\\"), - '\n' => out.push_str("\\n"), - c => out.push(c), - } - } - out.push('|'); - out - } + CellValue::Text(s) | CellValue::Error(s) => pipe_quote(s), }; - writeln!(out, "{} = {}", coord_str(&key), val_str).unwrap(); - } - } - - // Views - for (_view_name, view) in &model.views { - writeln!(out, "\n## View: {}", view.name).unwrap(); - for (cat, axis) in &view.category_axes { - let qcat = quote_name(cat); - match axis { - Axis::Row => writeln!(out, 
"{qcat}: row").unwrap(), - Axis::Column => writeln!(out, "{qcat}: column").unwrap(), - Axis::Page => match view.page_selections.get(cat) { - Some(sel) => writeln!(out, "{qcat}: page, {}", quote_name(sel)).unwrap(), - None => writeln!(out, "{qcat}: page").unwrap(), - }, - Axis::None => writeln!(out, "{qcat}: none").unwrap(), - } - } - if !view.number_format.is_empty() { - writeln!(out, "format: {}", view.number_format).unwrap(); - } - // Hidden items (sorted for deterministic diffs) - let mut hidden: Vec<(&str, &str)> = view - .hidden_items - .iter() - .flat_map(|(cat, items)| items.iter().map(move |item| (cat.as_str(), item.as_str()))) - .collect(); - hidden.sort(); - for (cat, item) in hidden { - writeln!(out, "hidden: {}/{}", quote_name(cat), quote_name(item)).unwrap(); - } - // Collapsed groups (sorted for deterministic diffs) - let mut collapsed: Vec<(&str, &str)> = view - .collapsed_groups - .iter() - .flat_map(|(cat, gs)| gs.iter().map(move |g| (cat.as_str(), g.as_str()))) - .collect(); - collapsed.sort(); - for (cat, group) in collapsed { - writeln!(out, "collapsed: {}/{}", quote_name(cat), quote_name(group)).unwrap(); + w!(out, "{} = {}", coord_str(&key), val_str); } } @@ -206,20 +251,26 @@ pub fn format_md(model: &Model) -> String { } -/// Parse the markdown `.improv` format into a Model. +/// Parse the `.improv` format into a Model using the pest grammar. /// -/// Uses a two-pass approach so the file is order-independent: -/// pass 1 collects raw data, pass 2 builds the model with categories -/// registered before views are configured. +/// Sections may appear in any order; a two-pass approach registers categories +/// before configuring views. pub fn parse_md(text: &str) -> Result { - // ── Intermediate types ──────────────────────────────────────────────────── + use anyhow::bail; + use pest::iterators::{Pair, Pairs}; + + let file = ImprovParser::parse(Rule::file, text) + .map_err(|e| anyhow::anyhow!("Parse error: {e}"))? 
+ .next() + .ok_or_else(|| anyhow::anyhow!("Empty parse result"))?; + + // ── Intermediate collectors ────────────────────────────────────────────── struct PCategory { name: String, - items: Vec<(String, Option<String>)>, // (name, group) - group_parents: Vec<(String, String)>, // (group, parent) + items: Vec<(String, Option<String>)>, + group_parents: Vec<(String, String)>, } - struct PView { name: String, axes: Vec<(String, Axis)>, @@ -229,152 +280,190 @@ pub fn parse_md(text: &str) -> Result<Model> { collapsed: Vec<(String, String)>, } - // ── Pass 1: collect ─────────────────────────────────────────────────────── - - #[derive(PartialEq)] - enum Section { - None, - Category, - Formulas, - Data, - View, - } - let mut model_name: Option<String> = None; let mut initial_view: Option<String> = None; let mut categories: Vec<PCategory> = Vec::new(); - let mut formulas: Vec<(String, String)> = Vec::new(); // (raw, category) + let mut formulas: Vec<(String, String)> = Vec::new(); let mut data: Vec<(CellKey, CellValue)> = Vec::new(); let mut views: Vec<PView> = Vec::new(); - let mut section = Section::None; - for line in text.lines() { - let trimmed = line.trim(); - if trimmed.is_empty() { - continue; - } - // Skip version line - if trimmed.starts_with('v') && trimmed.len() <= 20 && trimmed.contains('-') { - continue; - } + // ── Helpers for walking the pest parse tree ────────────────────────────── - if let Some(rest) = trimmed.strip_prefix("Initial View: ") { - initial_view = Some(rest.trim().to_string()); - continue; - } + /// Advance an iterator, returning an error if empty. 
+ fn next<'a>(pairs: &mut Pairs<'a, Rule>, ctx: &str) -> Result<Pair<'a, Rule>> { + pairs + .next() + .ok_or_else(|| anyhow::anyhow!("Expected child in {ctx}")) + } - if trimmed.starts_with("# ") && !trimmed.starts_with("## ") { - model_name = Some(trimmed[2..].trim().to_string()); - continue; - } - if let Some(rest) = trimmed.strip_prefix("## Category: ") { - categories.push(PCategory { - name: rest.trim().to_string(), - items: Vec::new(), - group_parents: Vec::new(), - }); - section = Section::Category; - continue; - } - if trimmed == "## Formulas" { - section = Section::Formulas; - continue; - } - if trimmed == "## Data" { - section = Section::Data; - continue; - } - if let Some(rest) = trimmed.strip_prefix("## View: ") { - let name = rest.trim().to_string(); - views.push(PView { - name, - axes: Vec::new(), - page_selections: Vec::new(), - format: String::new(), - hidden: Vec::new(), - collapsed: Vec::new(), - }); - section = Section::View; - continue; - } - if trimmed.starts_with("## ") { - continue; - } + /// Extract the first child's text content, trimmed. + fn first_str(pair: Pair<'_, Rule>) -> Result<String> { + Ok(next(&mut pair.into_inner(), "first_str")? 
+ .as_str() + .trim() + .to_string()) + } - match section { - Section::Category => { - let Some(cat) = categories.last_mut() else { - continue; - }; - if let Some(rest) = trimmed.strip_prefix("- ") { - let (name, group) = parse_bracketed(rest); - cat.items.push((name, group)); - } else if let Some(rest) = trimmed.strip_prefix("> ") { - let (group, parent) = parse_bracketed(rest); - if let Some(p) = parent { - cat.group_parents.push((group, p)); - } - } + fn extract_name(pair: Pair<'_, Rule>) -> Result { + match pair.as_rule() { + Rule::bare_name => Ok(pair.as_str().to_string()), + Rule::pipe_quoted => { + let inner = next(&mut pair.into_inner(), "pipe_quoted")?; + Ok(unescape_pipe(inner.as_str())) } - Section::Formulas => { - if let Some(rest) = trimmed.strip_prefix("- ") { - let (raw, cat) = parse_bracketed(rest); - if let Some(c) = cat { - formulas.push((raw.to_string(), c.to_string())); - } - } - } - Section::Data => { - let Some((coords, value)) = parse_data_line(trimmed) else { - continue; - }; - data.push((CellKey::new(coords), value)); - } - Section::View => { - let Some(view) = views.last_mut() else { - continue; - }; - if let Some(fmt) = trimmed.strip_prefix("format: ") { - view.format = fmt.trim().to_string(); - } else if let Some(rest) = trimmed.strip_prefix("hidden: ") { - if let Some((c, i)) = parse_slash_path(rest.trim()) { - view.hidden.push((c, i)); - } - } else if let Some(rest) = trimmed.strip_prefix("collapsed: ") { - if let Some((c, g)) = parse_slash_path(rest.trim()) { - view.collapsed.push((c, g)); - } - } else if let Some((cat, rest)) = parse_name_colon(trimmed) { - if let Some(sel_rest) = rest.strip_prefix("page") { - view.axes.push((cat.clone(), Axis::Page)); - if let Some(sel) = sel_rest.strip_prefix(", ") { - let sel = parse_inline_name(sel.trim()); - view.page_selections.push((cat, sel)); - } - } else { - let axis = match rest { - "row" => Axis::Row, - "column" => Axis::Column, - "none" => Axis::None, - _ => continue, - }; - 
view.axes.push((cat, axis)); - } - } - } - Section::None => {} + _ => Ok(pair.as_str().to_string()), } } - // ── Pass 2: build ───────────────────────────────────────────────────────── + /// Extract two names from a pair's children. + fn extract_name_pair(pair: Pair<'_, Rule>) -> Result<(String, String)> { + let ctx = format!("{:?}", pair.as_rule()); + let mut parts = pair.into_inner(); + let a = extract_name(next(&mut parts, &ctx)?)?; + let b = extract_name(next(&mut parts, &ctx)?)?; + Ok((a, b)) + } + + // ── Pass 1: walk the parse tree ───────────────────────────────────────── + + for pair in file.into_inner() { + match pair.as_rule() { + Rule::version_line | Rule::EOI => {} + Rule::model_name => { + model_name = Some(first_str(pair)?); + } + Rule::initial_view => { + initial_view = Some(first_str(pair)?); + } + Rule::category_section => { + let mut inner = pair.into_inner(); + let cname = next(&mut inner, "category_section")?.as_str().trim().to_string(); + let mut pc = PCategory { + name: cname, + items: Vec::new(), + group_parents: Vec::new(), + }; + for entry in inner { + match entry.as_rule() { + Rule::item_list => { + for name_pair in entry.into_inner() { + pc.items.push((extract_name(name_pair)?, None)); + } + } + Rule::grouped_item => { + let (name, group) = extract_name_pair(entry)?; + pc.items.push((name, Some(group))); + } + Rule::group_hierarchy => { + pc.group_parents.push(extract_name_pair(entry)?); + } + _ => {} + } + } + categories.push(pc); + } + Rule::formulas_section => { + for fl in pair.into_inner() { + if fl.as_rule() == Rule::formula_line { + let raw = first_str(fl)?; + if let Some(i) = raw.rfind(" [") { + if raw.ends_with(']') { + formulas.push(( + raw[..i].to_string(), + raw[i + 2..raw.len() - 1].to_string(), + )); + } + } + } + } + } + Rule::data_section => { + for dl in pair.into_inner() { + if dl.as_rule() == Rule::data_line { + let mut dl_inner = dl.into_inner(); + let coord_list = next(&mut dl_inner, "data_line coords")?; + let 
value_pair = next(&mut dl_inner, "data_line value")?; + + let coords: Vec<_> = coord_list + .into_inner() + .filter(|p| p.as_rule() == Rule::coord) + .map(extract_name_pair) + .collect::>()?; + + let value = match value_pair.as_rule() { + Rule::number => { + CellValue::Number(value_pair.as_str().parse().unwrap_or(0.0)) + } + Rule::pipe_quoted => { + let inner = next(&mut value_pair.into_inner(), "pipe_quoted")?; + CellValue::Text(unescape_pipe(inner.as_str())) + } + Rule::bare_value => match value_pair.as_str().trim() { + "inf" => CellValue::Number(f64::INFINITY), + "-inf" => CellValue::Number(f64::NEG_INFINITY), + "nan" => CellValue::Number(f64::NAN), + s => CellValue::Text(s.to_string()), + }, + _ => CellValue::Text(value_pair.as_str().to_string()), + }; + + data.push((CellKey::new(coords), value)); + } + } + } + Rule::view_section => { + let mut inner = pair.into_inner(); + let vname = next(&mut inner, "view_section")?.as_str().trim().to_string(); + let mut pv = PView { + name: vname, + axes: Vec::new(), + page_selections: Vec::new(), + format: String::new(), + hidden: Vec::new(), + collapsed: Vec::new(), + }; + for entry in inner { + match entry.as_rule() { + Rule::axis_line => { + let mut parts = entry.into_inner(); + let cat = extract_name(next(&mut parts, "axis cat")?)?; + let kind_str = next(&mut parts, "axis kind")?.as_str(); + let axis = match kind_str { + "row" => Axis::Row, + "column" => Axis::Column, + "page" => Axis::Page, + "none" => Axis::None, + _ => bail!("Unknown axis kind: {kind_str}"), + }; + pv.axes.push((cat.clone(), axis)); + if axis == Axis::Page { + if let Some(sel_pair) = parts.next() { + pv.page_selections.push((cat, extract_name(sel_pair)?)); + } + } + } + Rule::format_line => pv.format = first_str(entry)?, + Rule::hidden_line => pv.hidden.push(extract_name_pair(entry)?), + Rule::collapsed_line => pv.collapsed.push(extract_name_pair(entry)?), + _ => {} + } + } + views.push(pv); + } + _ => {} + } + } + + // ── Pass 2: build the Model 
───────────────────────────────────────────── let name = model_name.ok_or_else(|| anyhow::anyhow!("Missing model title (# Name)"))?; let mut m = Model::new(&name); - // Categories first — registers them with all existing views via on_category_added for pc in &categories { m.add_category(&pc.name)?; - let cat = m.category_mut(&pc.name).unwrap(); + let cat = m + .category_mut(&pc.name) + .ok_or_else(|| anyhow::anyhow!("Category '{}' not found after add", pc.name))?; for (item_name, group) in &pc.items { match group { Some(g) => { @@ -396,12 +485,14 @@ pub fn parse_md(text: &str) -> Result { } } - // Views — all categories are now registered, so set_axis works correctly for pv in &views { if !m.views.contains_key(&pv.name) { m.create_view(&pv.name); } - let view = m.views.get_mut(&pv.name).unwrap(); + let view = m + .views + .get_mut(&pv.name) + .ok_or_else(|| anyhow::anyhow!("View '{}' not found after create", pv.name))?; for (cat, axis) in &pv.axes { view.set_axis(cat, *axis); } @@ -419,14 +510,12 @@ pub fn parse_md(text: &str) -> Result { } } - // Set initial view if specified if let Some(iv) = &initial_view { if m.views.contains_key(iv) { m.active_view = iv.clone(); } } - // Formulas and data can go in any order relative to each other for (raw, cat_name) in &formulas { m.add_formula(parse_formula(raw, cat_name).with_context(|| format!("Formula: {raw}"))?); } @@ -437,74 +526,6 @@ pub fn parse_md(text: &str) -> Result { Ok(m) } -/// Parse `"Name[Group]"` or `"|Name|[|Group|]"` or `"Name"`. -/// Returns (name_str, optional_group_str). Both may be pipe-quoted. 
-fn parse_bracketed(s: &str) -> (String, Option) { - let s = s.trim(); - // Parse the name part (possibly pipe-quoted) - let (name, rest) = if s.starts_with('|') { - match parse_maybe_quoted_name(s) { - Some((n, r)) => (n, r), - None => return (s.to_string(), None), - } - } else { - // Bare name: everything before `[` (if any) or end - match s.find('[') { - Some(i) => (s[..i].to_string(), &s[i..]), - None => return (s.to_string(), None), - } - }; - - // Check for [group] suffix - let rest = rest.trim(); - if rest.starts_with('[') && rest.ends_with(']') { - let inner = &rest[1..rest.len() - 1]; - let group = parse_inline_name(inner); - (name, Some(group)) - } else { - (name, None) - } -} - -/// Parse a `name/name` path where names may be pipe-quoted. -fn parse_slash_path(s: &str) -> Option<(String, String)> { - if s.starts_with('|') { - // First name is pipe-quoted — find closing pipe, then expect / - let (name, rest) = parse_maybe_quoted_name(s)?; - let rest = rest.strip_prefix('/')?; - let item = parse_inline_name(rest); - Some((name, item)) - } else { - let (head, tail) = s.split_once('/')?; - Some((head.trim().to_string(), parse_inline_name(tail.trim()))) - } -} - -/// Parse a `name: rest` line where name may be pipe-quoted. -fn parse_name_colon(s: &str) -> Option<(String, &str)> { - if s.starts_with('|') { - let (name, rest) = parse_maybe_quoted_name(s)?; - let rest = rest.strip_prefix(": ")?; - Some((name, rest)) - } else { - let colon = s.find(": ")?; - let name = s[..colon].trim().to_string(); - let rest = s[colon + 2..].trim(); - Some((name, rest)) - } -} - -/// Parse a single name that may be pipe-quoted. Returns the unquoted string. 
-fn parse_inline_name(s: &str) -> String { - let s = s.trim(); - if s.starts_with('|') { - if let Some((name, _)) = parse_maybe_quoted_name(s) { - return name; - } - } - s.to_string() -} - fn coord_str(key: &CellKey) -> String { key.0 .iter() @@ -513,110 +534,6 @@ fn coord_str(key: &CellKey) -> String { .join(", ") } -/// Parse a data line like `Cat=Item, Cat2=Item2 = "value"` into coordinates -/// and a cell value. Handles backtick-quoted names containing `=` or `, `. -fn parse_data_line(line: &str) -> Option<(Vec<(String, String)>, CellValue)> { - // Find the value separator: the last ` = ` that isn't inside a backtick-quoted name. - // Strategy: scan for ` = ` from the right, since the value is always at the end. - // But the value itself could contain ` = ` if it's a quoted text. - // The format is: coords ` = ` value - // where value is either a number or "quoted text". - // - // We find the separator by scanning from left: the first ` = ` that is NOT - // inside a backtick-quoted name is the separator. Since coordinates don't - // contain ` = ` (they use bare `=`), the first ` = ` is always the separator. - let sep = line.find(" = ")?; - let coord_part = &line[..sep]; - let value_part = line[sep + 3..].trim(); - - let coords = parse_coord_str(coord_part)?; - if coords.is_empty() { - return None; - } - - let value = if let Ok(n) = value_part.parse::() { - CellValue::Number(n) - } else { - // Text value — may be pipe-quoted or bare - CellValue::Text(parse_inline_name(value_part)) - }; - - Some((coords, value)) -} - -/// Parse a coordinate string like `Cat=Item, Cat2=Item2` into pairs. -/// Handles backtick-quoted names: `` `Income, Gross`=A ``. 
-fn parse_coord_str(s: &str) -> Option> { - let mut pairs = Vec::new(); - let mut rest = s.trim(); - - while !rest.is_empty() { - // Parse category name (possibly backtick-quoted) - let (cat, after_cat) = parse_maybe_quoted_name(rest)?; - let after_cat = after_cat.strip_prefix('=')?; - // Parse item name (possibly backtick-quoted) - let (item, after_item) = parse_maybe_quoted_name(after_cat)?; - - pairs.push((cat, item)); - - let after_item = after_item.trim_start(); - if after_item.is_empty() { - break; - } - // Expect ", " separator - rest = after_item.strip_prefix(", ")?; - } - - Some(pairs) -} - -/// Parse a name that may be pipe-quoted. Returns (name, rest_of_string). -/// Pipe-quoted: `|Income, Gross|` → `"Income, Gross"`. -/// Backslash escapes inside: `|\||` → `"|"`, `|\\|` → `"\"`, `|\n|` → newline. -/// Unquoted: stops at `=` or `, ` or end of string. -fn parse_maybe_quoted_name(s: &str) -> Option<(String, &str)> { - if let Some(inner) = s.strip_prefix('|') { - // Pipe-quoted name: scan for unescaped closing pipe - let mut name = String::new(); - let mut chars = inner.char_indices(); - while let Some((i, c)) = chars.next() { - if c == '\\' { - // Escape sequence - if let Some((_, next)) = chars.next() { - match next { - '|' => name.push('|'), - '\\' => name.push('\\'), - 'n' => name.push('\n'), - other => { - name.push('\\'); - name.push(other); - } - } - } - } else if c == '|' { - // End of quoted name - return Some((name, &inner[i + 1..])); - } else { - name.push(c); - } - } - // Unterminated pipe — treat whole thing as name - Some((name, "")) - } else { - // Unquoted: take chars until `=` or `, ` or end (whichever comes first) - let eq_pos = s.find('='); - let comma_pos = s.find(", "); - let end = match (eq_pos, comma_pos) { - (Some(a), Some(b)) => a.min(b), - (Some(a), None) => a, - (None, Some(b)) => b, - (None, None) => s.len(), - }; - let name = s[..end].trim().to_string(); - Some((name, &s[end..])) - } -} - pub fn export_csv(model: &Model, 
view_name: &str, path: &Path) -> Result<()> { let view = model .views @@ -711,10 +628,10 @@ mod tests { let m = two_cat_model(); let text = format_md(&m); assert!(text.contains("## Category: Type")); - assert!(text.contains("- Food")); - assert!(text.contains("- Gas")); + // Bare items are now comma-separated on one line + assert!(text.contains("- Food, Gas"), "expected comma-separated items:\n{text}"); assert!(text.contains("## Category: Month")); - assert!(text.contains("- Jan")); + assert!(text.contains("Jan")); } #[test] @@ -739,7 +656,7 @@ mod tests { .unwrap() .add_group(Group::new("Q1").with_parent("2025")); let text = format_md(&m); - assert!(text.contains("> Q1[2025]"), "got:\n{text}"); + assert!(text.contains("> Q1[|2025|]"), "got:\n{text}"); } #[test] @@ -930,7 +847,7 @@ mod tests { fn parse_md_order_independent_view_before_categories() { // A hand-edited file with the view section before the category sections. // The parser must still produce correct axis assignments. - let text = "# Test\n\ + let text = "v2025-04-09\n# Test\n\ ## View: Default\n\ Type: row\n\ Month: column\n\ @@ -945,8 +862,7 @@ mod tests { #[test] fn parse_md_order_independent_new_view_before_categories() { - // A non-Default view with swapped axes, declared before categories exist. 
- let text = "# Test\n\ + let text = "v2025-04-09\n# Test\n\ ## View: Transposed\n\ Type: column\n\ Month: row\n\ @@ -968,7 +884,7 @@ mod tests { #[test] fn parse_md_order_independent_data_before_categories() { - let text = "# Test\n\ + let text = "v2025-04-09\n# Test\n\ ## Data\n\ Month=Jan, Type=Food = 42\n\ ## Category: Type\n\ @@ -1094,11 +1010,11 @@ mod tests { #[test] fn parse_md_ignores_blank_and_comment_lines() { - let text = r#"# Test Model + let text = r#"v2025-04-09 +# Test Model ## Category: Type -- Food -- Gas +- Food, Gas ## Data Type=Food = 42 @@ -1945,42 +1861,36 @@ mod parser_edge_cases { #[test] fn parse_just_model_name() { - let m = parse_md("# MyModel\n").unwrap(); + let m = parse_md("v2025-04-09\n# MyModel\n").unwrap(); assert_eq!(m.name, "MyModel"); } #[test] fn parse_data_without_value() { - // Malformed data line: no " = " separator - let text = "# Test\n## Data\nType=Food\n"; - let m = parse_md(text).unwrap(); - // Should silently skip the malformed line - assert_eq!(m.data.iter_cells().count(), 0); + // Malformed data line: no " = " separator — pest rejects it + let text = "v2025-04-09\n# Test\n## Data\nType=Food\n"; + assert!(parse_md(text).is_err()); } #[test] fn parse_data_with_empty_coords() { - // Data line with only value, no coordinates - let text = "# Test\n## Data\n = 42\n"; - let m = parse_md(text).unwrap(); - assert_eq!(m.data.iter_cells().count(), 0); + // Data line with only value, no coordinates — pest rejects it + let text = "v2025-04-09\n# Test\n## Data\n = 42\n"; + assert!(parse_md(text).is_err()); } #[test] fn parse_duplicate_categories() { - // Two categories with the same name - let text = "# Test\n## Category: Type\n- A\n## Category: Type\n- B\n"; + let text = "v2025-04-09\n# Test\n## Category: Type\n- A\n## Category: Type\n- B\n"; let m = parse_md(text).unwrap(); let cat = m.category("Type").unwrap(); - // Second declaration should win or merge let item_names: Vec<&str> = cat.items.values().map(|i| 
i.name.as_str()).collect(); - // At minimum shouldn't panic assert!(!item_names.is_empty()); } #[test] fn parse_category_with_no_items() { - let text = "# Test\n## Category: Empty\n## Category: Full\n- A\n"; + let text = "v2025-04-09\n# Test\n## Category: Empty\n## Category: Full\n- A\n"; let m = parse_md(text).unwrap(); assert!(m.category("Empty").is_some()); assert_eq!(m.category("Empty").unwrap().items.len(), 0); @@ -2077,25 +1987,26 @@ mod parser_edge_cases { #[test] fn model_name_with_leading_trailing_spaces() { - let text = "# Spaced Model \n"; + let text = "v2025-04-09\n# Spaced Model \n"; let m = parse_md(text).unwrap(); + // rest_of_line captures everything after "# "; we trim in the builder assert_eq!(m.name, "Spaced Model"); } #[test] fn category_name_with_trailing_spaces() { - let text = "# Test\n## Category: Trailing \n- Item\n"; + let text = "v2025-04-09\n# Test\n## Category: Trailing \n- Item\n"; let m = parse_md(text).unwrap(); + // rest_of_line includes trailing spaces; we trim in the builder assert!(m.category("Trailing").is_some()); } #[test] fn data_line_with_extra_whitespace() { - let text = "# Test\n## Category: T\n- A\n## Category: M\n- J\n## Data\n T=A , M=J = 42 \n"; - let m = parse_md(text).unwrap(); - // Should handle extra whitespace gracefully - let count = m.data.iter_cells().count(); - assert!(count <= 1, "At most one cell should parse: got {count}"); + // With the pest grammar, extra whitespace in data lines is rejected + let text = "v2025-04-09\n# Test\n## Category: T\n- A\n## Category: M\n- J\n## Data\n T=A , M=J = 42 \n"; + // pest grammar is strict about whitespace — this should fail + assert!(parse_md(text).is_err()); } // ── Three-category model ──────────────────────────────────────────── @@ -2173,3 +2084,208 @@ mod parser_edge_cases { ); } } + +// ── Grammar-walking file generator ─────────────────────────────────────────── +// +// Parses `improv.pest` at test time and walks the AST to generate random valid +// files. 
The generator and parser share a single source of truth: the grammar.
+
+#[cfg(test)]
+mod gen {
+    use pest_meta::ast::{Expr, RuleType};
+    use pest_meta::parser;
+    use proptest::prelude::*;
+    use std::collections::HashMap;
+
+    /// Parse the grammar file and return rules keyed by name.
+    fn load_grammar() -> HashMap<String, (RuleType, Expr)> {
+        let grammar = include_str!("improv.pest");
+        let pairs = parser::parse(parser::Rule::grammar_rules, grammar)
+            .unwrap_or_else(|e| panic!("Bad grammar: {e}"));
+        let rules = parser::consume_rules(pairs).unwrap_or_else(|e| panic!("{e:?}"));
+        rules
+            .into_iter()
+            .map(|r| (r.name.clone(), (r.ty, r.expr)))
+            .collect()
+    }
+
+    /// Recursive string generator driven by a pest `Expr`.
+    ///
+    /// `choices` is consumed left-to-right for every decision point (Choice,
+    /// Opt, Rep). If it runs out we pick the "smallest" alternative (first
+    /// branch, no repetition, skip optional).
+    struct Gen<'g> {
+        rules: &'g HashMap<String, (RuleType, Expr)>,
+        choices: Vec<u8>,
+        pos: usize,
+    }
+
+    impl<'g> Gen<'g> {
+        fn new(rules: &'g HashMap<String, (RuleType, Expr)>, choices: Vec<u8>) -> Self {
+            Self {
+                rules,
+                choices,
+                pos: 0,
+            }
+        }
+
+        /// Consume one byte of entropy, defaulting to 0.
+        fn pick(&mut self) -> u8 {
+            let v = self.choices.get(self.pos).copied().unwrap_or(0);
+            self.pos += 1;
+            v
+        }
+
+        /// Append text derived from `expr` to `out`, consuming entropy at each decision.
+        fn emit(&mut self, expr: &Expr, out: &mut String) {
+            match expr {
+                Expr::Str(s) => out.push_str(s),
+                Expr::Range(lo, hi) => {
+                    let lo = lo.chars().next().unwrap() as u32;
+                    let hi = hi.chars().next().unwrap() as u32;
+                    let range = hi - lo + 1;
+                    let ch = char::from_u32(lo + (self.pick() as u32 % range)).unwrap();
+                    out.push(ch);
+                }
+                Expr::Ident(name) => {
+                    // Built-in pest rules
+                    match name.as_str() {
+                        "ANY" => {
+                            let ch = (b'a' + self.pick() % 26) as char;
+                            out.push(ch);
+                        }
+                        "NEWLINE" => out.push('\n'),
+                        "SOI" | "EOI" => {}
+                        "ASCII_DIGIT" => {
+                            let d = (b'0' + self.pick() % 10) as char;
+                            out.push(d);
+                        }
+                        _ => {
+                            // User-defined rule; a name missing from the grammar emits nothing
+                            if let Some((_ty, expr)) = self.rules.get(name) {
+                                self.emit(expr, out);
+                            }
+                        }
+                    }
+                }
+                Expr::Seq(a, b) => {
+                    self.emit(a, out);
+                    self.emit(b, out);
+                }
+                Expr::Choice(a, b) => {
+                    // Flatten nested choices on both sides (left-to-right), whichever way they nest
+                    let mut alts: Vec<&Expr> = Vec::new();
+                    let mut stack = vec![b.as_ref(), a.as_ref()];
+                    while let Some(e) = stack.pop() {
+                        match e {
+                            Expr::Choice(l, r) => stack.extend([r.as_ref(), l.as_ref()]),
+                            other => alts.push(other),
+                        }
+                    }
+                    let idx = self.pick() as usize % alts.len();
+                    self.emit(alts[idx], out);
+                }
+                Expr::Opt(inner) => {
+                    if self.pick() % 3 != 0 {
+                        // ~66% chance of emitting
+                        self.emit(inner, out);
+                    }
+                }
+                Expr::Rep(inner) => {
+                    // 0..=3 repetitions
+                    let count = self.pick() % 4;
+                    for _ in 0..count {
+                        self.emit(inner, out);
+                    }
+                }
+                Expr::RepOnce(inner) => {
+                    // 1..=3 repetitions
+                    let count = 1 + self.pick() % 3;
+                    for _ in 0..count {
+                        self.emit(inner, out);
+                    }
+                }
+                // Lookaheads don't produce output; unsupported expressions are skipped
+                Expr::NegPred(_) | Expr::PosPred(_) => {}
+                _ => {}
+            }
+        }
+
+        /// Generate text for `rule_name`; a rule missing from the grammar yields "".
+        fn generate(&mut self, rule_name: &str) -> String {
+            let mut out = String::new();
+            if let Some((_ty, expr)) = self.rules.get(rule_name) {
+                self.emit(expr, &mut out);
+            }
+            out
+        }
+    }
+
+    /// Proptest strategy: generate a valid `.improv` file by walking the grammar.
+    pub fn improv_file() -> impl Strategy<Value = String> {
+        // Parse the grammar once per strategy rather than once per case; the
+        // random bytes are the entropy for every decision in the grammar walk.
+        let rules = load_grammar();
+        prop::collection::vec(any::<u8>(), 64..=256).prop_map(move |choices| {
+            Gen::new(&rules, choices).generate("file")
+        })
+    }
+}
+
+#[cfg(test)]
+mod grammar_prop_tests {
+    use super::{format_md, gen, parse_md};
+    use proptest::prelude::*;
+
+    proptest! {
+        #![proptest_config(ProptestConfig::with_cases(500))]
+
+        /// parse(generate()) — every generated file parses without error.
+        #[test]
+        fn generated_file_parses(file in gen::improv_file()) {
+            let result = parse_md(&file);
+            prop_assert!(result.is_ok(),
+                "Generated file failed to parse:\n{}\nError: {}",
+                file, result.unwrap_err());
+        }
+
+        /// parse(print(parse(generate()))) — round-trip through format is stable.
+        #[test]
+        fn generated_file_roundtrips(file in gen::improv_file()) {
+            let result1 = parse_md(&file);
+            // Skip inputs that don't parse (the grammar walk may produce
+            // degenerate inputs like empty model names)
+            prop_assume!(result1.is_ok());
+            let model1 = result1.unwrap();
+            let printed = format_md(&model1);
+            let model2_result = parse_md(&printed);
+            prop_assert!(model2_result.is_ok(),
+                "Re-formatted file failed to parse:\n{}\nError: {}",
+                printed, model2_result.unwrap_err());
+
+            let model2 = model2_result.unwrap();
+
+            // Model name preserved
+            prop_assert_eq!(&model1.name, &model2.name);
+
+            // Category count preserved
+            prop_assert_eq!(
+                model1.categories.len(),
+                model2.categories.len(),
+                "Category count changed"
+            );
+
+            // Cell count preserved
+            let count1 = model1.data.iter_cells().count();
+            let count2 = model2.data.iter_cells().count();
+            prop_assert_eq!(count1, count2,
+                "Cell count changed: {} → {}\nOriginal:\n{}\nRe-formatted:\n{}",
+                count1, count2, file, printed);
+
+            // Double round-trip: format(parse(format(parse(gen)))) == format(parse(gen))
+            let printed2 = format_md(&model2);
+            prop_assert_eq!(&printed, &printed2,
+                "format→parse→format not idempotent");
+        }
+    }
+}