refactor(io): move persistence and import into improvise-io (improvise-8zh)

Relocate the two I/O module trees into the improvise-io sub-crate
scaffolded in the previous commit:

  git mv src/persistence -> crates/improvise-io/src/persistence
  git mv src/import      -> crates/improvise-io/src/import

The grammar file `improv.pest` moves alongside `persistence/mod.rs`;
the `#[grammar = "persistence/improv.pest"]` attribute resolves relative
to the new crate root and keeps working unchanged.

No path edits inside the moved code: the `crate::model::*`,
`crate::view::*`, `crate::workbook::*`, `crate::format::*`, and
`crate::formula::*` imports inside persistence and import all continue
to resolve because improvise-io's lib.rs re-exports those modules from
improvise-core and improvise-formula, mirroring the pattern improvise-core
uses for `formula`. Verified no `crate::ui::*`, `crate::command::*`,
`crate::draw::*` imports exist in the moved code (per improvise-8zh
acceptance criterion #3).

Main-crate `src/lib.rs` now re-exports `import` and `persistence` from
improvise-io, keeping every `crate::persistence::*` and `crate::import::*`
path in the 4 consumer files (ui/app.rs, ui/effect.rs,
ui/import_wizard_ui.rs, main.rs) resolving unchanged — no downstream
edits needed.

`examples/gen-grammar.rs` had `include_str!("../src/persistence/improv.pest")`;
updated the relative path to the new location under
`crates/improvise-io/src/persistence/`.

Verification:
- cargo check --workspace --examples: clean
- cargo test --workspace: 616 passing (219 main + 190 core + 65 formula + 142 io)
- cargo clippy --workspace --tests: clean
- cargo build -p improvise-io: standalone build succeeds, confirming no
  UI/command leakage into the IO crate (improvise-8zh acceptance #2, #3)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Edward Langley
2026-04-15 23:08:00 -07:00
parent bd17aed169
commit 5807464fc7
9 changed files with 7 additions and 5 deletions

View File

@ -0,0 +1,292 @@
use chrono::{Datelike, NaiveDate};
use serde_json::Value;
use std::collections::HashSet;
/// Inferred role of an imported field, proposed by [`analyze_records`].
///
/// Fieldless enum, so `Copy`/`Eq` are derived alongside the original
/// `Clone`/`PartialEq` (backward compatible; callers matching or cloning
/// the value are unaffected).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FieldKind {
    /// Small number of distinct string values → dimension/category
    Category,
    /// Numeric values → measure
    Measure,
    /// Date/time strings → time category
    TimeCategory,
    /// Many unique strings (IDs, names) → label/identifier
    Label,
}
/// Date components that can be extracted from a date field.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DateComponent {
    /// Calendar year; rendered as "YYYY" by `extract_date_component`.
    Year,
    /// Year-month; rendered as "YYYY-MM".
    Month,
    /// Calendar quarter; rendered as "YYYY-Qn" (n in 1..=4).
    Quarter,
}
/// One field's proposed import interpretation, produced by `analyze_records`
/// (presumably surfaced in the import wizard for the user to adjust — confirm
/// against the wizard UI).
#[derive(Debug, Clone)]
pub struct FieldProposal {
    /// Field (column/key) name as it appears in the source records.
    pub field: String,
    /// Inferred role of the field.
    pub kind: FieldKind,
    /// Distinct string values observed; empty for Measure and mixed fields.
    pub distinct_values: Vec<String>,
    /// Whether the proposal is accepted; `analyze_records` initializes to true.
    pub accepted: bool,
    /// Detected chrono format string (e.g., "%m/%d/%Y"). Only set for TimeCategory.
    pub date_format: Option<String>,
    /// Which date components to extract as new categories.
    pub date_components: Vec<DateComponent>,
}
impl FieldProposal {
pub fn kind_label(&self) -> &'static str {
match self.kind {
FieldKind::Category => "Category (dimension)",
FieldKind::Measure => "Measure (numeric)",
FieldKind::TimeCategory => "Time Category",
FieldKind::Label => "Label (per-row, drill-view only)",
}
}
}
/// Common date formats to try, in order of preference.
///
/// Order matters: `detect_date_format` returns the FIRST format that parses
/// every probed sample, so `%m/%d/%Y` wins over `%d/%m/%Y` whenever the
/// samples are ambiguous between the two (e.g. all day values ≤ 12).
const DATE_FORMATS: &[&str] = &[
    "%Y-%m-%d",  // 2025-04-02
    "%m/%d/%Y",  // 04/02/2025
    "%m/%d/%y",  // 04/02/25
    "%d/%m/%Y",  // 02/04/2025
    "%Y%m%d",    // 20250402
    "%b %d, %Y", // Apr 02, 2025
    "%B %d, %Y", // April 02, 2025
    "%d-%b-%Y",  // 02-Apr-2025
];
/// Try to detect a chrono date format from sample values.
/// Returns the first format that successfully parses all non-empty samples.
pub fn detect_date_format(samples: &[&str]) -> Option<String> {
let samples: Vec<&str> = samples.iter().copied().filter(|s| !s.is_empty()).collect();
if samples.is_empty() {
return None;
}
// Try up to 10 samples for efficiency
let test_samples: Vec<&str> = samples.into_iter().take(10).collect();
for fmt in DATE_FORMATS {
if test_samples
.iter()
.all(|s| NaiveDate::parse_from_str(s, fmt).is_ok())
{
return Some(fmt.to_string());
}
}
None
}
/// Parse a date string with the given chrono `format` and render one
/// extracted component.
///
/// Returns `None` when `value` does not parse under `format`. Rendering:
/// Year → "YYYY", Month → "YYYY-MM", Quarter → "YYYY-Qn".
pub fn extract_date_component(
    value: &str,
    format: &str,
    component: DateComponent,
) -> Option<String> {
    let date = NaiveDate::parse_from_str(value, format).ok()?;
    Some(match component {
        // `.to_string()` instead of `format!("{}", …)` — same output,
        // idiomatic stringification.
        DateComponent::Year => date.format("%Y").to_string(),
        DateComponent::Month => date.format("%Y-%m").to_string(),
        DateComponent::Quarter => {
            // month0 is 0-based, so months 0..=2 → Q1, …, 9..=11 → Q4.
            let quarter = (date.month0() / 3) + 1;
            format!("{}-Q{quarter}", date.format("%Y"))
        }
    })
}
/// Maximum number of distinct string values for a field to be proposed as a
/// Category; above this, `analyze_records` proposes Label instead.
const CATEGORY_THRESHOLD: usize = 20;
/// Inspect JSON-object records and propose a [`FieldKind`] for every field.
///
/// Per field, considering only the records that contain it:
/// - all numeric values → `Measure`
/// - all strings that share a detectable date format → `TimeCategory`
/// - all strings with ≤ `CATEGORY_THRESHOLD` distinct values → `Category`
/// - all strings with more distinct values → `Label`
/// - anything else (mixed types, nulls, booleans, …) → `Label`
///
/// Returns an empty vector for empty input. Field order follows first
/// appearance across the records. `distinct_values` are sorted so the output
/// is deterministic — previously they carried unspecified, run-to-run-varying
/// `HashSet` iteration order, which also made the 10-sample date probe
/// nondeterministic for fields with more than 10 distinct values.
pub fn analyze_records(records: &[Value]) -> Vec<FieldProposal> {
    if records.is_empty() {
        return vec![];
    }
    // Collect all field names in first-seen order.
    let mut fields: Vec<String> = Vec::new();
    for record in records {
        if let Value::Object(map) = record {
            for key in map.keys() {
                if !fields.contains(key) {
                    fields.push(key.clone());
                }
            }
        }
    }
    fields
        .into_iter()
        .map(|field| {
            // Only the records that actually contain this field count.
            let values: Vec<&Value> = records.iter().filter_map(|r| r.get(&field)).collect();
            let all_numeric = values.iter().all(|v| v.is_number());
            let all_string = values.iter().all(|v| v.is_string());
            if all_numeric {
                return FieldProposal {
                    field,
                    kind: FieldKind::Measure,
                    distinct_values: vec![],
                    accepted: true,
                    date_format: None,
                    date_components: vec![],
                };
            }
            if all_string {
                let distinct: HashSet<&str> = values.iter().filter_map(|v| v.as_str()).collect();
                let mut distinct_vec: Vec<String> =
                    distinct.into_iter().map(String::from).collect();
                // Determinism fix: sort instead of exposing HashSet order.
                distinct_vec.sort_unstable();
                let n = distinct_vec.len();
                // Try chrono-based date detection first: a date column with
                // few distinct values should still become a TimeCategory.
                let samples: Vec<&str> = distinct_vec.iter().map(|s| s.as_str()).collect();
                let date_format = detect_date_format(&samples);
                if date_format.is_some() {
                    return FieldProposal {
                        field,
                        kind: FieldKind::TimeCategory,
                        distinct_values: distinct_vec,
                        accepted: true,
                        date_format,
                        date_components: vec![],
                    };
                }
                // Few distinct values → dimension; many → per-row label.
                let kind = if n <= CATEGORY_THRESHOLD {
                    FieldKind::Category
                } else {
                    FieldKind::Label
                };
                return FieldProposal {
                    field,
                    kind,
                    distinct_values: distinct_vec,
                    accepted: true,
                    date_format: None,
                    date_components: vec![],
                };
            }
            // Mixed or other: treat as label
            FieldProposal {
                field,
                kind: FieldKind::Label,
                distinct_values: vec![],
                accepted: true,
                date_format: None,
                date_components: vec![],
            }
        })
        .collect()
}
/// Extract nested array from JSON by dot-path
///
/// An empty path addresses the root value itself. Returns `None` if any path
/// segment is missing or the final value is not an array.
pub fn extract_array_at_path<'a>(value: &'a Value, path: &str) -> Option<&'a Vec<Value>> {
    if path.is_empty() {
        return value.as_array();
    }
    // Walk segment by segment; any miss short-circuits to None.
    path.split('.')
        .try_fold(value, |node, segment| node.get(segment))?
        .as_array()
}
/// Find candidate paths to arrays in JSON
///
/// Returns dot-separated paths ("" for the root when it is itself an array).
pub fn find_array_paths(value: &Value) -> Vec<String> {
    let mut found = Vec::new();
    find_array_paths_inner(value, "", &mut found);
    found
}
/// Depth-first helper for [`find_array_paths`]: records `prefix` when the
/// current value is an array, recurses through object entries, and ignores
/// scalars.
fn find_array_paths_inner(value: &Value, prefix: &str, paths: &mut Vec<String>) {
    if value.is_array() {
        paths.push(prefix.to_string());
    } else if let Value::Object(map) = value {
        for (key, child) in map {
            let child_path = if prefix.is_empty() {
                key.clone()
            } else {
                format!("{prefix}.{key}")
            };
            find_array_paths_inner(child, &child_path, paths);
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn detect_iso_date_format() {
        let samples = vec!["2025-01-15", "2025-02-28", "2024-12-01"];
        assert_eq!(detect_date_format(&samples), Some("%Y-%m-%d".to_string()));
    }

    #[test]
    fn detect_us_date_format() {
        let samples = vec!["03/31/2026", "01/15/2025", "12/25/2024"];
        assert_eq!(detect_date_format(&samples), Some("%m/%d/%Y".to_string()));
    }

    #[test]
    fn detect_short_year_format() {
        // Two-digit years are ambiguous with four-digit format, so %m/%d/%Y
        // matches first. This is expected — the user can override in the wizard.
        // Hence only is_some() is asserted, not the exact format.
        let samples = vec!["03/31/26", "01/15/25"];
        assert!(detect_date_format(&samples).is_some());
    }

    #[test]
    fn detect_no_date_format() {
        let samples = vec!["hello", "world"];
        assert_eq!(detect_date_format(&samples), None);
    }

    #[test]
    fn extract_year_component() {
        let result = extract_date_component("03/31/2026", "%m/%d/%Y", DateComponent::Year);
        assert_eq!(result, Some("2026".to_string()));
    }

    #[test]
    fn extract_month_component() {
        let result = extract_date_component("03/31/2026", "%m/%d/%Y", DateComponent::Month);
        assert_eq!(result, Some("2026-03".to_string()));
    }

    #[test]
    fn extract_quarter_component() {
        // March (month0 = 2) falls in Q1.
        let result = extract_date_component("03/31/2026", "%m/%d/%Y", DateComponent::Quarter);
        assert_eq!(result, Some("2026-Q1".to_string()));
    }

    #[test]
    fn extract_quarter_q4() {
        let result = extract_date_component("12/15/2025", "%m/%d/%Y", DateComponent::Quarter);
        assert_eq!(result, Some("2025-Q4".to_string()));
    }

    #[test]
    fn analyze_detects_time_category_with_format() {
        // A string column of parseable dates must be classified as
        // TimeCategory with the detected format recorded on the proposal.
        let records: Vec<Value> = vec![
            serde_json::json!({"Date": "01/15/2025", "Amount": 100}),
            serde_json::json!({"Date": "02/20/2025", "Amount": 200}),
        ];
        let proposals = analyze_records(&records);
        let date_prop = proposals.iter().find(|p| p.field == "Date").unwrap();
        assert_eq!(date_prop.kind, FieldKind::TimeCategory);
        assert_eq!(date_prop.date_format, Some("%m/%d/%Y".to_string()));
    }
}

View File

@ -0,0 +1,300 @@
use std::path::Path;
use anyhow::{Context, Result};
use csv::ReaderBuilder;
use serde_json::Value;
/// True when `path` has a `.csv` extension, compared case-insensitively
/// (so `data.CSV` qualifies). Paths without any extension return false.
pub fn csv_path_p(path: &Path) -> bool {
    matches!(path.extension(), Some(ext) if ext.eq_ignore_ascii_case("csv"))
}
/// Parse a CSV file and return records as serde_json::Value array
///
/// The reader is configured with `has_headers(true)`, so the first row is
/// always treated as the header row; fields are trimmed and rows may have
/// varying lengths (`flexible`). Each data row becomes a JSON object keyed
/// by header name via [`parse_csv_field`]; extra fields beyond the header
/// count are dropped, and fully-empty rows are skipped.
///
/// # Errors
/// Fails with context if the file cannot be opened, the header row cannot
/// be read, or any record fails to parse.
pub fn parse_csv(path: &Path) -> Result<Vec<Value>> {
    let mut reader = ReaderBuilder::new()
        .has_headers(true)
        .flexible(true)
        .trim(csv::Trim::All)
        .from_path(path)
        .with_context(|| format!("Failed to open CSV file: {}", path.display()))?;
    // Fix: the old `let has_headers = reader.headers().is_ok()` probe never
    // detected anything — the builder already forces header mode, and
    // `headers()` only errs on a read failure. That made the index-keyed
    // fallback branch unreachable and silently swallowed a genuine header
    // read error, which would then resurface confusingly on the first
    // record. Propagate the error here instead.
    let headers: Vec<String> = reader
        .headers()
        .with_context(|| "Failed to read CSV headers")?
        .iter()
        .map(|s| s.to_string())
        .collect();
    let mut records = Vec::new();
    for result in reader.records() {
        let record = result.with_context(|| "Failed to read CSV record")?;
        let mut map = serde_json::Map::new();
        for (i, field) in record.iter().enumerate() {
            // `flexible` rows may be longer than the header row; fields
            // without a matching header are dropped.
            if let Some(header) = headers.get(i) {
                map.insert(header.clone(), parse_csv_field(field));
            }
        }
        if !map.is_empty() {
            records.push(Value::Object(map));
        }
    }
    Ok(records)
}
/// Parse multiple CSV files and merge into a single JSON array.
/// Each record gets a "File" field set to the filename stem (e.g., "sales" from "sales.csv").
///
/// Records keep the order of `paths`, then the order within each file.
/// NOTE(review): a source column literally named "File" would be overwritten
/// by the stem tag — confirm that is acceptable for callers.
pub fn merge_csvs(paths: &[impl AsRef<Path>]) -> Result<Vec<Value>> {
    let mut merged = Vec::new();
    for entry in paths {
        let path = entry.as_ref();
        // Non-UTF-8 or missing stems degrade to the literal "unknown".
        let stem = path
            .file_stem()
            .and_then(|s| s.to_str())
            .unwrap_or("unknown")
            .to_string();
        for mut record in parse_csv(path)? {
            if let Value::Object(ref mut map) = record {
                map.insert("File".to_string(), Value::String(stem.clone()));
            }
            merged.push(record);
        }
    }
    Ok(merged)
}
/// Interpret one CSV field as JSON: empty → null, integer → i64 number,
/// finite float → f64 number, anything else → string.
fn parse_csv_field(field: &str) -> Value {
    if field.is_empty() {
        return Value::Null;
    }
    // Try to parse as number (integer or float)
    if let Ok(num) = field.parse::<i64>() {
        return Value::Number(serde_json::Number::from(num));
    }
    if let Ok(num) = field.parse::<f64>() {
        // Fix: Rust's f64 parser accepts "NaN"/"inf"/"-infinity", but JSON
        // has no representation for them and `from_f64` returns None. The
        // old `.unwrap_or(Number::from(0))` silently turned such fields
        // into 0; fall through and keep them as strings instead.
        if let Some(n) = serde_json::Number::from_f64(num) {
            return Value::Number(n);
        }
    }
    // Otherwise treat as string
    Value::String(field.to_string())
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::{fs, path::PathBuf};
    use tempfile::tempdir;

    /// Write `content` into a fresh temporary directory as `test.csv`.
    /// The `TempDir` handle is returned alongside the path so the directory
    /// (and the file) survive until the caller drops it.
    fn create_temp_csv(content: &str) -> (PathBuf, tempfile::TempDir) {
        let dir = tempdir().unwrap();
        let path = dir.path().join("test.csv");
        fs::write(&path, content).unwrap();
        (path, dir)
    }

    #[test]
    fn parse_simple_csv() {
        let (path, _dir) =
            create_temp_csv("Region,Product,Revenue\nEast,Shirts,1000\nWest,Shirts,800");
        let records = parse_csv(&path).unwrap();
        assert_eq!(records.len(), 2);
        assert_eq!(records[0]["Region"], Value::String("East".to_string()));
        assert_eq!(records[0]["Product"], Value::String("Shirts".to_string()));
        assert_eq!(
            records[0]["Revenue"],
            Value::Number(serde_json::Number::from(1000))
        );
    }

    #[test]
    fn parse_csv_with_floats() {
        let (path, _dir) =
            create_temp_csv("Region,Revenue,Cost\nEast,1000.50,600.25\nWest,800.75,500.00");
        let records = parse_csv(&path).unwrap();
        assert_eq!(records.len(), 2);
        assert!(records[0]["Revenue"].is_f64());
        assert_eq!(
            records[0]["Revenue"],
            Value::Number(serde_json::Number::from_f64(1000.50).unwrap())
        );
    }

    #[test]
    fn parse_csv_with_quoted_fields() {
        let (path, _dir) =
            create_temp_csv("Product,Description,Price\n\"Shirts\",\"A nice shirt\",10.00");
        let records = parse_csv(&path).unwrap();
        assert_eq!(records.len(), 1);
        assert_eq!(records[0]["Product"], Value::String("Shirts".to_string()));
        assert_eq!(
            records[0]["Description"],
            Value::String("A nice shirt".to_string())
        );
    }

    #[test]
    fn parse_csv_with_empty_values() {
        // Empty fields become JSON null, not empty strings.
        let (path, _dir) = create_temp_csv("Region,Product,Revenue\nEast,,1000\nWest,Shirts,");
        let records = parse_csv(&path).unwrap();
        assert_eq!(records.len(), 2);
        assert_eq!(records[0]["Product"], Value::Null);
        assert_eq!(records[1]["Revenue"], Value::Null);
    }

    #[test]
    fn parse_csv_mixed_types() {
        // Booleans are not parsed specially: "true"/"false" stay strings.
        let (path, _dir) =
            create_temp_csv("Name,Count,Price,Active\nWidget,5,9.99,true\nGadget,3,19.99,false");
        let records = parse_csv(&path).unwrap();
        assert_eq!(records.len(), 2);
        assert_eq!(records[0]["Name"], Value::String("Widget".to_string()));
        assert_eq!(
            records[0]["Count"],
            Value::Number(serde_json::Number::from(5))
        );
        assert!(records[0]["Price"].is_f64());
        assert_eq!(records[0]["Active"], Value::String("true".to_string()));
    }

    #[test]
    fn merge_csvs_adds_file_field_from_stem() {
        let dir = tempdir().unwrap();
        let sales = dir.path().join("sales.csv");
        let expenses = dir.path().join("expenses.csv");
        fs::write(&sales, "Region,Revenue\nEast,100\nWest,200").unwrap();
        fs::write(&expenses, "Region,Revenue\nEast,50\nWest,75").unwrap();
        let records = merge_csvs(&[sales, expenses]).unwrap();
        assert_eq!(records.len(), 4);
        // Records keep path order: both sales rows, then both expenses rows.
        assert_eq!(records[0]["File"], Value::String("sales".to_string()));
        assert_eq!(records[1]["File"], Value::String("sales".to_string()));
        assert_eq!(records[2]["File"], Value::String("expenses".to_string()));
        assert_eq!(records[3]["File"], Value::String("expenses".to_string()));
        // Original fields preserved
        assert_eq!(records[0]["Region"], Value::String("East".to_string()));
        assert_eq!(
            records[2]["Revenue"],
            Value::Number(serde_json::Number::from(50))
        );
    }

    #[test]
    fn merge_csvs_single_file_works() {
        let dir = tempdir().unwrap();
        let path = dir.path().join("data.csv");
        fs::write(&path, "Name,Value\nA,1").unwrap();
        let records = merge_csvs(&[path]).unwrap();
        assert_eq!(records.len(), 1);
        assert_eq!(records[0]["File"], Value::String("data".to_string()));
        assert_eq!(records[0]["Name"], Value::String("A".to_string()));
    }

    // ── RFC 4180 edge cases ───────────────────────────────────────────

    #[test]
    fn rfc4180_embedded_comma_in_quoted_field() {
        let (path, _dir) =
            create_temp_csv("Name,Address,Value\n\"Smith, John\",\"123 Main St, Apt 4\",100");
        let records = parse_csv(&path).unwrap();
        assert_eq!(records.len(), 1);
        assert_eq!(records[0]["Name"], Value::String("Smith, John".to_string()));
        assert_eq!(
            records[0]["Address"],
            Value::String("123 Main St, Apt 4".to_string())
        );
    }

    #[test]
    fn rfc4180_escaped_quotes_in_field() {
        // RFC 4180: doubled quotes ("") inside a quoted field represent a literal quote
        let (path, _dir) =
            create_temp_csv("Name,Description,Value\nWidget,\"A \"\"great\"\" product\",10");
        let records = parse_csv(&path).unwrap();
        assert_eq!(records.len(), 1);
        assert_eq!(
            records[0]["Description"],
            Value::String("A \"great\" product".to_string())
        );
    }

    #[test]
    fn rfc4180_newline_in_quoted_field() {
        // RFC 4180: quoted fields may contain newlines
        let (path, _dir) = create_temp_csv("Name,Notes,Value\n\"Widget\",\"Line 1\nLine 2\",10");
        let records = parse_csv(&path).unwrap();
        assert_eq!(records.len(), 1);
        assert_eq!(
            records[0]["Notes"],
            Value::String("Line 1\nLine 2".to_string())
        );
    }

    #[test]
    fn rfc4180_embedded_comma_and_quotes_combined() {
        let (path, _dir) =
            create_temp_csv("Name,Desc\n\"Smith, \"\"Jr.\"\"\",\"Said \"\"hello, world\"\"\"");
        let records = parse_csv(&path).unwrap();
        assert_eq!(records.len(), 1);
        assert_eq!(
            records[0]["Name"],
            Value::String("Smith, \"Jr.\"".to_string())
        );
        assert_eq!(
            records[0]["Desc"],
            Value::String("Said \"hello, world\"".to_string())
        );
    }

    #[test]
    fn parse_checking_csv_format() {
        // Simulates the format of /Users/edwlan/Downloads/Checking1.csv
        // (all fields quoted, negative float amounts, empty CheckNo column).
        let (path, _dir) = create_temp_csv(
            "Date,Amount,Flag,CheckNo,Description\n\
             \"03/31/2026\",\"-50.00\",\"*\",\"\",\"VENMO PAYMENT 260331\"\n\
             \"03/31/2026\",\"-240.00\",\"*\",\"\",\"ROBINHOOD DEBITS XXXXX3795\"",
        );
        let records = parse_csv(&path).unwrap();
        assert_eq!(records.len(), 2);
        assert_eq!(records[0]["Date"], Value::String("03/31/2026".to_string()));
        assert_eq!(
            records[0]["Amount"],
            Value::Number(serde_json::Number::from_f64(-50.00).unwrap())
        );
        assert_eq!(records[0]["Flag"], Value::String("*".to_string()));
        assert_eq!(records[0]["CheckNo"], Value::Null);
        assert_eq!(
            records[0]["Description"],
            Value::String("VENMO PAYMENT 260331".to_string())
        );
        assert_eq!(
            records[1]["Amount"],
            Value::Number(serde_json::Number::from_f64(-240.00).unwrap())
        );
    }
}

View File

@ -0,0 +1,3 @@
//! Module tree for the data-import pipeline.

/// Field-kind inference and chrono date-format detection over parsed records.
pub mod analyzer;
/// CSV parsing into `serde_json::Value` records (via the `csv` crate).
pub mod csv_parser;
/// Import wizard module (contents not shown in this diff).
pub mod wizard;

File diff suppressed because it is too large Load Diff

View File

@ -9,7 +9,8 @@
//! Re-exports the core modules under their conventional names so code in
//! this crate can keep using `crate::model::*`, `crate::view::*`,
//! `crate::workbook::*`, `crate::format::*`, and `crate::formula::*` paths.
//!
//! The `import` and `persistence` module trees were scaffolded empty in the
//! previous commit; their implementations now live here and resolve their
//! paths through these re-exports.
pub use improvise_core::{format, model, view, workbook};
pub use improvise_formula as formula;
pub mod import;
pub mod persistence;

View File

@ -0,0 +1,123 @@
// ── .improv file grammar (v2025-04-09) ───────────────────────────────────────
//
// Line-oriented, markdown-flavoured format for multi-dimensional models.
// Sections may appear in any order.
//
// Names: bare alphanumeric or pipe-quoted |like this|.
//   Inside pipes, backslash escapes: \| for literal pipe, \\ for backslash,
//   \n for newline.
// Values: pipe-quoted |text| or bare numbers.

// A document is the version line, the model name, an optional initial view,
// then any number of sections, anchored to the whole input (SOI..EOI).
file = {
    SOI ~
    version_line ~
    model_name ~
    initial_view? ~
    section* ~
    EOI
}

// Exact version literal; bump this token when the format changes.
version_line = { "v2025-04-09" ~ NEWLINE ~ blank_lines }
model_name = { "# " ~ rest_of_line ~ NEWLINE ~ blank_lines }
initial_view = { "Initial View: " ~ rest_of_line ~ NEWLINE ~ blank_lines }

// Silent rule (`_`): the concrete section rules appear directly in the
// parse tree with no `section` wrapper node.
section = _{
    category_section
    | formulas_section
    | data_section
    | view_section
}
// ── Category ─────────────────────────────────────────────────────────────────
category_section = {
    "## Category: " ~ rest_of_line ~ NEWLINE ~ blank_lines ~
    category_entry*
}
// Ordered choice: `group_hierarchy` and `grouped_item` are tried before
// `item_list`, so `- Jan[Q1]` parses as a grouped item rather than as a
// one-element list.
category_entry = _{ group_hierarchy | grouped_item | item_list }

// Comma-separated bare items (no group): `- Food, Gas, Total`
item_list = {
    "- " ~ name ~ ("," ~ " "* ~ name)* ~ NEWLINE ~ blank_lines
}
// Single item with group bracket: `- Jan[Q1]`
grouped_item = {
    "- " ~ name ~ "[" ~ name ~ "]" ~ NEWLINE ~ blank_lines
}
// Group-of-group line: `> Q1[H1]` nests a group under a parent group.
group_hierarchy = {
    "> " ~ name ~ "[" ~ name ~ "]" ~ NEWLINE ~ blank_lines
}
// ── Formulas ─────────────────────────────────────────────────────────────────
formulas_section = {
    "## Formulas" ~ NEWLINE ~ blank_lines ~
    formula_line*
}
// Formula text after `- ` is captured verbatim via rest_of_line.
formula_line = {
    "- " ~ rest_of_line ~ NEWLINE ~ blank_lines
}

// ── Data ─────────────────────────────────────────────────────────────────────
data_section = {
    "## Data" ~ NEWLINE ~ blank_lines ~
    data_line*
}
// `<dim>=<item>, <dim>=<item>, … = <value>`
data_line = {
    coord_list ~ " = " ~ cell_value ~ NEWLINE ~ blank_lines
}
coord_list = { coord ~ (", " ~ coord)* }
coord = { name ~ "=" ~ name }
// NOTE(review): PEG ordered choice commits — if `number` matches only a
// prefix of the value (e.g. `12abc`), pest does not retry `bare_value`, and
// the following NEWLINE match fails the whole data_line. Confirm bare values
// never start with a digit, or anchor `number` to end-of-line.
cell_value = _{ number | pipe_quoted | bare_value }
number = @{
    "-"? ~ ASCII_DIGIT+ ~ ("." ~ ASCII_DIGIT+)? ~ (("e" | "E") ~ ("+" | "-")? ~ ASCII_DIGIT+)?
}
// Fallback: everything up to the newline, taken verbatim (at least 1 char).
bare_value = @{ (!NEWLINE ~ ANY)+ }
// ── View ─────────────────────────────────────────────────────────────────────
view_section = {
    "## View: " ~ rest_of_line ~ NEWLINE ~ blank_lines ~
    view_entry*
}
// NOTE(review): `format_line`/`hidden_line`/`collapsed_line` are tried before
// `axis_line`, so a dimension literally (bare-)named `format`, `hidden`, or
// `collapsed` would be captured by the wrong rule — verify such names are
// rejected or pipe-quoted upstream.
view_entry = _{ format_line | hidden_line | collapsed_line | axis_line }
// `<dimension>: <axis_kind>[, <name>]`
axis_line = {
    name ~ ": " ~ axis_kind ~ (", " ~ name)? ~ NEWLINE ~ blank_lines
}
axis_kind = @{ "row" | "column" | "page" | "none" }
format_line = { "format: " ~ rest_of_line ~ NEWLINE ~ blank_lines }
// `hidden:`/`collapsed:` address a `<dimension>/<item-or-group>` pair.
hidden_line = { "hidden: " ~ name ~ "/" ~ name ~ NEWLINE ~ blank_lines }
collapsed_line = { "collapsed: " ~ name ~ "/" ~ name ~ NEWLINE ~ blank_lines }
// ── Names ────────────────────────────────────────────────────────────────────
//
// A name is either pipe-quoted or a bare identifier.
// Pipe-quoted: |Income, Gross| — backslash escapes inside:
//   \| = literal pipe, \\ = literal backslash, \n = newline
// Bare: no = , | [ ] / : # or newlines.
name = _{ pipe_quoted | bare_name }
pipe_quoted = { "|" ~ pipe_inner ~ "|" }
// Atomic (`@`): no implicit whitespace handling. An escape pair consumes two
// characters, so `\|` does not terminate the quoted form.
pipe_inner = @{ ("\\" ~ ANY | !"|" ~ ANY)* }
// First char: ASCII letter or underscore; then letters, digits, `_`, `-`.
bare_name = @{ ('A'..'Z' | 'a'..'z' | "_") ~ ('A'..'Z' | 'a'..'z' | '0'..'9' | "_" | "-")* }

// ── Shared ───────────────────────────────────────────────────────────────────
// Everything up to (not including) the newline; may be empty.
rest_of_line = @{ (!NEWLINE ~ ANY)* }
// Silent: swallows any run of blank lines between entries.
blank_lines = _{ NEWLINE* }

File diff suppressed because it is too large Load Diff