diff --git a/crates/improvise-io/src/import/csv_parser.rs b/crates/improvise-io/src/import/csv_parser.rs index 05c22f4..a02de96 100644 --- a/crates/improvise-io/src/import/csv_parser.rs +++ b/crates/improvise-io/src/import/csv_parser.rs @@ -9,8 +9,22 @@ pub fn csv_path_p(path: &Path) -> bool { .is_some_and(|ext| ext.eq_ignore_ascii_case("csv")) } -/// Parse a CSV file and return records as serde_json::Value array +/// Parse a CSV file and return records as serde_json::Value array. +/// Warnings about short rows are printed to stderr; use +/// [`parse_csv_with_warnings`] to capture them instead. pub fn parse_csv(path: &Path) -> Result> { + let (records, warnings) = parse_csv_with_warnings(path)?; + for warning in &warnings { + eprintln!("warning: {warning}"); + } + Ok(records) +} + +/// Parse a CSV file, returning records plus warnings. +/// Rows shorter than the header keep every header column — missing trailing +/// fields are filled with `Value::Null` — and each affected column is +/// reported as a warning with the number of rows it was missing from. +pub fn parse_csv_with_warnings(path: &Path) -> Result<(Vec, Vec)> { let mut reader = ReaderBuilder::new() .has_headers(true) .flexible(true) @@ -22,7 +36,7 @@ pub fn parse_csv(path: &Path) -> Result> { let has_headers = reader.headers().is_ok(); let mut records = Vec::new(); - let mut headers = Vec::new(); + let mut headers: Vec = Vec::new(); if has_headers { headers = reader @@ -33,6 +47,9 @@ pub fn parse_csv(path: &Path) -> Result> { .collect(); } + // Per-column count of rows that were too short to provide a value. + let mut missing_counts = vec![0usize; headers.len()]; + for result in reader.records() { let record = result.with_context(|| "Failed to read CSV record")?; let mut map = serde_json::Map::new(); @@ -48,12 +65,28 @@ pub fn parse_csv(path: &Path) -> Result> { } } + // Short row: keep the trailing header columns (as null) rather than + // silently dropping them, and remember which columns were affected. + for (i, header) in headers.iter().enumerate().skip(record.len()) { + map.insert(header.clone(), Value::Null); + missing_counts[i] += 1; + } + if !map.is_empty() { records.push(Value::Object(map)); } } - Ok(records) + let warnings = headers + .iter() + .zip(&missing_counts) + .filter(|&(_, &count)| count > 0) + .map(|(header, count)| { + format!("column '{header}' missing from {count} row(s); filled with empty values") + }) + .collect(); + + Ok((records, warnings)) } /// Parse multiple CSV files and merge into a single JSON array. @@ -270,6 +303,46 @@ mod tests { ); } + /// BUG (improvise-k8i): the reader is configured with `.flexible(true)`, + /// so rows shorter than the header parsed without error — but the missing + /// trailing columns were simply absent from the record map. No error, no + /// warning, and field-kind analysis saw skewed value counts. Short rows + /// must keep every header column (missing ones as Null), never silently + /// drop them. + #[test] + fn short_rows_pad_missing_trailing_columns_with_null() { + let (path, _dir) = create_temp_csv("A,B,C,D,E\n1,2,3,4,5\n1,2,3,4\n1,2,3,4"); + let records = parse_csv(&path).unwrap(); + + assert_eq!(records.len(), 3); + let short = records[1].as_object().unwrap(); + assert!( + short.contains_key("E"), + "missing trailing column must be present (as null), not silently dropped" + ); + assert_eq!(short["E"], Value::Null); + } + + /// Companion to the test above (improvise-k8i): the warning must name the + /// affected column and say how many rows were short. + #[test] + fn short_rows_produce_warning_naming_column_and_count() { + let (path, _dir) = create_temp_csv("A,B,C,D,E\n1,2,3,4,5\n1,2,3,4\n1,2,3,4"); + let (records, warnings) = parse_csv_with_warnings(&path).unwrap(); + + assert_eq!(records.len(), 3); + assert_eq!(warnings.len(), 1); + assert!(warnings[0].contains("'E'"), "warning names the column: {}", warnings[0]); + assert!(warnings[0].contains('2'), "warning counts affected rows: {}", warnings[0]); + } + + #[test] + fn full_rows_produce_no_warnings() { + let (path, _dir) = create_temp_csv("A,B\n1,2\n3,4"); + let (_, warnings) = parse_csv_with_warnings(&path).unwrap(); + assert!(warnings.is_empty()); + } + #[test] fn parse_checking_csv_format() { // Simulates the format of /Users/edwlan/Downloads/Checking1.csv diff --git a/src/ui/effect.rs b/src/ui/effect.rs index 8873311..6e94dae 100644 --- a/src/ui/effect.rs +++ b/src/ui/effect.rs @@ -768,6 +768,33 @@ impl Effect for LoadModel { } } +/// Extract the records array from an already-parsed JSON import value. +/// Precedence: explicit `array_path` → root array → first auto-detected +/// array path. Pure — lets [`ImportJsonHeadless`] parse the file exactly +/// once and reuse the parsed value as the pipeline's `raw` (improvise-oaq). +fn json_import_records( + value: &serde_json::Value, + array_path: Option<&str>, +) -> Result, String> { + use crate::import::analyzer::{extract_array_at_path, find_array_paths}; + + if let Some(ap) = array_path.filter(|s| !s.is_empty()) { + return extract_array_at_path(value, ap) + .cloned() + .ok_or_else(|| format!("No array at path '{ap}'")); + } + if let Some(arr) = value.as_array() { + return Ok(arr.clone()); + } + let paths = find_array_paths(value); + match paths.first() { + Some(first) => extract_array_at_path(value, first) + .cloned() + .ok_or_else(|| "Could not extract records array".to_string()), + None => Err("No array found in JSON".to_string()), + } +} + /// Headless JSON/CSV import: read file, analyze, build model, replace current. #[derive(Debug)] pub struct ImportJsonHeadless { @@ -777,9 +804,7 @@ pub struct ImportJsonHeadless { } impl Effect for ImportJsonHeadless { fn apply(&self, app: &mut App) { - use crate::import::analyzer::{ - FieldKind, analyze_records, extract_array_at_path, find_array_paths, - }; + use crate::import::analyzer::{FieldKind, analyze_records}; use crate::import::wizard::ImportPipeline; let is_csv = self @@ -787,9 +812,17 @@ impl Effect for ImportJsonHeadless { .extension() .is_some_and(|ext| ext.eq_ignore_ascii_case("csv")); - let records = if is_csv { - match crate::import::csv_parser::parse_csv(&self.path) { - Ok(recs) => recs, + // Parse the file exactly once: `records` feeds analysis, `raw` is the + // parsed value reused for the pipeline (improvise-oaq). CSV warnings + // (short rows, improvise-k8i) are threaded into the status message. + let mut warnings: Vec = vec![]; + let (records, raw) = if is_csv { + match crate::import::csv_parser::parse_csv_with_warnings(&self.path) { + Ok((recs, w)) => { + warnings = w; + let raw = serde_json::Value::Array(recs.clone()); + (recs, raw) + } Err(e) => { app.view_state.status_msg = format!("CSV error: {e}"); return; @@ -810,29 +843,10 @@ impl Effect for ImportJsonHeadless { return; } }; - - if let Some(ap) = self.array_path.as_deref().filter(|s| !s.is_empty()) { - match extract_array_at_path(&value, ap) { - Some(arr) => arr.clone(), - None => { - app.view_state.status_msg = format!("No array at path '{ap}'"); - return; - } - } - } else if let Some(arr) = value.as_array() { - arr.clone() - } else { - let paths = find_array_paths(&value); - if let Some(first) = paths.first() { - match extract_array_at_path(&value, first) { - Some(arr) => arr.clone(), - None => { - app.view_state.status_msg = "Could not extract records array".to_string(); - return; - } - } - } else { - app.view_state.status_msg = "No array found in JSON".to_string(); + match json_import_records(&value, self.array_path.as_deref()) { + Ok(recs) => (recs, value), + Err(msg) => { + app.view_state.status_msg = msg; return; } } @@ -840,13 +854,6 @@ impl Effect for ImportJsonHeadless { let proposals = analyze_records(&records); - let raw = if is_csv { - serde_json::Value::Array(records.clone()) - } else { - serde_json::from_str(&std::fs::read_to_string(&self.path).unwrap_or_default()) - .unwrap_or(serde_json::Value::Array(records.clone())) - }; - let pipeline = ImportPipeline { raw, array_paths: vec![], @@ -870,7 +877,11 @@ impl Effect for ImportJsonHeadless { match pipeline.build_model() { Ok(new_workbook) => { app.model_state.workbook = new_workbook; - app.view_state.status_msg = "Imported successfully".to_string(); + app.view_state.status_msg = if warnings.is_empty() { + "Imported successfully".to_string() + } else { + format!("Imported with warnings: {}", warnings.join("; ")) + }; } Err(e) => { app.view_state.status_msg = format!("Import error: {e}"); @@ -1857,4 +1868,83 @@ mod tests { assert_eq!(Panel::Category.mode(), AppMode::CategoryPanel); assert_eq!(Panel::View.mode(), AppMode::ViewPanel); } + + // ── Headless import ───────────────────────────────────────────────── + + /// BUG (improvise-k8i): ImportJsonHeadless parsed CSVs with `parse_csv`, + /// which sends short-row warnings to stderr — invisible inside the TUI. + /// Short rows must surface in the status message, naming the affected + /// column, never silently succeed. + #[test] + fn import_headless_csv_short_rows_surface_warning_in_status() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("data.csv"); + std::fs::write( + &path, + "Region,Product,Revenue\nEast,Shirts,100\nWest,Pants,200\nEast,Pants\n", + ) + .unwrap(); + + let mut app = test_app(); + ImportJsonHeadless { + path, + model_name: None, + array_path: None, + } + .apply(&mut app); + + assert!( + app.view_state.status_msg.contains("Revenue"), + "status must name the short column, got: {}", + app.view_state.status_msg + ); + } + + #[test] + fn import_headless_csv_full_rows_report_plain_success() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("data.csv"); + std::fs::write(&path, "Region,Revenue\nEast,100\nWest,200\n").unwrap(); + + let mut app = test_app(); + ImportJsonHeadless { + path, + model_name: None, + array_path: None, + } + .apply(&mut app); + + assert_eq!(app.view_state.status_msg, "Imported successfully"); + } + + // ── json_import_records (improvise-oaq helper) ────────────────────── + + #[test] + fn json_import_records_root_array() { + let value = serde_json::json!([{"x": 1}, {"x": 2}]); + let recs = json_import_records(&value, None).unwrap(); + assert_eq!(recs.len(), 2); + } + + #[test] + fn json_import_records_explicit_path() { + let value = serde_json::json!({"data": [{"x": 1}], "meta": {"v": 1}}); + let recs = json_import_records(&value, Some("data")).unwrap(); + assert_eq!(recs.len(), 1); + } + + #[test] + fn json_import_records_auto_detects_single_path() { + let value = serde_json::json!({"rows": [{"x": 1}, {"x": 2}, {"x": 3}]}); + let recs = json_import_records(&value, None).unwrap(); + assert_eq!(recs.len(), 3); + } + + #[test] + fn json_import_records_errors_when_no_array() { + let value = serde_json::json!({"meta": {"v": 1}}); + assert!(json_import_records(&value, None).is_err()); + let value = serde_json::json!([{"x": 1}]); + assert!(json_import_records(&value, Some("nope")).is_err()); + } }