feat(io): enhance CSV import with warnings and improved headless parsing
`parse_csv` now supports `parse_csv_with_warnings` to detect and report short rows. Short rows are now padded with `Value::Null` instead of being silently dropped. `ImportJsonHeadless` now uses `parse_csv_with_warnings` and surfaces warnings in the status message. `ImportJsonHeadless` now reuses parsed JSON/CSV data via `json_import_records` instead of re-parsing. Add regression tests for short row handling and headless import. Co-Authored-By: fiddlerwoaroof/git-smart-commit (unsloth/gemma-4-26B-A4B-it-GGUF:UD-Q5_K_XL)
This commit is contained in:
@@ -9,8 +9,22 @@ pub fn csv_path_p(path: &Path) -> bool {
|
||||
.is_some_and(|ext| ext.eq_ignore_ascii_case("csv"))
|
||||
}
|
||||
|
||||
/// Parse a CSV file and return records as serde_json::Value array
|
||||
/// Parse a CSV file and return records as serde_json::Value array.
|
||||
/// Warnings about short rows are printed to stderr; use
|
||||
/// [`parse_csv_with_warnings`] to capture them instead.
|
||||
pub fn parse_csv(path: &Path) -> Result<Vec<Value>> {
|
||||
let (records, warnings) = parse_csv_with_warnings(path)?;
|
||||
for warning in &warnings {
|
||||
eprintln!("warning: {warning}");
|
||||
}
|
||||
Ok(records)
|
||||
}
|
||||
|
||||
/// Parse a CSV file, returning records plus warnings.
|
||||
/// Rows shorter than the header keep every header column — missing trailing
|
||||
/// fields are filled with `Value::Null` — and each affected column is
|
||||
/// reported as a warning with the number of rows it was missing from.
|
||||
pub fn parse_csv_with_warnings(path: &Path) -> Result<(Vec<Value>, Vec<String>)> {
|
||||
let mut reader = ReaderBuilder::new()
|
||||
.has_headers(true)
|
||||
.flexible(true)
|
||||
@@ -22,7 +36,7 @@ pub fn parse_csv(path: &Path) -> Result<Vec<Value>> {
|
||||
let has_headers = reader.headers().is_ok();
|
||||
|
||||
let mut records = Vec::new();
|
||||
let mut headers = Vec::new();
|
||||
let mut headers: Vec<String> = Vec::new();
|
||||
|
||||
if has_headers {
|
||||
headers = reader
|
||||
@@ -33,6 +47,9 @@ pub fn parse_csv(path: &Path) -> Result<Vec<Value>> {
|
||||
.collect();
|
||||
}
|
||||
|
||||
// Per-column count of rows that were too short to provide a value.
|
||||
let mut missing_counts = vec![0usize; headers.len()];
|
||||
|
||||
for result in reader.records() {
|
||||
let record = result.with_context(|| "Failed to read CSV record")?;
|
||||
let mut map = serde_json::Map::new();
|
||||
@@ -48,12 +65,28 @@ pub fn parse_csv(path: &Path) -> Result<Vec<Value>> {
|
||||
}
|
||||
}
|
||||
|
||||
// Short row: keep the trailing header columns (as null) rather than
|
||||
// silently dropping them, and remember which columns were affected.
|
||||
for (i, header) in headers.iter().enumerate().skip(record.len()) {
|
||||
map.insert(header.clone(), Value::Null);
|
||||
missing_counts[i] += 1;
|
||||
}
|
||||
|
||||
if !map.is_empty() {
|
||||
records.push(Value::Object(map));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(records)
|
||||
let warnings = headers
|
||||
.iter()
|
||||
.zip(&missing_counts)
|
||||
.filter(|&(_, &count)| count > 0)
|
||||
.map(|(header, count)| {
|
||||
format!("column '{header}' missing from {count} row(s); filled with empty values")
|
||||
})
|
||||
.collect();
|
||||
|
||||
Ok((records, warnings))
|
||||
}
|
||||
|
||||
/// Parse multiple CSV files and merge into a single JSON array.
|
||||
@@ -270,6 +303,46 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
/// BUG (improvise-k8i): the reader is configured with `.flexible(true)`,
|
||||
/// so rows shorter than the header parsed without error — but the missing
|
||||
/// trailing columns were simply absent from the record map. No error, no
|
||||
/// warning, and field-kind analysis saw skewed value counts. Short rows
|
||||
/// must keep every header column (missing ones as Null), never silently
|
||||
/// drop them.
|
||||
#[test]
|
||||
fn short_rows_pad_missing_trailing_columns_with_null() {
|
||||
let (path, _dir) = create_temp_csv("A,B,C,D,E\n1,2,3,4,5\n1,2,3,4\n1,2,3,4");
|
||||
let records = parse_csv(&path).unwrap();
|
||||
|
||||
assert_eq!(records.len(), 3);
|
||||
let short = records[1].as_object().unwrap();
|
||||
assert!(
|
||||
short.contains_key("E"),
|
||||
"missing trailing column must be present (as null), not silently dropped"
|
||||
);
|
||||
assert_eq!(short["E"], Value::Null);
|
||||
}
|
||||
|
||||
/// Companion to the test above (improvise-k8i): the warning must name the
|
||||
/// affected column and say how many rows were short.
|
||||
#[test]
|
||||
fn short_rows_produce_warning_naming_column_and_count() {
|
||||
let (path, _dir) = create_temp_csv("A,B,C,D,E\n1,2,3,4,5\n1,2,3,4\n1,2,3,4");
|
||||
let (records, warnings) = parse_csv_with_warnings(&path).unwrap();
|
||||
|
||||
assert_eq!(records.len(), 3);
|
||||
assert_eq!(warnings.len(), 1);
|
||||
assert!(warnings[0].contains("'E'"), "warning names the column: {}", warnings[0]);
|
||||
assert!(warnings[0].contains('2'), "warning counts affected rows: {}", warnings[0]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn full_rows_produce_no_warnings() {
|
||||
let (path, _dir) = create_temp_csv("A,B\n1,2\n3,4");
|
||||
let (_, warnings) = parse_csv_with_warnings(&path).unwrap();
|
||||
assert!(warnings.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_checking_csv_format() {
|
||||
// Simulates the format of /Users/edwlan/Downloads/Checking1.csv
|
||||
|
||||
+127
-37
@@ -768,6 +768,33 @@ impl Effect for LoadModel {
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract the records array from an already-parsed JSON import value.
|
||||
/// Precedence: explicit `array_path` → root array → first auto-detected
|
||||
/// array path. Pure — lets [`ImportJsonHeadless`] parse the file exactly
|
||||
/// once and reuse the parsed value as the pipeline's `raw` (improvise-oaq).
|
||||
fn json_import_records(
|
||||
value: &serde_json::Value,
|
||||
array_path: Option<&str>,
|
||||
) -> Result<Vec<serde_json::Value>, String> {
|
||||
use crate::import::analyzer::{extract_array_at_path, find_array_paths};
|
||||
|
||||
if let Some(ap) = array_path.filter(|s| !s.is_empty()) {
|
||||
return extract_array_at_path(value, ap)
|
||||
.cloned()
|
||||
.ok_or_else(|| format!("No array at path '{ap}'"));
|
||||
}
|
||||
if let Some(arr) = value.as_array() {
|
||||
return Ok(arr.clone());
|
||||
}
|
||||
let paths = find_array_paths(value);
|
||||
match paths.first() {
|
||||
Some(first) => extract_array_at_path(value, first)
|
||||
.cloned()
|
||||
.ok_or_else(|| "Could not extract records array".to_string()),
|
||||
None => Err("No array found in JSON".to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Headless JSON/CSV import: read file, analyze, build model, replace current.
|
||||
#[derive(Debug)]
|
||||
pub struct ImportJsonHeadless {
|
||||
@@ -777,9 +804,7 @@ pub struct ImportJsonHeadless {
|
||||
}
|
||||
impl Effect for ImportJsonHeadless {
|
||||
fn apply(&self, app: &mut App) {
|
||||
use crate::import::analyzer::{
|
||||
FieldKind, analyze_records, extract_array_at_path, find_array_paths,
|
||||
};
|
||||
use crate::import::analyzer::{FieldKind, analyze_records};
|
||||
use crate::import::wizard::ImportPipeline;
|
||||
|
||||
let is_csv = self
|
||||
@@ -787,9 +812,17 @@ impl Effect for ImportJsonHeadless {
|
||||
.extension()
|
||||
.is_some_and(|ext| ext.eq_ignore_ascii_case("csv"));
|
||||
|
||||
let records = if is_csv {
|
||||
match crate::import::csv_parser::parse_csv(&self.path) {
|
||||
Ok(recs) => recs,
|
||||
// Parse the file exactly once: `records` feeds analysis, `raw` is the
|
||||
// parsed value reused for the pipeline (improvise-oaq). CSV warnings
|
||||
// (short rows, improvise-k8i) are threaded into the status message.
|
||||
let mut warnings: Vec<String> = vec![];
|
||||
let (records, raw) = if is_csv {
|
||||
match crate::import::csv_parser::parse_csv_with_warnings(&self.path) {
|
||||
Ok((recs, w)) => {
|
||||
warnings = w;
|
||||
let raw = serde_json::Value::Array(recs.clone());
|
||||
(recs, raw)
|
||||
}
|
||||
Err(e) => {
|
||||
app.view_state.status_msg = format!("CSV error: {e}");
|
||||
return;
|
||||
@@ -810,29 +843,10 @@ impl Effect for ImportJsonHeadless {
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(ap) = self.array_path.as_deref().filter(|s| !s.is_empty()) {
|
||||
match extract_array_at_path(&value, ap) {
|
||||
Some(arr) => arr.clone(),
|
||||
None => {
|
||||
app.view_state.status_msg = format!("No array at path '{ap}'");
|
||||
return;
|
||||
}
|
||||
}
|
||||
} else if let Some(arr) = value.as_array() {
|
||||
arr.clone()
|
||||
} else {
|
||||
let paths = find_array_paths(&value);
|
||||
if let Some(first) = paths.first() {
|
||||
match extract_array_at_path(&value, first) {
|
||||
Some(arr) => arr.clone(),
|
||||
None => {
|
||||
app.view_state.status_msg = "Could not extract records array".to_string();
|
||||
return;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
app.view_state.status_msg = "No array found in JSON".to_string();
|
||||
match json_import_records(&value, self.array_path.as_deref()) {
|
||||
Ok(recs) => (recs, value),
|
||||
Err(msg) => {
|
||||
app.view_state.status_msg = msg;
|
||||
return;
|
||||
}
|
||||
}
|
||||
@@ -840,13 +854,6 @@ impl Effect for ImportJsonHeadless {
|
||||
|
||||
let proposals = analyze_records(&records);
|
||||
|
||||
let raw = if is_csv {
|
||||
serde_json::Value::Array(records.clone())
|
||||
} else {
|
||||
serde_json::from_str(&std::fs::read_to_string(&self.path).unwrap_or_default())
|
||||
.unwrap_or(serde_json::Value::Array(records.clone()))
|
||||
};
|
||||
|
||||
let pipeline = ImportPipeline {
|
||||
raw,
|
||||
array_paths: vec![],
|
||||
@@ -870,7 +877,11 @@ impl Effect for ImportJsonHeadless {
|
||||
match pipeline.build_model() {
|
||||
Ok(new_workbook) => {
|
||||
app.model_state.workbook = new_workbook;
|
||||
app.view_state.status_msg = "Imported successfully".to_string();
|
||||
app.view_state.status_msg = if warnings.is_empty() {
|
||||
"Imported successfully".to_string()
|
||||
} else {
|
||||
format!("Imported with warnings: {}", warnings.join("; "))
|
||||
};
|
||||
}
|
||||
Err(e) => {
|
||||
app.view_state.status_msg = format!("Import error: {e}");
|
||||
@@ -1857,4 +1868,83 @@ mod tests {
|
||||
assert_eq!(Panel::Category.mode(), AppMode::CategoryPanel);
|
||||
assert_eq!(Panel::View.mode(), AppMode::ViewPanel);
|
||||
}
|
||||
|
||||
// ── Headless import ─────────────────────────────────────────────────
|
||||
|
||||
/// BUG (improvise-k8i): ImportJsonHeadless parsed CSVs with `parse_csv`,
|
||||
/// which sends short-row warnings to stderr — invisible inside the TUI.
|
||||
/// Short rows must surface in the status message, naming the affected
|
||||
/// column, never silently succeed.
|
||||
#[test]
|
||||
fn import_headless_csv_short_rows_surface_warning_in_status() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let path = dir.path().join("data.csv");
|
||||
std::fs::write(
|
||||
&path,
|
||||
"Region,Product,Revenue\nEast,Shirts,100\nWest,Pants,200\nEast,Pants\n",
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let mut app = test_app();
|
||||
ImportJsonHeadless {
|
||||
path,
|
||||
model_name: None,
|
||||
array_path: None,
|
||||
}
|
||||
.apply(&mut app);
|
||||
|
||||
assert!(
|
||||
app.view_state.status_msg.contains("Revenue"),
|
||||
"status must name the short column, got: {}",
|
||||
app.view_state.status_msg
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn import_headless_csv_full_rows_report_plain_success() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let path = dir.path().join("data.csv");
|
||||
std::fs::write(&path, "Region,Revenue\nEast,100\nWest,200\n").unwrap();
|
||||
|
||||
let mut app = test_app();
|
||||
ImportJsonHeadless {
|
||||
path,
|
||||
model_name: None,
|
||||
array_path: None,
|
||||
}
|
||||
.apply(&mut app);
|
||||
|
||||
assert_eq!(app.view_state.status_msg, "Imported successfully");
|
||||
}
|
||||
|
||||
// ── json_import_records (improvise-oaq helper) ──────────────────────
|
||||
|
||||
#[test]
|
||||
fn json_import_records_root_array() {
|
||||
let value = serde_json::json!([{"x": 1}, {"x": 2}]);
|
||||
let recs = json_import_records(&value, None).unwrap();
|
||||
assert_eq!(recs.len(), 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn json_import_records_explicit_path() {
|
||||
let value = serde_json::json!({"data": [{"x": 1}], "meta": {"v": 1}});
|
||||
let recs = json_import_records(&value, Some("data")).unwrap();
|
||||
assert_eq!(recs.len(), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn json_import_records_auto_detects_single_path() {
|
||||
let value = serde_json::json!({"rows": [{"x": 1}, {"x": 2}, {"x": 3}]});
|
||||
let recs = json_import_records(&value, None).unwrap();
|
||||
assert_eq!(recs.len(), 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn json_import_records_errors_when_no_array() {
|
||||
let value = serde_json::json!({"meta": {"v": 1}});
|
||||
assert!(json_import_records(&value, None).is_err());
|
||||
let value = serde_json::json!([{"x": 1}]);
|
||||
assert!(json_import_records(&value, Some("nope")).is_err());
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user