feat(io): enhance CSV import with warnings and improved headless parsing

`parse_csv` now delegates to the new `parse_csv_with_warnings`, which
detects and reports short rows.

Short rows now keep every header column: missing trailing fields are
padded with `Value::Null` instead of being silently dropped.

`ImportJsonHeadless` now uses `parse_csv_with_warnings` and surfaces
warnings in the status message.

`ImportJsonHeadless` now reuses parsed JSON/CSV data via
`json_import_records` instead of re-parsing.

Add regression tests for short row handling and headless import.

Co-Authored-By: fiddlerwoaroof/git-smart-commit (unsloth/gemma-4-26B-A4B-it-GGUF:UD-Q5_K_XL)
This commit is contained in:
Edward Langley
2026-06-09 21:43:13 -07:00
parent f04fe517ae
commit f0b9227d8f
2 changed files with 203 additions and 40 deletions
+76 -3
View File
@@ -9,8 +9,22 @@ pub fn csv_path_p(path: &Path) -> bool {
.is_some_and(|ext| ext.eq_ignore_ascii_case("csv"))
}
/// Parse a CSV file and return records as serde_json::Value array
/// Parse a CSV file and return records as serde_json::Value array.
/// Warnings about short rows are printed to stderr; use
/// [`parse_csv_with_warnings`] to capture them instead.
pub fn parse_csv(path: &Path) -> Result<Vec<Value>> {
    // Delegate the actual parsing; this wrapper only decides what to do with
    // the collected warnings (echo each one to stderr, then discard them).
    let (parsed, notes) = parse_csv_with_warnings(path)?;
    notes
        .iter()
        .for_each(|note| eprintln!("warning: {note}"));
    Ok(parsed)
}
/// Parse a CSV file, returning records plus warnings.
/// Rows shorter than the header keep every header column — missing trailing
/// fields are filled with `Value::Null` — and each affected column is
/// reported as a warning with the number of rows it was missing from.
pub fn parse_csv_with_warnings(path: &Path) -> Result<(Vec<Value>, Vec<String>)> {
let mut reader = ReaderBuilder::new()
.has_headers(true)
.flexible(true)
@@ -22,7 +36,7 @@ pub fn parse_csv(path: &Path) -> Result<Vec<Value>> {
let has_headers = reader.headers().is_ok();
let mut records = Vec::new();
let mut headers = Vec::new();
let mut headers: Vec<String> = Vec::new();
if has_headers {
headers = reader
@@ -33,6 +47,9 @@ pub fn parse_csv(path: &Path) -> Result<Vec<Value>> {
.collect();
}
// Per-column count of rows that were too short to provide a value.
let mut missing_counts = vec![0usize; headers.len()];
for result in reader.records() {
let record = result.with_context(|| "Failed to read CSV record")?;
let mut map = serde_json::Map::new();
@@ -48,12 +65,28 @@ pub fn parse_csv(path: &Path) -> Result<Vec<Value>> {
}
}
// Short row: keep the trailing header columns (as null) rather than
// silently dropping them, and remember which columns were affected.
for (i, header) in headers.iter().enumerate().skip(record.len()) {
map.insert(header.clone(), Value::Null);
missing_counts[i] += 1;
}
if !map.is_empty() {
records.push(Value::Object(map));
}
}
Ok(records)
let warnings = headers
.iter()
.zip(&missing_counts)
.filter(|&(_, &count)| count > 0)
.map(|(header, count)| {
format!("column '{header}' missing from {count} row(s); filled with empty values")
})
.collect();
Ok((records, warnings))
}
/// Parse multiple CSV files and merge into a single JSON array.
@@ -270,6 +303,46 @@ mod tests {
);
}
/// BUG (improvise-k8i): the reader is configured with `.flexible(true)`,
/// so rows shorter than the header parsed without error — but the missing
/// trailing columns were simply absent from the record map. No error, no
/// warning, and field-kind analysis saw skewed value counts. Short rows
/// must keep every header column (missing ones as Null), never silently
/// drop them.
#[test]
fn short_rows_pad_missing_trailing_columns_with_null() {
    // Five header columns; the last two data rows supply only four fields.
    // The second tuple element stays bound for the whole test.
    let (csv_path, _guard) = create_temp_csv("A,B,C,D,E\n1,2,3,4,5\n1,2,3,4\n1,2,3,4");
    let parsed = parse_csv(&csv_path).unwrap();
    assert_eq!(parsed.len(), 3);

    // Inspect the first short row (index 1): column E must exist and be null.
    let short_row = parsed[1].as_object().unwrap();
    assert!(
        short_row.contains_key("E"),
        "missing trailing column must be present (as null), not silently dropped"
    );
    assert_eq!(short_row["E"], Value::Null);
}
/// Companion to the test above (improvise-k8i): the warning must name the
/// affected column and say how many rows were short.
#[test]
fn short_rows_produce_warning_naming_column_and_count() {
    // Same fixture as the padding test: two of three rows lack column E.
    let (csv_path, _guard) = create_temp_csv("A,B,C,D,E\n1,2,3,4,5\n1,2,3,4\n1,2,3,4");
    let (parsed, notes) = parse_csv_with_warnings(&csv_path).unwrap();
    assert_eq!(parsed.len(), 3);
    assert_eq!(notes.len(), 1);

    // Exactly one column was affected, so there is a single warning to check.
    let only = &notes[0];
    assert!(only.contains("'E'"), "warning names the column: {}", only);
    assert!(only.contains('2'), "warning counts affected rows: {}", only);
}
#[test]
fn full_rows_produce_no_warnings() {
    // Every data row matches the two-column header, so nothing is padded.
    let (csv_path, _guard) = create_temp_csv("A,B\n1,2\n3,4");
    let notes = parse_csv_with_warnings(&csv_path).unwrap().1;
    assert!(notes.is_empty());
}
#[test]
fn parse_checking_csv_format() {
// Simulates the format of /Users/edwlan/Downloads/Checking1.csv