Files
improvise/src/import/analyzer.rs
Edward Langley a73fe160c7 feat: date parsing, component extraction, and wizard formulas
Extend FieldProposal with chrono-based date format detection and
configurable component extraction (Year, Month, Quarter). Add
ConfigureDates and DefineFormulas wizard steps to ImportPipeline.
build_model injects derived date categories and parses formula strings.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-03 13:41:05 -07:00

293 lines
9.0 KiB
Rust

use chrono::{Datelike, NaiveDate};
use serde_json::Value;
use std::collections::HashSet;
/// How an imported field is proposed to be used.
#[derive(Debug, Clone, PartialEq)]
pub enum FieldKind {
    /// A string field with few distinct values; proposed as a dimension/category.
    Category,
    /// A field whose values are all numeric; proposed as a measure.
    Measure,
    /// A string field whose values parse as dates; proposed as a time category.
    TimeCategory,
    /// A field with many unique strings (IDs, names) or mixed value types;
    /// proposed as a label/identifier.
    Label,
}
/// A part of a calendar date that can be pulled out of a date field.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DateComponent {
    /// The calendar year.
    Year,
    /// The calendar month.
    Month,
    /// The calendar quarter.
    Quarter,
}
/// The analyzer's suggestion for how one field should be imported.
#[derive(Debug, Clone)]
pub struct FieldProposal {
    /// Name of the field (the JSON object key it was found under).
    pub field: String,
    /// Proposed role of the field.
    pub kind: FieldKind,
    /// Distinct string values observed for the field. The analyzer leaves this
    /// empty for measures and for fields with mixed value types.
    pub distinct_values: Vec<String>,
    /// Whether the proposal is accepted. The analyzer starts labels as not
    /// accepted and every other kind as accepted.
    // NOTE(review): presumably toggled by the import wizard — confirm against callers.
    pub accepted: bool,
    /// Detected chrono format string (e.g., "%m/%d/%Y"). Only set for TimeCategory.
    pub date_format: Option<String>,
    /// Date components to extract as new categories. The analyzer always
    /// leaves this empty.
    // NOTE(review): presumably filled in by a later configuration step — confirm.
    pub date_components: Vec<DateComponent>,
}
impl FieldProposal {
    /// Human-readable description of this proposal's [`FieldKind`].
    ///
    /// Returns a fixed string per kind; the label kind is marked "(skip)".
    pub fn kind_label(&self) -> &'static str {
        let label: &'static str = match &self.kind {
            FieldKind::Measure => "Measure (numeric)",
            FieldKind::Category => "Category (dimension)",
            FieldKind::Label => "Label/Identifier (skip)",
            FieldKind::TimeCategory => "Time Category",
        };
        label
    }
}
/// Common date formats to try, in order of preference.
///
/// Detection returns the first entry that parses every sample, so the order
/// is significant wherever two formats can accept the same text:
///
/// * `%m/%d/%y` comes before `%m/%d/%Y`. chrono's `%Y` also accepts a
///   two-digit year (reading `04/02/25` as year 25), while `%y` rejects a
///   four-digit year because of the leftover digits. Trying the short form
///   first lets each input land on the format that interprets it sensibly.
/// * `%m/%d/%Y` comes before `%d/%m/%Y`, so dates that are valid either way
///   are read month-first.
const DATE_FORMATS: &[&str] = &[
    "%Y-%m-%d",  // 2025-04-02
    "%m/%d/%y",  // 04/02/25
    "%m/%d/%Y",  // 04/02/2025
    "%d/%m/%Y",  // 02/04/2025
    "%Y%m%d",    // 20250402
    "%b %d, %Y", // Apr 02, 2025
    "%B %d, %Y", // April 02, 2025
    "%d-%b-%Y",  // 02-Apr-2025
];
/// Try to detect a chrono date format from sample values.
///
/// Empty strings are ignored. Returns the first entry of `DATE_FORMATS` that
/// successfully parses *every* non-empty sample, or `None` when there are no
/// non-empty samples or no single format fits them all.
///
/// Every sample is checked rather than a leading subset. A value that no
/// format can parse, or a date that only a later format accepts (for example
/// a day-first date following several ambiguous ones), therefore affects the
/// result wherever it sits in `samples`. The `all` check stops at the first
/// failure, so non-date input is still rejected quickly.
pub fn detect_date_format(samples: &[&str]) -> Option<String> {
    // With nothing but empty strings there is no evidence for any format.
    if samples.iter().all(|sample| sample.is_empty()) {
        return None;
    }

    DATE_FORMATS
        .iter()
        .find(|fmt| {
            samples
                .iter()
                .filter(|sample| !sample.is_empty())
                .all(|sample| NaiveDate::parse_from_str(sample, fmt).is_ok())
        })
        .map(|fmt| (*fmt).to_owned())
}
/// Parse `value` with the chrono `format` string and render one component of it.
///
/// The rendered forms are:
///
/// * [`DateComponent::Year`] — the year, e.g. `2026`
/// * [`DateComponent::Month`] — year and month, e.g. `2026-03`
/// * [`DateComponent::Quarter`] — year and quarter, e.g. `2026-Q1`
///
/// Returns `None` when `value` does not parse with `format`.
pub fn extract_date_component(
    value: &str,
    format: &str,
    component: DateComponent,
) -> Option<String> {
    let parsed = match NaiveDate::parse_from_str(value, format) {
        Ok(date) => date,
        Err(_) => return None,
    };

    let rendered = match component {
        DateComponent::Year => parsed.format("%Y").to_string(),
        DateComponent::Month => parsed.format("%Y-%m").to_string(),
        DateComponent::Quarter => {
            // month0() is zero-based, so months 0..=2 fall in quarter 1, and so on.
            let quarter = parsed.month0() / 3 + 1;
            format!("{}-Q{}", parsed.format("%Y"), quarter)
        }
    };
    Some(rendered)
}
/// Largest number of distinct values an all-string, non-date field may have
/// and still be proposed as a category; fields with more distinct values are
/// proposed as labels instead.
const CATEGORY_THRESHOLD: usize = 20;
pub fn analyze_records(records: &[Value]) -> Vec<FieldProposal> {
if records.is_empty() {
return vec![];
}
// Collect all field names
let mut fields: Vec<String> = Vec::new();
for record in records {
if let Value::Object(map) = record {
for key in map.keys() {
if !fields.contains(key) {
fields.push(key.clone());
}
}
}
}
fields
.into_iter()
.map(|field| {
let values: Vec<&Value> = records.iter().filter_map(|r| r.get(&field)).collect();
let all_numeric = values.iter().all(|v| v.is_number());
let all_string = values.iter().all(|v| v.is_string());
if all_numeric {
return FieldProposal {
field,
kind: FieldKind::Measure,
distinct_values: vec![],
accepted: true,
date_format: None,
date_components: vec![],
};
}
if all_string {
let distinct: HashSet<&str> = values.iter().filter_map(|v| v.as_str()).collect();
let distinct_vec: Vec<String> = distinct.into_iter().map(String::from).collect();
let n = distinct_vec.len();
// Try chrono-based date detection
let samples: Vec<&str> = distinct_vec.iter().map(|s| s.as_str()).collect();
let date_format = detect_date_format(&samples);
if date_format.is_some() {
return FieldProposal {
field,
kind: FieldKind::TimeCategory,
distinct_values: distinct_vec,
accepted: true,
date_format,
date_components: vec![],
};
}
if n <= CATEGORY_THRESHOLD {
return FieldProposal {
field,
kind: FieldKind::Category,
distinct_values: distinct_vec,
accepted: true,
date_format: None,
date_components: vec![],
};
}
return FieldProposal {
field,
kind: FieldKind::Label,
distinct_values: distinct_vec,
accepted: false,
date_format: None,
date_components: vec![],
};
}
// Mixed or other: treat as label
FieldProposal {
field,
kind: FieldKind::Label,
distinct_values: vec![],
accepted: false,
date_format: None,
date_components: vec![],
}
})
.collect()
}
/// Look up the JSON array located at a dot-separated `path` inside `value`.
///
/// An empty `path` refers to `value` itself. Otherwise each `.`-separated
/// segment is used as a key for the next lookup, so keys that themselves
/// contain a `.` cannot be addressed. Returns `None` if any segment is
/// missing or the value reached is not an array.
pub fn extract_array_at_path<'a>(value: &'a Value, path: &str) -> Option<&'a Vec<Value>> {
    let target = if path.is_empty() {
        value
    } else {
        path.split('.')
            .try_fold(value, |node, segment| node.get(segment))?
    };
    target.as_array()
}
/// List the dot-separated paths of every array reachable through nested objects.
///
/// A top-level array is reported as the empty path. Arrays are not descended
/// into, so anything nested inside an array is not reported.
pub fn find_array_paths(value: &Value) -> Vec<String> {
    let mut collected: Vec<String> = Vec::new();
    find_array_paths_inner(value, "", &mut collected);
    collected
}
fn find_array_paths_inner(value: &Value, prefix: &str, paths: &mut Vec<String>) {
match value {
Value::Array(_) => {
paths.push(prefix.to_string());
}
Value::Object(map) => {
for (key, val) in map {
let path = if prefix.is_empty() {
key.clone()
} else {
format!("{prefix}.{key}")
};
find_array_paths_inner(val, &path, paths);
}
}
_ => {}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    // --- detect_date_format ---

    #[test]
    fn detect_iso_date_format() {
        let samples = ["2025-01-15", "2025-02-28", "2024-12-01"];
        assert_eq!(detect_date_format(&samples), Some("%Y-%m-%d".to_string()));
    }

    #[test]
    fn detect_us_date_format() {
        let samples = ["03/31/2026", "01/15/2025", "12/25/2024"];
        assert_eq!(detect_date_format(&samples), Some("%m/%d/%Y".to_string()));
    }

    #[test]
    fn detect_short_year_format() {
        // More than one candidate format can accept a two-digit year, and
        // which one wins depends on the candidate order. Only require that
        // some format is detected; the user can override it in the wizard.
        let samples = ["03/31/26", "01/15/25"];
        assert!(detect_date_format(&samples).is_some());
    }

    #[test]
    fn detect_no_date_format() {
        let samples = ["hello", "world"];
        assert_eq!(detect_date_format(&samples), None);
    }

    // --- extract_date_component ---

    #[test]
    fn extract_year_component() {
        let year = extract_date_component("03/31/2026", "%m/%d/%Y", DateComponent::Year);
        assert_eq!(year, Some("2026".to_string()));
    }

    #[test]
    fn extract_month_component() {
        let month = extract_date_component("03/31/2026", "%m/%d/%Y", DateComponent::Month);
        assert_eq!(month, Some("2026-03".to_string()));
    }

    #[test]
    fn extract_quarter_component() {
        let quarter = extract_date_component("03/31/2026", "%m/%d/%Y", DateComponent::Quarter);
        assert_eq!(quarter, Some("2026-Q1".to_string()));
    }

    #[test]
    fn extract_quarter_q4() {
        let quarter = extract_date_component("12/15/2025", "%m/%d/%Y", DateComponent::Quarter);
        assert_eq!(quarter, Some("2025-Q4".to_string()));
    }

    // --- analyze_records ---

    #[test]
    fn analyze_detects_time_category_with_format() {
        let records = [
            serde_json::json!({"Date": "01/15/2025", "Amount": 100}),
            serde_json::json!({"Date": "02/20/2025", "Amount": 200}),
        ];
        let proposals = analyze_records(&records);
        let date_prop = proposals
            .iter()
            .find(|p| p.field == "Date")
            .expect("a proposal for the Date field");
        assert_eq!(date_prop.kind, FieldKind::TimeCategory);
        assert_eq!(date_prop.date_format.as_deref(), Some("%m/%d/%Y"));
    }
}