feat: date parsing, component extraction, and wizard formulas

Extend FieldProposal with chrono-based date format detection and
configurable component extraction (Year, Month, Quarter). Add
ConfigureDates and DefineFormulas wizard steps to ImportPipeline.
build_model injects derived date categories and parses formula strings.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Edward Langley
2026-04-03 13:41:05 -07:00
parent 5a251a1cbe
commit a73fe160c7
5 changed files with 441 additions and 31 deletions

View File

@ -1,3 +1,4 @@
use chrono::{Datelike, NaiveDate};
use serde_json::Value;
use std::collections::HashSet;
@ -13,12 +14,24 @@ pub enum FieldKind {
Label,
}
/// Date components that can be extracted from a date field.
///
/// Each variant selects the label that `extract_date_component` renders
/// from a parsed date.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DateComponent {
/// The year, rendered with `%Y` (e.g. "2026").
Year,
/// The year and month, rendered with `%Y-%m` (e.g. "2026-03").
Month,
/// The year and quarter, rendered as `<year>-Q<n>` with `n` from 1 to 4
/// (e.g. "2026-Q1").
Quarter,
}
/// A proposed interpretation of one field of the imported records.
///
/// `analyze_records` returns a list of these.
#[derive(Debug, Clone)]
pub struct FieldProposal {
/// Name of the record field this proposal describes (e.g. the JSON key "Date").
pub field: String,
/// Proposed role of the field; see `FieldKind`.
pub kind: FieldKind,
/// Distinct string values observed for the field. `analyze_records` leaves
/// this empty for `Measure` proposals.
pub distinct_values: Vec<String>,
/// Whether the proposal is currently accepted. `analyze_records` sets this
/// to `false` for `Label` proposals and `true` for the other kinds.
/// NOTE(review): presumably toggled later by the import wizard — confirm.
pub accepted: bool,
/// Detected chrono format string (e.g., "%m/%d/%Y"). Only set for TimeCategory.
pub date_format: Option<String>,
/// Which date components to extract as new categories.
/// `analyze_records` always initializes this to an empty list.
pub date_components: Vec<DateComponent>,
}
impl FieldProposal {
@ -32,6 +45,55 @@ impl FieldProposal {
}
}
/// Common date formats to try, in order of preference.
///
/// Order matters: `detect_date_format` returns the first entry that parses
/// every checked sample. chrono's `%Y` accepts fewer than four digits, so
/// `%m/%d/%y` must be listed before `%m/%d/%Y`; otherwise "04/02/25" is
/// detected as `%m/%d/%Y` and later parsed as the year 25 instead of 2025.
/// `%y` consumes at most two digits, so four-digit years are rejected by it
/// (trailing input) and still fall through to `%m/%d/%Y`.
const DATE_FORMATS: &[&str] = &[
    "%Y-%m-%d",  // 2025-04-02
    "%m/%d/%y",  // 04/02/25 (must precede the four-digit variant, see above)
    "%m/%d/%Y",  // 04/02/2025
    "%d/%m/%Y",  // 02/04/2025
    "%Y%m%d",    // 20250402
    "%b %d, %Y", // Apr 02, 2025
    "%B %d, %Y", // April 02, 2025
    "%d-%b-%Y",  // 02-Apr-2025
];
/// Detects which entry of `DATE_FORMATS` describes the given sample values.
///
/// Empty samples are skipped, and for efficiency only the first 10 remaining
/// samples are checked. Returns the first format, in `DATE_FORMATS` order,
/// under which every checked sample parses as a `NaiveDate`. Returns `None`
/// when there are no non-empty samples or no format fits.
pub fn detect_date_format(samples: &[&str]) -> Option<String> {
    const MAX_SAMPLES: usize = 10;

    let probe: Vec<&str> = samples
        .iter()
        .copied()
        .filter(|s| !s.is_empty())
        .take(MAX_SAMPLES)
        .collect();
    if probe.is_empty() {
        return None;
    }

    DATE_FORMATS
        .iter()
        .find(|fmt| {
            probe
                .iter()
                .all(|s| NaiveDate::parse_from_str(s, fmt).is_ok())
        })
        .map(|fmt| (*fmt).to_string())
}
/// Parses `value` with the chrono format string `format` and renders the
/// requested `component` as a label.
///
/// Returns `None` when `value` does not parse under `format`. Otherwise:
/// - `Year` is rendered with `%Y` (e.g. "2026"),
/// - `Month` is rendered with `%Y-%m` (e.g. "2026-03"),
/// - `Quarter` is rendered as `<%Y>-Q<n>` with `n` from 1 to 4 (e.g. "2026-Q1").
pub fn extract_date_component(
    value: &str,
    format: &str,
    component: DateComponent,
) -> Option<String> {
    let parsed = NaiveDate::parse_from_str(value, format).ok()?;
    let label = match component {
        DateComponent::Year => parsed.format("%Y").to_string(),
        DateComponent::Month => parsed.format("%Y-%m").to_string(),
        DateComponent::Quarter => {
            // month0() is zero-based, so dividing by 3 gives the zero-based quarter.
            let quarter = parsed.month0() / 3 + 1;
            format!("{}-Q{}", parsed.format("%Y"), quarter)
        }
    };
    Some(label)
}
const CATEGORY_THRESHOLD: usize = 20;
pub fn analyze_records(records: &[Value]) -> Vec<FieldProposal> {
@ -65,6 +127,8 @@ pub fn analyze_records(records: &[Value]) -> Vec<FieldProposal> {
kind: FieldKind::Measure,
distinct_values: vec![],
accepted: true,
date_format: None,
date_components: vec![],
};
}
@ -72,26 +136,19 @@ pub fn analyze_records(records: &[Value]) -> Vec<FieldProposal> {
let distinct: HashSet<&str> = values.iter().filter_map(|v| v.as_str()).collect();
let distinct_vec: Vec<String> = distinct.into_iter().map(String::from).collect();
let n = distinct_vec.len();
let _total = values.len();
// Check if looks like date
let looks_like_date = distinct_vec.iter().any(|s| {
s.contains('-') && s.len() >= 8
|| s.starts_with("Q") && s.len() == 2
|| [
"Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct",
"Nov", "Dec",
]
.iter()
.any(|m| s.starts_with(m))
});
// Try chrono-based date detection
let samples: Vec<&str> = distinct_vec.iter().map(|s| s.as_str()).collect();
let date_format = detect_date_format(&samples);
if looks_like_date {
if date_format.is_some() {
return FieldProposal {
field,
kind: FieldKind::TimeCategory,
distinct_values: distinct_vec,
accepted: true,
date_format,
date_components: vec![],
};
}
@ -101,6 +158,8 @@ pub fn analyze_records(records: &[Value]) -> Vec<FieldProposal> {
kind: FieldKind::Category,
distinct_values: distinct_vec,
accepted: true,
date_format: None,
date_components: vec![],
};
}
@ -109,6 +168,8 @@ pub fn analyze_records(records: &[Value]) -> Vec<FieldProposal> {
kind: FieldKind::Label,
distinct_values: distinct_vec,
accepted: false,
date_format: None,
date_components: vec![],
};
}
@ -118,6 +179,8 @@ pub fn analyze_records(records: &[Value]) -> Vec<FieldProposal> {
kind: FieldKind::Label,
distinct_values: vec![],
accepted: false,
date_format: None,
date_components: vec![],
}
})
.collect()
@ -160,3 +223,70 @@ fn find_array_paths_inner(value: &Value, prefix: &str, paths: &mut Vec<String>)
_ => {}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn detect_iso_date_format() {
        let detected = detect_date_format(&["2025-01-15", "2025-02-28", "2024-12-01"]);
        assert_eq!(detected.as_deref(), Some("%Y-%m-%d"));
    }

    #[test]
    fn detect_us_date_format() {
        let detected = detect_date_format(&["03/31/2026", "01/15/2025", "12/25/2024"]);
        assert_eq!(detected.as_deref(), Some("%m/%d/%Y"));
    }

    #[test]
    fn detect_short_year_format() {
        // chrono's %Y also accepts two-digit years, so these samples can be
        // matched by %m/%d/%Y as well as %m/%d/%y; which one is reported
        // depends on the order of DATE_FORMATS. Only require that some format
        // is detected — the user can override it in the wizard.
        let detected = detect_date_format(&["03/31/26", "01/15/25"]);
        assert!(detected.is_some());
    }

    #[test]
    fn detect_no_date_format() {
        let detected = detect_date_format(&["hello", "world"]);
        assert_eq!(detected, None);
    }

    #[test]
    fn extract_year_component() {
        let year = extract_date_component("03/31/2026", "%m/%d/%Y", DateComponent::Year);
        assert_eq!(year.as_deref(), Some("2026"));
    }

    #[test]
    fn extract_month_component() {
        let month = extract_date_component("03/31/2026", "%m/%d/%Y", DateComponent::Month);
        assert_eq!(month.as_deref(), Some("2026-03"));
    }

    #[test]
    fn extract_quarter_component() {
        let quarter = extract_date_component("03/31/2026", "%m/%d/%Y", DateComponent::Quarter);
        assert_eq!(quarter.as_deref(), Some("2026-Q1"));
    }

    #[test]
    fn extract_quarter_q4() {
        let quarter = extract_date_component("12/15/2025", "%m/%d/%Y", DateComponent::Quarter);
        assert_eq!(quarter.as_deref(), Some("2025-Q4"));
    }

    #[test]
    fn analyze_detects_time_category_with_format() {
        let records = [
            serde_json::json!({"Date": "01/15/2025", "Amount": 100}),
            serde_json::json!({"Date": "02/20/2025", "Amount": 200}),
        ];
        let proposals = analyze_records(&records);
        let date_prop = proposals.iter().find(|p| p.field == "Date").unwrap();
        assert_eq!(date_prop.kind, FieldKind::TimeCategory);
        assert_eq!(date_prop.date_format.as_deref(), Some("%m/%d/%Y"));
    }
}