feat: date parsing, component extraction, and wizard formulas
Extend FieldProposal with chrono-based date format detection and configurable component extraction (Year, Month, Quarter). Add ConfigureDates and DefineFormulas wizard steps to ImportPipeline. build_model injects derived date categories and parses formula strings. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@ -1,3 +1,4 @@
|
||||
use chrono::{Datelike, NaiveDate};
|
||||
use serde_json::Value;
|
||||
use std::collections::HashSet;
|
||||
|
||||
@ -13,12 +14,24 @@ pub enum FieldKind {
|
||||
Label,
|
||||
}
|
||||
|
||||
/// Date components that can be extracted from a date field.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum DateComponent {
|
||||
Year,
|
||||
Month,
|
||||
Quarter,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct FieldProposal {
|
||||
pub field: String,
|
||||
pub kind: FieldKind,
|
||||
pub distinct_values: Vec<String>,
|
||||
pub accepted: bool,
|
||||
/// Detected chrono format string (e.g., "%m/%d/%Y"). Only set for TimeCategory.
|
||||
pub date_format: Option<String>,
|
||||
/// Which date components to extract as new categories.
|
||||
pub date_components: Vec<DateComponent>,
|
||||
}
|
||||
|
||||
impl FieldProposal {
|
||||
@ -32,6 +45,55 @@ impl FieldProposal {
|
||||
}
|
||||
}
|
||||
|
||||
/// Common date formats to try, in order of preference.
|
||||
const DATE_FORMATS: &[&str] = &[
|
||||
"%Y-%m-%d", // 2025-04-02
|
||||
"%m/%d/%Y", // 04/02/2025
|
||||
"%m/%d/%y", // 04/02/25
|
||||
"%d/%m/%Y", // 02/04/2025
|
||||
"%Y%m%d", // 20250402
|
||||
"%b %d, %Y", // Apr 02, 2025
|
||||
"%B %d, %Y", // April 02, 2025
|
||||
"%d-%b-%Y", // 02-Apr-2025
|
||||
];
|
||||
|
||||
/// Try to detect a chrono date format from sample values.
|
||||
/// Returns the first format that successfully parses all non-empty samples.
|
||||
pub fn detect_date_format(samples: &[&str]) -> Option<String> {
|
||||
let samples: Vec<&str> = samples.iter().copied().filter(|s| !s.is_empty()).collect();
|
||||
if samples.is_empty() {
|
||||
return None;
|
||||
}
|
||||
// Try up to 10 samples for efficiency
|
||||
let test_samples: Vec<&str> = samples.into_iter().take(10).collect();
|
||||
for fmt in DATE_FORMATS {
|
||||
if test_samples
|
||||
.iter()
|
||||
.all(|s| NaiveDate::parse_from_str(s, fmt).is_ok())
|
||||
{
|
||||
return Some(fmt.to_string());
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Parse a date string and extract a component value.
|
||||
pub fn extract_date_component(
|
||||
value: &str,
|
||||
format: &str,
|
||||
component: DateComponent,
|
||||
) -> Option<String> {
|
||||
let date = NaiveDate::parse_from_str(value, format).ok()?;
|
||||
Some(match component {
|
||||
DateComponent::Year => format!("{}", date.format("%Y")),
|
||||
DateComponent::Month => format!("{}", date.format("%Y-%m")),
|
||||
DateComponent::Quarter => {
|
||||
let q = (date.month0() / 3) + 1;
|
||||
format!("{}-Q{}", date.format("%Y"), q)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
const CATEGORY_THRESHOLD: usize = 20;
|
||||
|
||||
pub fn analyze_records(records: &[Value]) -> Vec<FieldProposal> {
|
||||
@ -65,6 +127,8 @@ pub fn analyze_records(records: &[Value]) -> Vec<FieldProposal> {
|
||||
kind: FieldKind::Measure,
|
||||
distinct_values: vec![],
|
||||
accepted: true,
|
||||
date_format: None,
|
||||
date_components: vec![],
|
||||
};
|
||||
}
|
||||
|
||||
@ -72,26 +136,19 @@ pub fn analyze_records(records: &[Value]) -> Vec<FieldProposal> {
|
||||
let distinct: HashSet<&str> = values.iter().filter_map(|v| v.as_str()).collect();
|
||||
let distinct_vec: Vec<String> = distinct.into_iter().map(String::from).collect();
|
||||
let n = distinct_vec.len();
|
||||
let _total = values.len();
|
||||
|
||||
// Check if looks like date
|
||||
let looks_like_date = distinct_vec.iter().any(|s| {
|
||||
s.contains('-') && s.len() >= 8
|
||||
|| s.starts_with("Q") && s.len() == 2
|
||||
|| [
|
||||
"Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct",
|
||||
"Nov", "Dec",
|
||||
]
|
||||
.iter()
|
||||
.any(|m| s.starts_with(m))
|
||||
});
|
||||
// Try chrono-based date detection
|
||||
let samples: Vec<&str> = distinct_vec.iter().map(|s| s.as_str()).collect();
|
||||
let date_format = detect_date_format(&samples);
|
||||
|
||||
if looks_like_date {
|
||||
if date_format.is_some() {
|
||||
return FieldProposal {
|
||||
field,
|
||||
kind: FieldKind::TimeCategory,
|
||||
distinct_values: distinct_vec,
|
||||
accepted: true,
|
||||
date_format,
|
||||
date_components: vec![],
|
||||
};
|
||||
}
|
||||
|
||||
@ -101,6 +158,8 @@ pub fn analyze_records(records: &[Value]) -> Vec<FieldProposal> {
|
||||
kind: FieldKind::Category,
|
||||
distinct_values: distinct_vec,
|
||||
accepted: true,
|
||||
date_format: None,
|
||||
date_components: vec![],
|
||||
};
|
||||
}
|
||||
|
||||
@ -109,6 +168,8 @@ pub fn analyze_records(records: &[Value]) -> Vec<FieldProposal> {
|
||||
kind: FieldKind::Label,
|
||||
distinct_values: distinct_vec,
|
||||
accepted: false,
|
||||
date_format: None,
|
||||
date_components: vec![],
|
||||
};
|
||||
}
|
||||
|
||||
@ -118,6 +179,8 @@ pub fn analyze_records(records: &[Value]) -> Vec<FieldProposal> {
|
||||
kind: FieldKind::Label,
|
||||
distinct_values: vec![],
|
||||
accepted: false,
|
||||
date_format: None,
|
||||
date_components: vec![],
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
@ -160,3 +223,70 @@ fn find_array_paths_inner(value: &Value, prefix: &str, paths: &mut Vec<String>)
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn detect_iso_date_format() {
|
||||
let samples = vec!["2025-01-15", "2025-02-28", "2024-12-01"];
|
||||
assert_eq!(detect_date_format(&samples), Some("%Y-%m-%d".to_string()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn detect_us_date_format() {
|
||||
let samples = vec!["03/31/2026", "01/15/2025", "12/25/2024"];
|
||||
assert_eq!(detect_date_format(&samples), Some("%m/%d/%Y".to_string()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn detect_short_year_format() {
|
||||
// Two-digit years are ambiguous with four-digit format, so %m/%d/%Y
|
||||
// matches first. This is expected — the user can override in the wizard.
|
||||
let samples = vec!["03/31/26", "01/15/25"];
|
||||
assert!(detect_date_format(&samples).is_some());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn detect_no_date_format() {
|
||||
let samples = vec!["hello", "world"];
|
||||
assert_eq!(detect_date_format(&samples), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn extract_year_component() {
|
||||
let result = extract_date_component("03/31/2026", "%m/%d/%Y", DateComponent::Year);
|
||||
assert_eq!(result, Some("2026".to_string()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn extract_month_component() {
|
||||
let result = extract_date_component("03/31/2026", "%m/%d/%Y", DateComponent::Month);
|
||||
assert_eq!(result, Some("2026-03".to_string()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn extract_quarter_component() {
|
||||
let result = extract_date_component("03/31/2026", "%m/%d/%Y", DateComponent::Quarter);
|
||||
assert_eq!(result, Some("2026-Q1".to_string()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn extract_quarter_q4() {
|
||||
let result = extract_date_component("12/15/2025", "%m/%d/%Y", DateComponent::Quarter);
|
||||
assert_eq!(result, Some("2025-Q4".to_string()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn analyze_detects_time_category_with_format() {
|
||||
let records: Vec<Value> = vec![
|
||||
serde_json::json!({"Date": "01/15/2025", "Amount": 100}),
|
||||
serde_json::json!({"Date": "02/20/2025", "Amount": 200}),
|
||||
];
|
||||
let proposals = analyze_records(&records);
|
||||
let date_prop = proposals.iter().find(|p| p.field == "Date").unwrap();
|
||||
assert_eq!(date_prop.kind, FieldKind::TimeCategory);
|
||||
assert_eq!(date_prop.date_format, Some("%m/%d/%Y".to_string()));
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user