Extend FieldProposal with chrono-based date format detection and configurable component extraction (Year, Month, Quarter). Add ConfigureDates and DefineFormulas wizard steps to ImportPipeline. build_model injects derived date categories and parses formula strings. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
293 lines
9.0 KiB
Rust
293 lines
9.0 KiB
Rust
use chrono::{Datelike, NaiveDate};
|
|
use serde_json::Value;
|
|
use std::collections::HashSet;
|
|
|
|
/// The role a source field is proposed to play once imported.
///
/// This is a fieldless enum, so it is cheap to copy and supports total
/// equality; it derives `Copy` and `Eq` in addition to `Clone`/`PartialEq`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FieldKind {
    /// Small number of distinct string values → dimension/category
    Category,
    /// Numeric values → measure
    Measure,
    /// Date/time strings → time category
    TimeCategory,
    /// Many unique strings (IDs, names) → label/identifier
    Label,
}
|
|
|
|
/// A calendar granularity that can be derived from a parsed date field.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DateComponent {
    /// The calendar year, rendered as `YYYY`.
    Year,
    /// The year and month, rendered as `YYYY-MM`.
    Month,
    /// The year and quarter, rendered as `YYYY-Qn` with `n` in 1..=4.
    Quarter,
}
|
|
|
|
#[derive(Debug, Clone)]
|
|
pub struct FieldProposal {
|
|
pub field: String,
|
|
pub kind: FieldKind,
|
|
pub distinct_values: Vec<String>,
|
|
pub accepted: bool,
|
|
/// Detected chrono format string (e.g., "%m/%d/%Y"). Only set for TimeCategory.
|
|
pub date_format: Option<String>,
|
|
/// Which date components to extract as new categories.
|
|
pub date_components: Vec<DateComponent>,
|
|
}
|
|
|
|
impl FieldProposal {
|
|
pub fn kind_label(&self) -> &'static str {
|
|
match self.kind {
|
|
FieldKind::Category => "Category (dimension)",
|
|
FieldKind::Measure => "Measure (numeric)",
|
|
FieldKind::TimeCategory => "Time Category",
|
|
FieldKind::Label => "Label/Identifier (skip)",
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Candidate chrono format strings, tried in order; the first candidate that
/// parses every probed sample is the one reported.
///
/// Order matters where candidates overlap:
///
/// * `%m/%d/%y` comes before `%m/%d/%Y`. chrono's `%Y` accepts one to four
///   digits, so a value such as `04/02/25` would otherwise match `%m/%d/%Y`
///   and later be read as the year 0025. `%y` consumes at most two digits and
///   rejects the leftover digits of a four-digit year, so `04/02/2025` still
///   falls through to `%m/%d/%Y`.
/// * `%m/%d/%Y` comes before `%d/%m/%Y`, so values that are valid either way
///   (day ≤ 12) are read month-first.
const DATE_FORMATS: &[&str] = &[
    "%Y-%m-%d",  // 2025-04-02
    "%m/%d/%y",  // 04/02/25
    "%m/%d/%Y",  // 04/02/2025
    "%d/%m/%Y",  // 02/04/2025
    "%Y%m%d",    // 20250402
    "%b %d, %Y", // Apr 02, 2025
    "%B %d, %Y", // April 02, 2025
    "%d-%b-%Y",  // 02-Apr-2025
];
|
|
|
|
/// Try to detect a chrono date format from sample values.
|
|
/// Returns the first format that successfully parses all non-empty samples.
|
|
pub fn detect_date_format(samples: &[&str]) -> Option<String> {
|
|
let samples: Vec<&str> = samples.iter().copied().filter(|s| !s.is_empty()).collect();
|
|
if samples.is_empty() {
|
|
return None;
|
|
}
|
|
// Try up to 10 samples for efficiency
|
|
let test_samples: Vec<&str> = samples.into_iter().take(10).collect();
|
|
for fmt in DATE_FORMATS {
|
|
if test_samples
|
|
.iter()
|
|
.all(|s| NaiveDate::parse_from_str(s, fmt).is_ok())
|
|
{
|
|
return Some(fmt.to_string());
|
|
}
|
|
}
|
|
None
|
|
}
|
|
|
|
/// Parse a date string and extract a component value.
|
|
pub fn extract_date_component(
|
|
value: &str,
|
|
format: &str,
|
|
component: DateComponent,
|
|
) -> Option<String> {
|
|
let date = NaiveDate::parse_from_str(value, format).ok()?;
|
|
Some(match component {
|
|
DateComponent::Year => format!("{}", date.format("%Y")),
|
|
DateComponent::Month => format!("{}", date.format("%Y-%m")),
|
|
DateComponent::Quarter => {
|
|
let q = (date.month0() / 3) + 1;
|
|
format!("{}-Q{}", date.format("%Y"), q)
|
|
}
|
|
})
|
|
}
|
|
|
|
/// Largest number of distinct values an all-string, non-date field may have
/// and still be proposed as a `Category`; fields above this become `Label`s.
const CATEGORY_THRESHOLD: usize = 20;
|
|
|
|
pub fn analyze_records(records: &[Value]) -> Vec<FieldProposal> {
|
|
if records.is_empty() {
|
|
return vec![];
|
|
}
|
|
|
|
// Collect all field names
|
|
let mut fields: Vec<String> = Vec::new();
|
|
for record in records {
|
|
if let Value::Object(map) = record {
|
|
for key in map.keys() {
|
|
if !fields.contains(key) {
|
|
fields.push(key.clone());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
fields
|
|
.into_iter()
|
|
.map(|field| {
|
|
let values: Vec<&Value> = records.iter().filter_map(|r| r.get(&field)).collect();
|
|
|
|
let all_numeric = values.iter().all(|v| v.is_number());
|
|
let all_string = values.iter().all(|v| v.is_string());
|
|
|
|
if all_numeric {
|
|
return FieldProposal {
|
|
field,
|
|
kind: FieldKind::Measure,
|
|
distinct_values: vec![],
|
|
accepted: true,
|
|
date_format: None,
|
|
date_components: vec![],
|
|
};
|
|
}
|
|
|
|
if all_string {
|
|
let distinct: HashSet<&str> = values.iter().filter_map(|v| v.as_str()).collect();
|
|
let distinct_vec: Vec<String> = distinct.into_iter().map(String::from).collect();
|
|
let n = distinct_vec.len();
|
|
|
|
// Try chrono-based date detection
|
|
let samples: Vec<&str> = distinct_vec.iter().map(|s| s.as_str()).collect();
|
|
let date_format = detect_date_format(&samples);
|
|
|
|
if date_format.is_some() {
|
|
return FieldProposal {
|
|
field,
|
|
kind: FieldKind::TimeCategory,
|
|
distinct_values: distinct_vec,
|
|
accepted: true,
|
|
date_format,
|
|
date_components: vec![],
|
|
};
|
|
}
|
|
|
|
if n <= CATEGORY_THRESHOLD {
|
|
return FieldProposal {
|
|
field,
|
|
kind: FieldKind::Category,
|
|
distinct_values: distinct_vec,
|
|
accepted: true,
|
|
date_format: None,
|
|
date_components: vec![],
|
|
};
|
|
}
|
|
|
|
return FieldProposal {
|
|
field,
|
|
kind: FieldKind::Label,
|
|
distinct_values: distinct_vec,
|
|
accepted: false,
|
|
date_format: None,
|
|
date_components: vec![],
|
|
};
|
|
}
|
|
|
|
// Mixed or other: treat as label
|
|
FieldProposal {
|
|
field,
|
|
kind: FieldKind::Label,
|
|
distinct_values: vec![],
|
|
accepted: false,
|
|
date_format: None,
|
|
date_components: vec![],
|
|
}
|
|
})
|
|
.collect()
|
|
}
|
|
|
|
/// Extract nested array from JSON by dot-path
|
|
pub fn extract_array_at_path<'a>(value: &'a Value, path: &str) -> Option<&'a Vec<Value>> {
|
|
if path.is_empty() {
|
|
return value.as_array();
|
|
}
|
|
let mut current = value;
|
|
for part in path.split('.') {
|
|
current = current.get(part)?;
|
|
}
|
|
current.as_array()
|
|
}
|
|
|
|
/// Find candidate paths to arrays in JSON
|
|
pub fn find_array_paths(value: &Value) -> Vec<String> {
|
|
let mut paths = Vec::new();
|
|
find_array_paths_inner(value, "", &mut paths);
|
|
paths
|
|
}
|
|
|
|
fn find_array_paths_inner(value: &Value, prefix: &str, paths: &mut Vec<String>) {
|
|
match value {
|
|
Value::Array(_) => {
|
|
paths.push(prefix.to_string());
|
|
}
|
|
Value::Object(map) => {
|
|
for (key, val) in map {
|
|
let path = if prefix.is_empty() {
|
|
key.clone()
|
|
} else {
|
|
format!("{prefix}.{key}")
|
|
};
|
|
find_array_paths_inner(val, &path, paths);
|
|
}
|
|
}
|
|
_ => {}
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn detect_iso_date_format() {
        let detected = detect_date_format(&["2025-01-15", "2025-02-28", "2024-12-01"]);
        assert_eq!(detected.as_deref(), Some("%Y-%m-%d"));
    }

    #[test]
    fn detect_us_date_format() {
        let detected = detect_date_format(&["03/31/2026", "01/15/2025", "12/25/2024"]);
        assert_eq!(detected.as_deref(), Some("%m/%d/%Y"));
    }

    #[test]
    fn detect_short_year_format() {
        // chrono's %Y also accepts a two-digit year, so these samples can match
        // either %m/%d/%Y or %m/%d/%y depending on the candidate order. Only
        // require that some format is detected; the user can override it in
        // the wizard.
        let detected = detect_date_format(&["03/31/26", "01/15/25"]);
        assert!(detected.is_some());
    }

    #[test]
    fn detect_no_date_format() {
        let detected = detect_date_format(&["hello", "world"]);
        assert_eq!(detected, None);
    }

    #[test]
    fn extract_year_component() {
        let year = extract_date_component("03/31/2026", "%m/%d/%Y", DateComponent::Year);
        assert_eq!(year.as_deref(), Some("2026"));
    }

    #[test]
    fn extract_month_component() {
        let month = extract_date_component("03/31/2026", "%m/%d/%Y", DateComponent::Month);
        assert_eq!(month.as_deref(), Some("2026-03"));
    }

    #[test]
    fn extract_quarter_component() {
        let quarter = extract_date_component("03/31/2026", "%m/%d/%Y", DateComponent::Quarter);
        assert_eq!(quarter.as_deref(), Some("2026-Q1"));
    }

    #[test]
    fn extract_quarter_q4() {
        let quarter = extract_date_component("12/15/2025", "%m/%d/%Y", DateComponent::Quarter);
        assert_eq!(quarter.as_deref(), Some("2025-Q4"));
    }

    #[test]
    fn analyze_detects_time_category_with_format() {
        let records = [
            serde_json::json!({"Date": "01/15/2025", "Amount": 100}),
            serde_json::json!({"Date": "02/20/2025", "Amount": 200}),
        ];

        let proposals = analyze_records(&records);
        let date_prop = proposals
            .iter()
            .find(|p| p.field == "Date")
            .expect("a proposal for the Date field");

        assert_eq!(date_prop.kind, FieldKind::TimeCategory);
        assert_eq!(date_prop.date_format.as_deref(), Some("%m/%d/%Y"));
    }
}
|