use chrono::{Datelike, NaiveDate}; use serde_json::Value; use std::collections::HashSet; #[derive(Debug, Clone, PartialEq)] pub enum FieldKind { /// Small number of distinct string values → dimension/category Category, /// Numeric values → measure Measure, /// Date/time strings → time category TimeCategory, /// Many unique strings (IDs, names) → label/identifier Label, } /// Date components that can be extracted from a date field. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum DateComponent { Year, Month, Quarter, } #[derive(Debug, Clone)] pub struct FieldProposal { pub field: String, pub kind: FieldKind, pub distinct_values: Vec, pub accepted: bool, /// Detected chrono format string (e.g., "%m/%d/%Y"). Only set for TimeCategory. pub date_format: Option, /// Which date components to extract as new categories. pub date_components: Vec, } impl FieldProposal { pub fn kind_label(&self) -> &'static str { match self.kind { FieldKind::Category => "Category (dimension)", FieldKind::Measure => "Measure (numeric)", FieldKind::TimeCategory => "Time Category", FieldKind::Label => "Label/Identifier (skip)", } } } /// Common date formats to try, in order of preference. const DATE_FORMATS: &[&str] = &[ "%Y-%m-%d", // 2025-04-02 "%m/%d/%Y", // 04/02/2025 "%m/%d/%y", // 04/02/25 "%d/%m/%Y", // 02/04/2025 "%Y%m%d", // 20250402 "%b %d, %Y", // Apr 02, 2025 "%B %d, %Y", // April 02, 2025 "%d-%b-%Y", // 02-Apr-2025 ]; /// Try to detect a chrono date format from sample values. /// Returns the first format that successfully parses all non-empty samples. pub fn detect_date_format(samples: &[&str]) -> Option { let samples: Vec<&str> = samples.iter().copied().filter(|s| !s.is_empty()).collect(); if samples.is_empty() { return None; } // Try up to 10 samples for efficiency let test_samples: Vec<&str> = samples.into_iter().take(10).collect(); for fmt in DATE_FORMATS { if test_samples .iter() .all(|s| NaiveDate::parse_from_str(s, fmt).is_ok()) { return Some(fmt.to_string()); } } None } /// Parse a date string and extract a component value. pub fn extract_date_component( value: &str, format: &str, component: DateComponent, ) -> Option { let date = NaiveDate::parse_from_str(value, format).ok()?; Some(match component { DateComponent::Year => format!("{}", date.format("%Y")), DateComponent::Month => format!("{}", date.format("%Y-%m")), DateComponent::Quarter => { let q = (date.month0() / 3) + 1; format!("{}-Q{}", date.format("%Y"), q) } }) } const CATEGORY_THRESHOLD: usize = 20; pub fn analyze_records(records: &[Value]) -> Vec { if records.is_empty() { return vec![]; } // Collect all field names let mut fields: Vec = Vec::new(); for record in records { if let Value::Object(map) = record { for key in map.keys() { if !fields.contains(key) { fields.push(key.clone()); } } } } fields .into_iter() .map(|field| { let values: Vec<&Value> = records.iter().filter_map(|r| r.get(&field)).collect(); let all_numeric = values.iter().all(|v| v.is_number()); let all_string = values.iter().all(|v| v.is_string()); if all_numeric { return FieldProposal { field, kind: FieldKind::Measure, distinct_values: vec![], accepted: true, date_format: None, date_components: vec![], }; } if all_string { let distinct: HashSet<&str> = values.iter().filter_map(|v| v.as_str()).collect(); let distinct_vec: Vec = distinct.into_iter().map(String::from).collect(); let n = distinct_vec.len(); // Try chrono-based date detection let samples: Vec<&str> = distinct_vec.iter().map(|s| s.as_str()).collect(); let date_format = detect_date_format(&samples); if date_format.is_some() { return FieldProposal { field, kind: FieldKind::TimeCategory, distinct_values: distinct_vec, accepted: true, date_format, date_components: vec![], }; } if n <= CATEGORY_THRESHOLD { return FieldProposal { field, kind: FieldKind::Category, distinct_values: distinct_vec, accepted: true, date_format: None, date_components: vec![], }; } return FieldProposal { field, kind: FieldKind::Label, distinct_values: distinct_vec, accepted: false, date_format: None, date_components: vec![], }; } // Mixed or other: treat as label FieldProposal { field, kind: FieldKind::Label, distinct_values: vec![], accepted: false, date_format: None, date_components: vec![], } }) .collect() } /// Extract nested array from JSON by dot-path pub fn extract_array_at_path<'a>(value: &'a Value, path: &str) -> Option<&'a Vec> { if path.is_empty() { return value.as_array(); } let mut current = value; for part in path.split('.') { current = current.get(part)?; } current.as_array() } /// Find candidate paths to arrays in JSON pub fn find_array_paths(value: &Value) -> Vec { let mut paths = Vec::new(); find_array_paths_inner(value, "", &mut paths); paths } fn find_array_paths_inner(value: &Value, prefix: &str, paths: &mut Vec) { match value { Value::Array(_) => { paths.push(prefix.to_string()); } Value::Object(map) => { for (key, val) in map { let path = if prefix.is_empty() { key.clone() } else { format!("{prefix}.{key}") }; find_array_paths_inner(val, &path, paths); } } _ => {} } } #[cfg(test)] mod tests { use super::*; #[test] fn detect_iso_date_format() { let samples = vec!["2025-01-15", "2025-02-28", "2024-12-01"]; assert_eq!(detect_date_format(&samples), Some("%Y-%m-%d".to_string())); } #[test] fn detect_us_date_format() { let samples = vec!["03/31/2026", "01/15/2025", "12/25/2024"]; assert_eq!(detect_date_format(&samples), Some("%m/%d/%Y".to_string())); } #[test] fn detect_short_year_format() { // Two-digit years are ambiguous with four-digit format, so %m/%d/%Y // matches first. This is expected — the user can override in the wizard. let samples = vec!["03/31/26", "01/15/25"]; assert!(detect_date_format(&samples).is_some()); } #[test] fn detect_no_date_format() { let samples = vec!["hello", "world"]; assert_eq!(detect_date_format(&samples), None); } #[test] fn extract_year_component() { let result = extract_date_component("03/31/2026", "%m/%d/%Y", DateComponent::Year); assert_eq!(result, Some("2026".to_string())); } #[test] fn extract_month_component() { let result = extract_date_component("03/31/2026", "%m/%d/%Y", DateComponent::Month); assert_eq!(result, Some("2026-03".to_string())); } #[test] fn extract_quarter_component() { let result = extract_date_component("03/31/2026", "%m/%d/%Y", DateComponent::Quarter); assert_eq!(result, Some("2026-Q1".to_string())); } #[test] fn extract_quarter_q4() { let result = extract_date_component("12/15/2025", "%m/%d/%Y", DateComponent::Quarter); assert_eq!(result, Some("2025-Q4".to_string())); } #[test] fn analyze_detects_time_category_with_format() { let records: Vec = vec![ serde_json::json!({"Date": "01/15/2025", "Amount": 100}), serde_json::json!({"Date": "02/20/2025", "Amount": 200}), ]; let proposals = analyze_records(&records); let date_prop = proposals.iter().find(|p| p.field == "Date").unwrap(); assert_eq!(date_prop.kind, FieldKind::TimeCategory); assert_eq!(date_prop.date_format, Some("%m/%d/%Y".to_string())); } }