use std::collections::HashSet; use serde_json::Value; #[derive(Debug, Clone, PartialEq)] pub enum FieldKind { /// Small number of distinct string values → dimension/category Category, /// Numeric values → measure Measure, /// Date/time strings → time category TimeCategory, /// Many unique strings (IDs, names) → label/identifier Label, } #[derive(Debug, Clone)] pub struct FieldProposal { pub field: String, pub kind: FieldKind, pub distinct_values: Vec, pub accepted: bool, } impl FieldProposal { pub fn kind_label(&self) -> &'static str { match self.kind { FieldKind::Category => "Category (dimension)", FieldKind::Measure => "Measure (numeric)", FieldKind::TimeCategory => "Time Category", FieldKind::Label => "Label/Identifier (skip)", } } } const CATEGORY_THRESHOLD: usize = 20; const LABEL_THRESHOLD: usize = 50; pub fn analyze_records(records: &[Value]) -> Vec { if records.is_empty() { return vec![]; } // Collect all field names let mut fields: Vec = Vec::new(); for record in records { if let Value::Object(map) = record { for key in map.keys() { if !fields.contains(key) { fields.push(key.clone()); } } } } fields.into_iter().map(|field| { let values: Vec<&Value> = records.iter() .filter_map(|r| r.get(&field)) .collect(); let all_numeric = values.iter().all(|v| v.is_number()); let all_string = values.iter().all(|v| v.is_string()); if all_numeric { return FieldProposal { field, kind: FieldKind::Measure, distinct_values: vec![], accepted: true, }; } if all_string { let distinct: HashSet<&str> = values.iter() .filter_map(|v| v.as_str()) .collect(); let distinct_vec: Vec = distinct.into_iter().map(String::from).collect(); let n = distinct_vec.len(); let _total = values.len(); // Check if looks like date let looks_like_date = distinct_vec.iter().any(|s| { s.contains('-') && s.len() >= 8 || s.starts_with("Q") && s.len() == 2 || ["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"] .iter().any(|m| s.starts_with(m)) }); if looks_like_date { return FieldProposal { field, kind: FieldKind::TimeCategory, distinct_values: distinct_vec, accepted: true, }; } if n <= CATEGORY_THRESHOLD { return FieldProposal { field, kind: FieldKind::Category, distinct_values: distinct_vec, accepted: true, }; } return FieldProposal { field, kind: FieldKind::Label, distinct_values: distinct_vec, accepted: false, }; } // Mixed or other: treat as label FieldProposal { field, kind: FieldKind::Label, distinct_values: vec![], accepted: false, } }).collect() } /// Extract nested array from JSON by dot-path pub fn extract_array_at_path<'a>(value: &'a Value, path: &str) -> Option<&'a Vec> { if path.is_empty() { return value.as_array(); } let mut current = value; for part in path.split('.') { current = current.get(part)?; } current.as_array() } /// Find candidate paths to arrays in JSON pub fn find_array_paths(value: &Value) -> Vec { let mut paths = Vec::new(); find_array_paths_inner(value, "", &mut paths); paths } fn find_array_paths_inner(value: &Value, prefix: &str, paths: &mut Vec) { match value { Value::Array(_) => { paths.push(prefix.to_string()); } Value::Object(map) => { for (key, val) in map { let path = if prefix.is_empty() { key.clone() } else { format!("{prefix}.{key}") }; find_array_paths_inner(val, &path, paths); } } _ => {} } }