Files
improvise/src/import/analyzer.rs
Ed L eae00522e2 Initial implementation of Improvise TUI
Multi-dimensional data modeling terminal application with:
- Core data model: categories, items, groups, sparse cell store
- Formula system: recursive-descent parser, named formulas, WHERE clauses
- View system: Row/Column/Page axes, tile-based pivot, page slicing
- JSON import wizard (interactive TUI + headless auto-mode)
- Command layer: all mutations via typed Command enum for headless replay
- TUI: Ratatui grid, tile bar, formula/category/view panels, help overlay
- Persistence: .improv (JSON), .improv.gz (gzip), CSV export, autosave
- Static binary via x86_64-unknown-linux-musl + nix flake devShell
- Headless mode: --cmd '{"op":"..."}' and --script file.jsonl

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-20 21:11:55 -07:00

161 lines
4.5 KiB
Rust

use std::collections::HashSet;
use serde_json::Value;
/// How an imported JSON field is proposed to be treated by the importer.
///
/// Derives `Eq` and `Hash` in addition to `PartialEq` so that a kind can be
/// used as a `HashMap`/`HashSet` key and compared in `assert_eq!` without
/// restrictions; the enum has no payload, so both are trivially correct.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum FieldKind {
    /// Small number of distinct string values → dimension/category
    Category,
    /// Numeric values → measure
    Measure,
    /// Date/time strings → time category
    TimeCategory,
    /// Many unique strings (IDs, names) → label/identifier
    Label,
}
/// The proposed treatment of a single field found in the imported records.
///
/// Values of this type are produced by `analyze_records`, one per distinct
/// field name.
#[derive(Debug, Clone)]
pub struct FieldProposal {
/// Name of the field, i.e. the JSON object key it was found under.
pub field: String,
/// Classification proposed for the field.
pub kind: FieldKind,
/// Distinct string values observed for the field. `analyze_records` leaves
/// this empty for numeric fields and for fields with mixed/non-string values.
pub distinct_values: Vec<String>,
/// Whether the proposal is accepted. `analyze_records` sets this to `true`
/// for every kind except `FieldKind::Label`.
/// NOTE(review): presumably the import wizard lets the user toggle this
/// before importing — confirm against the caller.
pub accepted: bool,
}
impl FieldProposal {
    /// Returns the fixed, human-readable description of this proposal's kind.
    ///
    /// The result is a string literal, so it can be kept around without
    /// borrowing from `self`.
    pub fn kind_label(&self) -> &'static str {
        // Borrow the kind rather than matching on the place expression; the
        // variants carry no data, so either form selects the same arm.
        match &self.kind {
            FieldKind::Label => "Label/Identifier (skip)",
            FieldKind::TimeCategory => "Time Category",
            FieldKind::Measure => "Measure (numeric)",
            FieldKind::Category => "Category (dimension)",
        }
    }
}
/// Largest number of distinct string values for which `analyze_records` still
/// proposes a field as `FieldKind::Category`; string fields with more distinct
/// values (and no date-like value) are proposed as `FieldKind::Label`.
const CATEGORY_THRESHOLD: usize = 20;
/// NOTE(review): not referenced anywhere in the visible code. Presumably meant
/// as a distinct-value cut-off for label detection — confirm the intent or
/// remove it to silence the dead-code warning.
const LABEL_THRESHOLD: usize = 50;
/// Inspects a slice of JSON records and proposes how each field should be imported.
///
/// Field names are gathered from every record that is a JSON object, in the
/// order they are first encountered, and one [`FieldProposal`] is returned per
/// field name:
///
/// * every present value is a number → [`FieldKind::Measure`], accepted;
/// * every present value is a string →
///   [`FieldKind::TimeCategory`] if any distinct value looks like a date,
///   otherwise [`FieldKind::Category`] when there are at most
///   `CATEGORY_THRESHOLD` distinct values (both accepted),
///   otherwise [`FieldKind::Label`], not accepted;
/// * anything else (mixed types, `null`, booleans, nested values) →
///   [`FieldKind::Label`], not accepted, with no distinct values.
///
/// `distinct_values` lists each string in the order it first appears in
/// `records`, so the result is identical from run to run (collecting through a
/// `HashSet` alone would yield a randomized order).
///
/// Returns an empty vector when `records` is empty or contains no JSON objects.
pub fn analyze_records(records: &[Value]) -> Vec<FieldProposal> {
    // Collect field names in first-seen order. The set makes the duplicate
    // check O(1) instead of rescanning the vector for every key.
    let mut seen: HashSet<&str> = HashSet::new();
    let mut fields: Vec<&str> = Vec::new();
    for record in records {
        if let Value::Object(map) = record {
            for key in map.keys() {
                if seen.insert(key.as_str()) {
                    fields.push(key.as_str());
                }
            }
        }
    }

    fields
        .into_iter()
        .map(|field| propose_field(field, records))
        .collect()
}

/// Builds the proposal for one field from the values it has across `records`.
fn propose_field(field: &str, records: &[Value]) -> FieldProposal {
    // Records that lack the field (or are not objects) contribute no value.
    let values: Vec<&Value> = records.iter().filter_map(|r| r.get(field)).collect();

    if values.iter().all(|v| v.is_number()) {
        return FieldProposal {
            field: field.to_string(),
            kind: FieldKind::Measure,
            distinct_values: vec![],
            accepted: true,
        };
    }

    if !values.iter().all(|v| v.is_string()) {
        // Mixed or other: treat as label
        return FieldProposal {
            field: field.to_string(),
            kind: FieldKind::Label,
            distinct_values: vec![],
            accepted: false,
        };
    }

    // Deduplicate while keeping first-seen order so the output is deterministic.
    let mut seen_values = HashSet::new();
    let distinct_values: Vec<String> = values
        .iter()
        .filter_map(|v| v.as_str())
        .filter(|s| seen_values.insert(*s))
        .map(String::from)
        .collect();

    let (kind, accepted) = if distinct_values.iter().any(|s| looks_like_date(s)) {
        (FieldKind::TimeCategory, true)
    } else if distinct_values.len() <= CATEGORY_THRESHOLD {
        (FieldKind::Category, true)
    } else {
        (FieldKind::Label, false)
    };

    FieldProposal {
        field: field.to_string(),
        kind,
        distinct_values,
        accepted,
    }
}

/// Cheap textual heuristic for "this string is a date, quarter, or month".
///
/// NOTE(review): the rules are deliberately loose and unchanged from the
/// original — hyphenated text of 8+ bytes (e.g. an ID like `ord-00012`), any
/// two-byte string starting with `Q`, and any string starting with a month
/// abbreviation (e.g. `Marketing`) all match. Tighten only together with the
/// callers' expectations.
fn looks_like_date(s: &str) -> bool {
    const MONTH_PREFIXES: [&str; 12] = [
        "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
    ];

    (s.contains('-') && s.len() >= 8)
        || (s.starts_with('Q') && s.len() == 2)
        || MONTH_PREFIXES.iter().any(|m| s.starts_with(*m))
}
/// Looks up the JSON array located at a dot-separated `path` inside `value`.
///
/// An empty `path` refers to `value` itself. Otherwise each `.`-separated
/// segment is used as a key lookup on the current node. Returns `None` as soon
/// as a segment cannot be resolved, or when the node finally reached is not an
/// array.
pub fn extract_array_at_path<'a>(value: &'a Value, path: &str) -> Option<&'a Vec<Value>> {
    let target = if path.is_empty() {
        // Splitting "" would yield a single empty segment, so the root is
        // handled explicitly.
        value
    } else {
        path.split('.')
            .try_fold(value, |node, segment| node.get(segment))?
    };
    target.as_array()
}
/// Lists the dot-separated paths of the arrays reachable from `value` through
/// nested objects.
///
/// The walk starts with an empty prefix, so an array at the root is reported
/// as the empty string.
pub fn find_array_paths(value: &Value) -> Vec<String> {
    let mut found: Vec<String> = Vec::new();
    find_array_paths_inner(value, "", &mut found);
    found
}
fn find_array_paths_inner(value: &Value, prefix: &str, paths: &mut Vec<String>) {
match value {
Value::Array(_) => {
paths.push(prefix.to_string());
}
Value::Object(map) => {
for (key, val) in map {
let path = if prefix.is_empty() {
key.clone()
} else {
format!("{prefix}.{key}")
};
find_array_paths_inner(val, &path, paths);
}
}
_ => {}
}
}