diff --git a/src/model/cell.rs b/src/model/cell.rs index a3cf63e..c290f84 100644 --- a/src/model/cell.rs +++ b/src/model/cell.rs @@ -1,5 +1,7 @@ use serde::{Deserialize, Serialize}; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; + +use super::symbol::{Symbol, SymbolTable}; /// A cell key is a sorted vector of (category_name, item_name) pairs. /// Sorted by category name for canonical form. @@ -41,6 +43,7 @@ impl CellKey { ) } + #[allow(dead_code)] pub fn matches_partial(&self, partial: &[(String, String)]) -> bool { partial .iter() @@ -85,11 +88,22 @@ impl std::fmt::Display for CellValue { } } +/// Interned representation of a CellKey — cheap to hash and compare. +/// Sorted by first element (category Symbol) for canonical form. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct InternedKey(pub Vec<(Symbol, Symbol)>); + + /// Serialized as a list of (key, value) pairs so CellKey doesn't need /// to implement the `Serialize`-as-string requirement for JSON object keys. #[derive(Debug, Clone, Default)] pub struct DataStore { - cells: HashMap, + /// Primary storage — interned keys for O(1) hash/compare. + cells: HashMap, + /// String interner — all category/item names are interned here. + pub symbols: SymbolTable, + /// Secondary index: interned (category, item) → set of interned keys. + index: HashMap<(Symbol, Symbol), HashSet>, } impl Serialize for DataStore { @@ -97,7 +111,8 @@ impl Serialize for DataStore { use serde::ser::SerializeSeq; let mut seq = s.serialize_seq(Some(self.cells.len()))?; for (k, v) in &self.cells { - seq.serialize_element(&(k, v))?; + let cell_key = self.to_cell_key(k); + seq.serialize_element(&(cell_key, v))?; } seq.end() } @@ -106,8 +121,11 @@ impl Serialize for DataStore { impl<'de> Deserialize<'de> for DataStore { fn deserialize>(d: D) -> Result { let pairs: Vec<(CellKey, CellValue)> = Vec::deserialize(d)?; - let cells: HashMap = pairs.into_iter().collect(); - Ok(DataStore { cells }) + let mut store = DataStore::default(); + for (key, value) in pairs { + store.set(key, value); + } + Ok(store) } } @@ -116,27 +134,145 @@ impl DataStore { Self::default() } + /// Intern a CellKey into an InternedKey. + pub fn intern_key(&mut self, key: &CellKey) -> InternedKey { + InternedKey(self.symbols.intern_coords(&key.0)) + } + + /// Convert an InternedKey back to a CellKey (string form). + pub fn to_cell_key(&self, ikey: &InternedKey) -> CellKey { + CellKey( + ikey.0 + .iter() + .map(|(c, i)| { + ( + self.symbols.resolve(*c).to_string(), + self.symbols.resolve(*i).to_string(), + ) + }) + .collect(), + ) + } + pub fn set(&mut self, key: CellKey, value: CellValue) { - self.cells.insert(key, value); + let ikey = self.intern_key(&key); + // Update index for each coordinate pair + for pair in &ikey.0 { + self.index.entry(*pair).or_default().insert(ikey.clone()); + } + self.cells.insert(ikey, value); } pub fn get(&self, key: &CellKey) -> Option<&CellValue> { - self.cells.get(key) + let ikey = self.lookup_key(key)?; + self.cells.get(&ikey) } - pub fn cells(&self) -> &HashMap { - &self.cells + /// Look up an InternedKey for a CellKey without interning new symbols. + fn lookup_key(&self, key: &CellKey) -> Option { + let pairs: Option> = key + .0 + .iter() + .map(|(c, i)| Some((self.symbols.get(c)?, self.symbols.get(i)?))) + .collect(); + pairs.map(InternedKey) + } + + /// Iterate over all cells, yielding (CellKey, &CellValue) pairs. + pub fn iter_cells(&self) -> impl Iterator { + self.cells + .iter() + .map(|(k, v)| (self.to_cell_key(k), v)) } pub fn remove(&mut self, key: &CellKey) { - self.cells.remove(key); + let Some(ikey) = self.lookup_key(key) else { + return; + }; + if self.cells.remove(&ikey).is_some() { + for pair in &ikey.0 { + if let Some(set) = self.index.get_mut(pair) { + set.remove(&ikey); + } + } + } } - /// All cells where partial coords match - pub fn matching_cells(&self, partial: &[(String, String)]) -> Vec<(&CellKey, &CellValue)> { - self.cells + /// Values of all cells where every coordinate in `partial` matches. + /// Hot path: avoids allocating CellKey for each result. + pub fn matching_values(&self, partial: &[(String, String)]) -> Vec<&CellValue> { + if partial.is_empty() { + return self.cells.values().collect(); + } + + // Intern the partial key (lookup only, no new symbols) + let interned_partial: Vec<(Symbol, Symbol)> = partial .iter() - .filter(|(key, _)| key.matches_partial(partial)) + .filter_map(|(c, i)| Some((self.symbols.get(c)?, self.symbols.get(i)?))) + .collect(); + + if interned_partial.len() < partial.len() { + return vec![]; + } + + let mut sets: Vec<&HashSet> = interned_partial + .iter() + .filter_map(|pair| self.index.get(pair)) + .collect(); + + if sets.len() < interned_partial.len() { + return vec![]; + } + + sets.sort_by_key(|s| s.len()); + let first = sets[0]; + let rest = &sets[1..]; + + first + .iter() + .filter(|ikey| rest.iter().all(|s| s.contains(*ikey))) + .filter_map(|ikey| self.cells.get(ikey)) + .collect() + } + + /// All cells where every coordinate in `partial` matches. + /// Allocates CellKey strings for each match — use `matching_values` + /// if you only need values. + #[allow(dead_code)] + pub fn matching_cells(&self, partial: &[(String, String)]) -> Vec<(CellKey, &CellValue)> { + if partial.is_empty() { + return self.iter_cells().collect(); + } + + let interned_partial: Vec<(Symbol, Symbol)> = partial + .iter() + .filter_map(|(c, i)| Some((self.symbols.get(c)?, self.symbols.get(i)?))) + .collect(); + + if interned_partial.len() < partial.len() { + return vec![]; + } + + let mut sets: Vec<&HashSet> = interned_partial + .iter() + .filter_map(|pair| self.index.get(pair)) + .collect(); + + if sets.len() < interned_partial.len() { + return vec![]; + } + + sets.sort_by_key(|s| s.len()); + let first = sets[0]; + let rest = &sets[1..]; + + first + .iter() + .filter(|ikey| rest.iter().all(|s| s.contains(*ikey))) + .filter_map(|ikey| { + let value = self.cells.get(ikey)?; + Some((self.to_cell_key(ikey), value)) + }) .collect() } } @@ -285,7 +421,7 @@ mod data_store { let k = key(&[("Region", "East")]); store.set(k.clone(), CellValue::Number(5.0)); store.remove(&k); - assert!(store.cells().is_empty()); + assert!(store.iter_cells().next().is_none()); } #[test] diff --git a/src/model/types.rs b/src/model/types.rs index 767a3b5..6f3451a 100644 --- a/src/model/types.rs +++ b/src/model/types.rs @@ -200,9 +200,9 @@ impl Model { // Aggregate raw data across all None-axis categories let values: Vec = self .data - .matching_cells(&key.0) + .matching_values(&key.0) .into_iter() - .filter_map(|(_, v)| v.as_f64()) + .filter_map(|v| v.as_f64()) .collect(); if values.is_empty() { @@ -303,9 +303,9 @@ impl Model { } let values: Vec = model .data - .matching_cells(&partial.0) + .matching_values(&partial.0) .into_iter() - .filter_map(|(_, v)| v.as_f64()) + .filter_map(|v| v.as_f64()) .collect(); match func { AggFunc::Sum => Some(values.iter().sum()), diff --git a/src/persistence/mod.rs b/src/persistence/mod.rs index 26acc62..80b3482 100644 --- a/src/persistence/mod.rs +++ b/src/persistence/mod.rs @@ -88,7 +88,7 @@ pub fn format_md(model: &Model) -> String { } // Data — sorted by coordinate string for deterministic diffs - let mut cells: Vec<_> = model.data.cells().iter().collect(); + let mut cells: Vec<_> = model.data.iter_cells().collect(); cells.sort_by_key(|(k, _)| coord_str(k)); if !cells.is_empty() { writeln!(out, "\n## Data").unwrap(); @@ -97,7 +97,7 @@ pub fn format_md(model: &Model) -> String { CellValue::Number(_) => value.to_string(), CellValue::Text(s) => format!("\"{}\"", s), }; - writeln!(out, "{} = {}", coord_str(key), val_str).unwrap(); + writeln!(out, "{} = {}", coord_str(&key), val_str).unwrap(); } } @@ -461,7 +461,7 @@ pub fn export_csv(model: &Model, view_name: &str, path: &Path) -> Result<()> { .map(|ci| { layout .cell_key(ri, ci) - .and_then(|key| model.evaluate(&key)) + .and_then(|key| model.evaluate_aggregated(&key, &layout.none_cats)) .map(|v| v.to_string()) .unwrap_or_default() })