feat: intern cell keys for O(1) comparison and indexing
Refactor DataStore to use interned keys (InternedKey) instead of string-based CellKey for O(1) hash and compare operations. Introduce SymbolTable-backed interning for all category and item names, storing them as Symbol identifiers throughout the data structure. Add secondary index mapping (category, item) pairs to sets of interned keys, enabling efficient partial match queries without scanning all cells. Optimize matching_values() to avoid allocating CellKey strings by working directly with interned keys and intersecting index sets. Update all callers to use new API: iter_cells(), matching_values(), and internal lookup_key() helper. Co-Authored-By: fiddlerwoaroof/git-smart-commit (unsloth/Qwen3.5-35B-A3B-GGUF:Q5_K_M)
This commit is contained in:
@ -1,5 +1,7 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use super::symbol::{Symbol, SymbolTable};
|
||||
|
||||
/// A cell key is a sorted vector of (category_name, item_name) pairs.
|
||||
/// Sorted by category name for canonical form.
|
||||
@ -41,6 +43,7 @@ impl CellKey {
|
||||
)
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub fn matches_partial(&self, partial: &[(String, String)]) -> bool {
|
||||
partial
|
||||
.iter()
|
||||
@ -85,11 +88,22 @@ impl std::fmt::Display for CellValue {
|
||||
}
|
||||
}
|
||||
|
||||
/// Interned representation of a CellKey — cheap to hash and compare.
|
||||
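/// Pairs keep the order of the source `CellKey` (sorted by category name);
/// `lookup_key` rebuilds keys in that same order, so the two must agree.
/// NOTE(review): confirm `SymbolTable::intern_coords` does not re-sort by Symbol.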
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
pub struct InternedKey(pub Vec<(Symbol, Symbol)>);
|
||||
|
||||
|
||||
/// Serialized as a list of (key, value) pairs so CellKey doesn't need
|
||||
/// to implement the `Serialize`-as-string requirement for JSON object keys.
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct DataStore {
|
||||
cells: HashMap<CellKey, CellValue>,
|
||||
/// Primary storage — interned keys for O(1) hash/compare.
|
||||
cells: HashMap<InternedKey, CellValue>,
|
||||
/// String interner — all category/item names are interned here.
|
||||
pub symbols: SymbolTable,
|
||||
/// Secondary index: interned (category, item) → set of interned keys.
|
||||
index: HashMap<(Symbol, Symbol), HashSet<InternedKey>>,
|
||||
}
|
||||
|
||||
impl Serialize for DataStore {
|
||||
@ -97,7 +111,8 @@ impl Serialize for DataStore {
|
||||
use serde::ser::SerializeSeq;
|
||||
let mut seq = s.serialize_seq(Some(self.cells.len()))?;
|
||||
for (k, v) in &self.cells {
|
||||
seq.serialize_element(&(k, v))?;
|
||||
let cell_key = self.to_cell_key(k);
|
||||
seq.serialize_element(&(cell_key, v))?;
|
||||
}
|
||||
seq.end()
|
||||
}
|
||||
@ -106,8 +121,11 @@ impl Serialize for DataStore {
|
||||
impl<'de> Deserialize<'de> for DataStore {
|
||||
fn deserialize<D: serde::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
|
||||
let pairs: Vec<(CellKey, CellValue)> = Vec::deserialize(d)?;
|
||||
let cells: HashMap<CellKey, CellValue> = pairs.into_iter().collect();
|
||||
Ok(DataStore { cells })
|
||||
let mut store = DataStore::default();
|
||||
for (key, value) in pairs {
|
||||
store.set(key, value);
|
||||
}
|
||||
Ok(store)
|
||||
}
|
||||
}
|
||||
|
||||
@ -116,27 +134,145 @@ impl DataStore {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
/// Intern a CellKey into an InternedKey.
|
||||
pub fn intern_key(&mut self, key: &CellKey) -> InternedKey {
|
||||
InternedKey(self.symbols.intern_coords(&key.0))
|
||||
}
|
||||
|
||||
/// Convert an InternedKey back to a CellKey (string form).
|
||||
pub fn to_cell_key(&self, ikey: &InternedKey) -> CellKey {
|
||||
CellKey(
|
||||
ikey.0
|
||||
.iter()
|
||||
.map(|(c, i)| {
|
||||
(
|
||||
self.symbols.resolve(*c).to_string(),
|
||||
self.symbols.resolve(*i).to_string(),
|
||||
)
|
||||
})
|
||||
.collect(),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn set(&mut self, key: CellKey, value: CellValue) {
|
||||
self.cells.insert(key, value);
|
||||
let ikey = self.intern_key(&key);
|
||||
// Update index for each coordinate pair
|
||||
for pair in &ikey.0 {
|
||||
self.index.entry(*pair).or_default().insert(ikey.clone());
|
||||
}
|
||||
self.cells.insert(ikey, value);
|
||||
}
|
||||
|
||||
/// Return the value stored under `key`, if any.
///
/// Uses `lookup_key`, so a key mentioning a name that was never interned
/// yields `None` without adding anything to the symbol table.
pub fn get(&self, key: &CellKey) -> Option<&CellValue> {
    let ikey = self.lookup_key(key)?;
    self.cells.get(&ikey)
}
|
||||
|
||||
pub fn cells(&self) -> &HashMap<CellKey, CellValue> {
|
||||
&self.cells
|
||||
/// Look up an InternedKey for a CellKey without interning new symbols.
|
||||
fn lookup_key(&self, key: &CellKey) -> Option<InternedKey> {
|
||||
let pairs: Option<Vec<(Symbol, Symbol)>> = key
|
||||
.0
|
||||
.iter()
|
||||
.map(|(c, i)| Some((self.symbols.get(c)?, self.symbols.get(i)?)))
|
||||
.collect();
|
||||
pairs.map(InternedKey)
|
||||
}
|
||||
|
||||
/// Iterate over all cells, yielding (CellKey, &CellValue) pairs.
|
||||
pub fn iter_cells(&self) -> impl Iterator<Item = (CellKey, &CellValue)> {
|
||||
self.cells
|
||||
.iter()
|
||||
.map(|(k, v)| (self.to_cell_key(k), v))
|
||||
}
|
||||
|
||||
/// Remove the cell stored under `key`, if present.
///
/// Does nothing when the key mentions a name that was never interned or
/// when no cell is stored under it. On removal the key is also dropped
/// from the secondary index.
pub fn remove(&mut self, key: &CellKey) {
    let Some(ikey) = self.lookup_key(key) else {
        return;
    };
    if self.cells.remove(&ikey).is_none() {
        return;
    }
    for pair in &ikey.0 {
        if let Some(set) = self.index.get_mut(pair) {
            set.remove(&ikey);
            // Drop emptied sets so the index does not grow without bound;
            // a missing entry and an empty set both mean "no matches".
            if set.is_empty() {
                self.index.remove(pair);
            }
        }
    }
}
|
||||
|
||||
/// All cells where partial coords match
|
||||
pub fn matching_cells(&self, partial: &[(String, String)]) -> Vec<(&CellKey, &CellValue)> {
|
||||
self.cells
|
||||
/// Values of all cells where every coordinate in `partial` matches.
|
||||
/// Hot path: avoids allocating CellKey for each result.
|
||||
pub fn matching_values(&self, partial: &[(String, String)]) -> Vec<&CellValue> {
|
||||
if partial.is_empty() {
|
||||
return self.cells.values().collect();
|
||||
}
|
||||
|
||||
// Intern the partial key (lookup only, no new symbols)
|
||||
let interned_partial: Vec<(Symbol, Symbol)> = partial
|
||||
.iter()
|
||||
.filter(|(key, _)| key.matches_partial(partial))
|
||||
.filter_map(|(c, i)| Some((self.symbols.get(c)?, self.symbols.get(i)?)))
|
||||
.collect();
|
||||
|
||||
if interned_partial.len() < partial.len() {
|
||||
return vec![];
|
||||
}
|
||||
|
||||
let mut sets: Vec<&HashSet<InternedKey>> = interned_partial
|
||||
.iter()
|
||||
.filter_map(|pair| self.index.get(pair))
|
||||
.collect();
|
||||
|
||||
if sets.len() < interned_partial.len() {
|
||||
return vec![];
|
||||
}
|
||||
|
||||
sets.sort_by_key(|s| s.len());
|
||||
let first = sets[0];
|
||||
let rest = &sets[1..];
|
||||
|
||||
first
|
||||
.iter()
|
||||
.filter(|ikey| rest.iter().all(|s| s.contains(*ikey)))
|
||||
.filter_map(|ikey| self.cells.get(ikey))
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// All cells where every coordinate in `partial` matches.
|
||||
/// Allocates CellKey strings for each match — use `matching_values`
|
||||
/// if you only need values.
|
||||
#[allow(dead_code)]
|
||||
pub fn matching_cells(&self, partial: &[(String, String)]) -> Vec<(CellKey, &CellValue)> {
|
||||
if partial.is_empty() {
|
||||
return self.iter_cells().collect();
|
||||
}
|
||||
|
||||
let interned_partial: Vec<(Symbol, Symbol)> = partial
|
||||
.iter()
|
||||
.filter_map(|(c, i)| Some((self.symbols.get(c)?, self.symbols.get(i)?)))
|
||||
.collect();
|
||||
|
||||
if interned_partial.len() < partial.len() {
|
||||
return vec![];
|
||||
}
|
||||
|
||||
let mut sets: Vec<&HashSet<InternedKey>> = interned_partial
|
||||
.iter()
|
||||
.filter_map(|pair| self.index.get(pair))
|
||||
.collect();
|
||||
|
||||
if sets.len() < interned_partial.len() {
|
||||
return vec![];
|
||||
}
|
||||
|
||||
sets.sort_by_key(|s| s.len());
|
||||
let first = sets[0];
|
||||
let rest = &sets[1..];
|
||||
|
||||
first
|
||||
.iter()
|
||||
.filter(|ikey| rest.iter().all(|s| s.contains(*ikey)))
|
||||
.filter_map(|ikey| {
|
||||
let value = self.cells.get(ikey)?;
|
||||
Some((self.to_cell_key(ikey), value))
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
@ -285,7 +421,7 @@ mod data_store {
|
||||
let k = key(&[("Region", "East")]);
|
||||
store.set(k.clone(), CellValue::Number(5.0));
|
||||
store.remove(&k);
|
||||
assert!(store.cells().is_empty());
|
||||
assert!(store.iter_cells().next().is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@ -200,9 +200,9 @@ impl Model {
|
||||
// Aggregate raw data across all None-axis categories
|
||||
let values: Vec<f64> = self
|
||||
.data
|
||||
.matching_cells(&key.0)
|
||||
.matching_values(&key.0)
|
||||
.into_iter()
|
||||
.filter_map(|(_, v)| v.as_f64())
|
||||
.filter_map(|v| v.as_f64())
|
||||
.collect();
|
||||
|
||||
if values.is_empty() {
|
||||
@ -303,9 +303,9 @@ impl Model {
|
||||
}
|
||||
let values: Vec<f64> = model
|
||||
.data
|
||||
.matching_cells(&partial.0)
|
||||
.matching_values(&partial.0)
|
||||
.into_iter()
|
||||
.filter_map(|(_, v)| v.as_f64())
|
||||
.filter_map(|v| v.as_f64())
|
||||
.collect();
|
||||
match func {
|
||||
AggFunc::Sum => Some(values.iter().sum()),
|
||||
|
||||
@ -88,7 +88,7 @@ pub fn format_md(model: &Model) -> String {
|
||||
}
|
||||
|
||||
// Data — sorted by coordinate string for deterministic diffs
|
||||
let mut cells: Vec<_> = model.data.cells().iter().collect();
|
||||
let mut cells: Vec<_> = model.data.iter_cells().collect();
|
||||
cells.sort_by_key(|(k, _)| coord_str(k));
|
||||
if !cells.is_empty() {
|
||||
writeln!(out, "\n## Data").unwrap();
|
||||
@ -97,7 +97,7 @@ pub fn format_md(model: &Model) -> String {
|
||||
CellValue::Number(_) => value.to_string(),
|
||||
CellValue::Text(s) => format!("\"{}\"", s),
|
||||
};
|
||||
writeln!(out, "{} = {}", coord_str(key), val_str).unwrap();
|
||||
writeln!(out, "{} = {}", coord_str(&key), val_str).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
@ -461,7 +461,7 @@ pub fn export_csv(model: &Model, view_name: &str, path: &Path) -> Result<()> {
|
||||
.map(|ci| {
|
||||
layout
|
||||
.cell_key(ri, ci)
|
||||
.and_then(|key| model.evaluate(&key))
|
||||
.and_then(|key| model.evaluate_aggregated(&key, &layout.none_cats))
|
||||
.map(|v| v.to_string())
|
||||
.unwrap_or_default()
|
||||
})
|
||||
|
||||
Reference in New Issue
Block a user