refactor(core): use epsilon for float comparison and IndexSet for stem collection

Use `FLOAT_EQ_EPSILON` for equality/inequality operators and
division-by-zero guards in formula evaluation to ensure consistent
semantics.

Replace `Vec` with `IndexSet` for stem collection in `recompute_formulas`
to improve performance from O(n²) to O(n).

Add regression tests for epsilon-based comparison and stem collection
performance.

Co-Authored-By: fiddlerwoaroof/git-smart-commit (unsloth/gemma-4-26B-A4B-it-GGUF:UD-Q5_K_XL)
This commit is contained in:
Edward Langley
2026-06-09 21:43:13 -07:00
parent 0fe04de53e
commit 45bfe2c4c7
+127 -12
View File
@@ -1,7 +1,7 @@
use std::collections::HashMap;
use anyhow::{Result, anyhow};
use indexmap::IndexMap;
use indexmap::{IndexMap, IndexSet};
use serde::{Deserialize, Serialize};
use super::category::{Category, CategoryId};
@@ -10,6 +10,15 @@ use crate::formula::{AggFunc, Formula};
const MAX_CATEGORIES: usize = 12;
/// Tolerance for float comparison in formula evaluation. The `=`/`!=`
/// operators and the division-by-zero guard share this epsilon so the
/// semantics agree: any value that `x = 0` treats as zero is also rejected
/// as a divisor (`div/0`). Otherwise `IF(x = 0, a, y / x)` could take the
/// else branch's division with an x its own condition called zero.
/// 1e-10 is far below typical data magnitudes but absorbs accumulated
/// f64 rounding noise from chained formulas.
const FLOAT_EQ_EPSILON: f64 = 1e-10;
/// Pure-data document model: categories, cells, and formulas.
///
/// `Model` intentionally does **not** know about views. The view axes and
@@ -342,8 +351,9 @@ impl Model {
.map(|f| (f.target_category.clone(), f.target.clone()))
.collect();
// Gather all unique partial keys (stems) for each formula category
let mut stems: Vec<CellKey> = Vec::new();
// Gather all unique partial keys (stems) for each formula category.
// IndexSet dedupes in O(1) per cell while preserving insertion order.
let mut stems: IndexSet<CellKey> = IndexSet::new();
for (target_cat, _) in &formula_cats {
for (key, _) in self.data.iter_cells() {
let stem = key.without(target_cat);
@@ -353,9 +363,7 @@ impl Model {
for nc in none_cats {
stripped = stripped.without(nc);
}
if !stems.contains(&stripped) {
stems.push(stripped);
}
stems.insert(stripped);
}
}
@@ -470,7 +478,7 @@ impl Model {
BinOp::Sub => Ok(lv - rv),
BinOp::Mul => Ok(lv * rv),
BinOp::Div => {
if rv == 0.0 {
if rv.abs() < FLOAT_EQ_EPSILON {
Err("div/0".into())
} else {
Ok(lv / rv)
@@ -552,8 +560,8 @@ impl Model {
let lv = eval_expr_cached(l, context, model, target_category, none_cats)?;
let rv = eval_expr_cached(r, context, model, target_category, none_cats)?;
match op {
BinOp::Eq => Ok((lv - rv).abs() < 1e-10),
BinOp::Ne => Ok((lv - rv).abs() >= 1e-10),
BinOp::Eq => Ok((lv - rv).abs() < FLOAT_EQ_EPSILON),
BinOp::Ne => Ok((lv - rv).abs() >= FLOAT_EQ_EPSILON),
BinOp::Lt => Ok(lv < rv),
BinOp::Gt => Ok(lv > rv),
BinOp::Le => Ok(lv <= rv),
@@ -671,7 +679,7 @@ impl Model {
BinOp::Sub => Ok(lv - rv),
BinOp::Mul => Ok(lv * rv),
BinOp::Div => {
if rv == 0.0 {
if rv.abs() < FLOAT_EQ_EPSILON {
Err("div/0".into())
} else {
Ok(lv / rv)
@@ -746,8 +754,8 @@ impl Model {
let lv = eval_expr(l, context, model, target_category, depth)?;
let rv = eval_expr(r, context, model, target_category, depth)?;
match op {
BinOp::Eq => Ok((lv - rv).abs() < 1e-10),
BinOp::Ne => Ok((lv - rv).abs() >= 1e-10),
BinOp::Eq => Ok((lv - rv).abs() < FLOAT_EQ_EPSILON),
BinOp::Ne => Ok((lv - rv).abs() >= FLOAT_EQ_EPSILON),
BinOp::Lt => Ok(lv < rv),
BinOp::Gt => Ok(lv > rv),
BinOp::Le => Ok(lv <= rv),
@@ -1534,6 +1542,113 @@ mod formula_tests {
Some(CellValue::Number(2.0))
);
}
/// Bug (improvise-0bf): `=`/`!=` compare with a 1e-10 epsilon, but the
/// division-by-zero guard checked `rv == 0.0` exactly. With X = 1e-11
/// the comparison `X = 0` is true, yet `100 / X` happily divided —
/// incoherent semantics for `IF(X = 0, a, y / X)`. Any value the
/// equality operator treats as zero must also trigger div/0.
/// This exercises the direct eval path (`eval_expr` in
/// `eval_formula_depth`).
#[test]
fn near_zero_divisor_consistent_with_equality_epsilon() {
let mut m = Model::new("Test");
m.add_category("_Measure").unwrap();
if let Some(cat) = m.category_mut("_Measure") {
cat.add_item("X");
cat.add_item("IsZero");
cat.add_item("Ratio");
}
m.set_cell(coord(&[("_Measure", "X")]), CellValue::Number(1e-11));
m.add_formula(parse_formula("IsZero = IF(X = 0, 1, 0)", "_Measure").unwrap());
m.add_formula(parse_formula("Ratio = 100 / X", "_Measure").unwrap());
// Equality says X is zero…
assert_eq!(
m.evaluate(&coord(&[("_Measure", "IsZero")])),
Some(CellValue::Number(1.0))
);
// …so dividing by X must be div/0, not 1e13.
// Bug: returns Number(1e13) — guard uses exact == 0.0.
assert_eq!(
m.evaluate(&coord(&[("_Measure", "Ratio")])),
Some(CellValue::Error("div/0".into()))
);
}
/// Same bug (improvise-0bf), fixed-point/cached eval path
/// (`eval_expr_cached` in `eval_formula_with_cache`): its division
/// guard also used exact `== 0.0` while `=` used the epsilon.
#[test]
fn near_zero_divisor_consistent_in_cached_eval_path() {
let mut m = Model::new("Test");
m.add_category("_Measure").unwrap();
m.add_category("Region").unwrap();
if let Some(cat) = m.category_mut("_Measure") {
cat.add_item("X");
cat.add_item("IsZero");
cat.add_item("Ratio");
}
if let Some(cat) = m.category_mut("Region") {
cat.add_item("East");
}
m.set_cell(
coord(&[("_Measure", "X"), ("Region", "East")]),
CellValue::Number(1e-11),
);
m.add_formula(parse_formula("IsZero = IF(X = 0, 1, 0)", "_Measure").unwrap());
m.add_formula(parse_formula("Ratio = 100 / X", "_Measure").unwrap());
let none_cats = vec!["Region".to_string()];
m.recompute_formulas(&none_cats);
// Equality says X is zero…
assert_eq!(
m.evaluate_aggregated(&coord(&[("_Measure", "IsZero")]), &none_cats),
Some(CellValue::Number(1.0))
);
// …so dividing by X must be div/0 in the cached path too.
assert_eq!(
m.evaluate_aggregated(&coord(&[("_Measure", "Ratio")]), &none_cats),
Some(CellValue::Error("div/0".into()))
);
}
/// Regression guard for improvise-6os: stem collection in
/// `recompute_formulas` deduplicated with `Vec::contains` (O(n²) over
/// all data cells per formula category). Replaced with an `IndexSet` —
/// a pure perf change. This pins the observable contract: recomputed
/// values for a multi-stem model (chained formulas, duplicate stems
/// from multiple measures per stem) are unchanged and stable across
/// repeated recomputes.
#[test]
fn recompute_formulas_multi_stem_values_stable() {
let mut m = revenue_cost_model();
m.add_formula(parse_formula("Profit = Revenue - Cost", "_Measure").unwrap());
m.add_formula(parse_formula("Margin = Profit / Revenue", "_Measure").unwrap());
m.recompute_formulas(&[]);
let cases = [
(("Profit", "East"), 400.0),
(("Profit", "West"), 300.0),
(("Margin", "East"), 0.4),
(("Margin", "West"), 0.375),
];
for ((target, region), expected) in cases {
let key = coord(&[("_Measure", target), ("Region", region)]);
let val = m
.formula_cache
.get(&key)
.and_then(|v| v.as_f64())
.unwrap_or_else(|| panic!("missing cache entry for {target}/{region}"));
assert!(
approx_eq(val, expected),
"{target}/{region}: expected {expected}, got {val}"
);
}
// Recomputing again reproduces the exact same cache (determinism).
let snapshot = m.formula_cache.clone();
m.recompute_formulas(&[]);
assert_eq!(m.formula_cache, snapshot);
}
}
#[cfg(test)]