From 45bfe2c4c76dd714dcbe034fc89ebc90f128c6f1 Mon Sep 17 00:00:00 2001 From: Edward Langley Date: Tue, 9 Jun 2026 21:43:13 -0700 Subject: [PATCH] refactor(core): use epsilon for float comparison and IndexSet for stem collection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use `FLOAT_EQ_EPSILON` for equality/inequality operators and division-by-zero guards in formula evaluation to ensure consistent semantics. Replace `Vec` with `IndexSet` for stem collection in `recompute_formulas` to improve performance from O(n²) to O(n). Add regression tests for epsilon-based comparison and stem collection performance. Co-Authored-By: fiddlerwoaroof/git-smart-commit (unsloth/gemma-4-26B-A4B-it-GGUF:UD-Q5_K_XL) --- crates/improvise-core/src/model/types.rs | 139 +++++++++++++++++++++-- 1 file changed, 127 insertions(+), 12 deletions(-) diff --git a/crates/improvise-core/src/model/types.rs b/crates/improvise-core/src/model/types.rs index cfc7b69..a6a6c61 100644 --- a/crates/improvise-core/src/model/types.rs +++ b/crates/improvise-core/src/model/types.rs @@ -1,7 +1,7 @@ use std::collections::HashMap; use anyhow::{Result, anyhow}; -use indexmap::IndexMap; +use indexmap::{IndexMap, IndexSet}; use serde::{Deserialize, Serialize}; use super::category::{Category, CategoryId}; @@ -10,6 +10,15 @@ use crate::formula::{AggFunc, Formula}; const MAX_CATEGORIES: usize = 12; +/// Tolerance for float comparison in formula evaluation. The `=`/`!=` +/// operators and the division-by-zero guard share this epsilon so the +/// semantics agree: any value that `x = 0` treats as zero is also rejected +/// as a divisor (`div/0`). Otherwise `IF(x = 0, a, y / x)` could take the +/// else branch's division with an x its own condition called zero. +/// 1e-10 is far below typical data magnitudes but absorbs accumulated +/// f64 rounding noise from chained formulas. +const FLOAT_EQ_EPSILON: f64 = 1e-10; + /// Pure-data document model: categories, cells, and formulas. /// /// `Model` intentionally does **not** know about views. The view axes and @@ -342,8 +351,9 @@ impl Model { .map(|f| (f.target_category.clone(), f.target.clone())) .collect(); - // Gather all unique partial keys (stems) for each formula category - let mut stems: Vec = Vec::new(); + // Gather all unique partial keys (stems) for each formula category. + // IndexSet dedupes in O(1) per cell while preserving insertion order. + let mut stems: IndexSet = IndexSet::new(); for (target_cat, _) in &formula_cats { for (key, _) in self.data.iter_cells() { let stem = key.without(target_cat); @@ -353,9 +363,7 @@ impl Model { for nc in none_cats { stripped = stripped.without(nc); } - if !stems.contains(&stripped) { - stems.push(stripped); - } + stems.insert(stripped); } } @@ -470,7 +478,7 @@ impl Model { BinOp::Sub => Ok(lv - rv), BinOp::Mul => Ok(lv * rv), BinOp::Div => { - if rv == 0.0 { + if rv.abs() < FLOAT_EQ_EPSILON { Err("div/0".into()) } else { Ok(lv / rv) @@ -552,8 +560,8 @@ impl Model { let lv = eval_expr_cached(l, context, model, target_category, none_cats)?; let rv = eval_expr_cached(r, context, model, target_category, none_cats)?; match op { - BinOp::Eq => Ok((lv - rv).abs() < 1e-10), - BinOp::Ne => Ok((lv - rv).abs() >= 1e-10), + BinOp::Eq => Ok((lv - rv).abs() < FLOAT_EQ_EPSILON), + BinOp::Ne => Ok((lv - rv).abs() >= FLOAT_EQ_EPSILON), BinOp::Lt => Ok(lv < rv), BinOp::Gt => Ok(lv > rv), BinOp::Le => Ok(lv <= rv), @@ -671,7 +679,7 @@ impl Model { BinOp::Sub => Ok(lv - rv), BinOp::Mul => Ok(lv * rv), BinOp::Div => { - if rv == 0.0 { + if rv.abs() < FLOAT_EQ_EPSILON { Err("div/0".into()) } else { Ok(lv / rv) @@ -746,8 +754,8 @@ impl Model { let lv = eval_expr(l, context, model, target_category, depth)?; let rv = eval_expr(r, context, model, target_category, depth)?; match op { - BinOp::Eq => Ok((lv - rv).abs() < 1e-10), - BinOp::Ne => Ok((lv - rv).abs() >= 1e-10), + BinOp::Eq => Ok((lv - rv).abs() < FLOAT_EQ_EPSILON), + BinOp::Ne => Ok((lv - rv).abs() >= FLOAT_EQ_EPSILON), BinOp::Lt => Ok(lv < rv), BinOp::Gt => Ok(lv > rv), BinOp::Le => Ok(lv <= rv), @@ -1534,6 +1542,113 @@ mod formula_tests { Some(CellValue::Number(2.0)) ); } + + /// Bug (improvise-0bf): `=`/`!=` compare with a 1e-10 epsilon, but the + /// division-by-zero guard checked `rv == 0.0` exactly. With X = 1e-11 + /// the comparison `X = 0` is true, yet `100 / X` happily divided — + /// incoherent semantics for `IF(X = 0, a, y / X)`. Any value the + /// equality operator treats as zero must also trigger div/0. + /// This exercises the direct eval path (`eval_expr` in + /// `eval_formula_depth`). + #[test] + fn near_zero_divisor_consistent_with_equality_epsilon() { + let mut m = Model::new("Test"); + m.add_category("_Measure").unwrap(); + if let Some(cat) = m.category_mut("_Measure") { + cat.add_item("X"); + cat.add_item("IsZero"); + cat.add_item("Ratio"); + } + m.set_cell(coord(&[("_Measure", "X")]), CellValue::Number(1e-11)); + m.add_formula(parse_formula("IsZero = IF(X = 0, 1, 0)", "_Measure").unwrap()); + m.add_formula(parse_formula("Ratio = 100 / X", "_Measure").unwrap()); + // Equality says X is zero… + assert_eq!( + m.evaluate(&coord(&[("_Measure", "IsZero")])), + Some(CellValue::Number(1.0)) + ); + // …so dividing by X must be div/0, not 1e13. + // Bug: returns Number(1e13) — guard uses exact == 0.0. + assert_eq!( + m.evaluate(&coord(&[("_Measure", "Ratio")])), + Some(CellValue::Error("div/0".into())) + ); + } + + /// Same bug (improvise-0bf), fixed-point/cached eval path + /// (`eval_expr_cached` in `eval_formula_with_cache`): its division + /// guard also used exact `== 0.0` while `=` used the epsilon. + #[test] + fn near_zero_divisor_consistent_in_cached_eval_path() { + let mut m = Model::new("Test"); + m.add_category("_Measure").unwrap(); + m.add_category("Region").unwrap(); + if let Some(cat) = m.category_mut("_Measure") { + cat.add_item("X"); + cat.add_item("IsZero"); + cat.add_item("Ratio"); + } + if let Some(cat) = m.category_mut("Region") { + cat.add_item("East"); + } + m.set_cell( + coord(&[("_Measure", "X"), ("Region", "East")]), + CellValue::Number(1e-11), + ); + m.add_formula(parse_formula("IsZero = IF(X = 0, 1, 0)", "_Measure").unwrap()); + m.add_formula(parse_formula("Ratio = 100 / X", "_Measure").unwrap()); + let none_cats = vec!["Region".to_string()]; + m.recompute_formulas(&none_cats); + // Equality says X is zero… + assert_eq!( + m.evaluate_aggregated(&coord(&[("_Measure", "IsZero")]), &none_cats), + Some(CellValue::Number(1.0)) + ); + // …so dividing by X must be div/0 in the cached path too. + assert_eq!( + m.evaluate_aggregated(&coord(&[("_Measure", "Ratio")]), &none_cats), + Some(CellValue::Error("div/0".into())) + ); + } + + /// Regression guard for improvise-6os: stem collection in + /// `recompute_formulas` deduplicated with `Vec::contains` (O(n²) over + /// all data cells per formula category). Replaced with an `IndexSet` — + /// a pure perf change. This pins the observable contract: recomputed + /// values for a multi-stem model (chained formulas, duplicate stems + /// from multiple measures per stem) are unchanged and stable across + /// repeated recomputes. + #[test] + fn recompute_formulas_multi_stem_values_stable() { + let mut m = revenue_cost_model(); + m.add_formula(parse_formula("Profit = Revenue - Cost", "_Measure").unwrap()); + m.add_formula(parse_formula("Margin = Profit / Revenue", "_Measure").unwrap()); + m.recompute_formulas(&[]); + + let cases = [ + (("Profit", "East"), 400.0), + (("Profit", "West"), 300.0), + (("Margin", "East"), 0.4), + (("Margin", "West"), 0.375), + ]; + for ((target, region), expected) in cases { + let key = coord(&[("_Measure", target), ("Region", region)]); + let val = m + .formula_cache + .get(&key) + .and_then(|v| v.as_f64()) + .unwrap_or_else(|| panic!("missing cache entry for {target}/{region}")); + assert!( + approx_eq(val, expected), + "{target}/{region}: expected {expected}, got {val}" + ); + } + + // Recomputing again reproduces the exact same cache (determinism). + let snapshot = m.formula_cache.clone(); + m.recompute_formulas(&[]); + assert_eq!(m.formula_cache, snapshot); + } } #[cfg(test)]