From 45bfe2c4c76dd714dcbe034fc89ebc90f128c6f1 Mon Sep 17 00:00:00 2001
From: Edward Langley <el-github@elangley.org>
Date: Tue, 9 Jun 2026 21:43:13 -0700
Subject: [PATCH] refactor(core): use epsilon for float comparison and IndexSet
 for stem collection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use `FLOAT_EQ_EPSILON` for equality/inequality operators and
division-by-zero guards in formula evaluation to ensure consistent
semantics.

Replace `Vec` with `IndexSet` for stem collection in `recompute_formulas`
to improve performance from O(n²) to O(n).

Add regression tests for epsilon-based comparison and stem collection
performance.

Co-Authored-By: fiddlerwoaroof/git-smart-commit (unsloth/gemma-4-26B-A4B-it-GGUF:UD-Q5_K_XL)
---
 crates/improvise-core/src/model/types.rs | 139 +++++++++++++++++++++--
 1 file changed, 127 insertions(+), 12 deletions(-)
diff --git a/crates/improvise-core/src/model/types.rs b/crates/improvise-core/src/model/types.rs
index cfc7b69..a6a6c61 100644
--- a/crates/improvise-core/src/model/types.rs
+++ b/crates/improvise-core/src/model/types.rs
@@ -1,7 +1,7 @@
 use std::collections::HashMap;
 
 use anyhow::{Result, anyhow};
-use indexmap::IndexMap;
+use indexmap::{IndexMap, IndexSet};
 use serde::{Deserialize, Serialize};
 
 use super::category::{Category, CategoryId};
@@ -10,6 +10,15 @@ use crate::formula::{AggFunc, Formula};
 
 const MAX_CATEGORIES: usize = 12;
 
+/// Tolerance for float comparison in formula evaluation. The `=`/`!=`
+/// operators and the division-by-zero guard share this epsilon so the
+/// semantics agree: any value that `x = 0` treats as zero is also rejected
+/// as a divisor (`div/0`). Otherwise `IF(x = 0, a, y / x)` could take the
+/// else branch's division with an x its own condition called zero.
+/// 1e-10 is far below typical data magnitudes but absorbs accumulated
+/// f64 rounding noise from chained formulas.
+const FLOAT_EQ_EPSILON: f64 = 1e-10;
+
 /// Pure-data document model: categories, cells, and formulas.
 ///
 /// `Model` intentionally does **not** know about views. The view axes and
@@ -342,8 +351,9 @@ impl Model {
             .map(|f| (f.target_category.clone(), f.target.clone()))
             .collect();
 
-        // Gather all unique partial keys (stems) for each formula category
-        let mut stems: Vec<CellKey> = Vec::new();
+        // Gather all unique partial keys (stems) for each formula category.
+        // IndexSet dedupes in O(1) per cell while preserving insertion order.
+        let mut stems: IndexSet<CellKey> = IndexSet::new();
         for (target_cat, _) in &formula_cats {
             for (key, _) in self.data.iter_cells() {
                 let stem = key.without(target_cat);
@@ -353,9 +363,7 @@ impl Model {
                 for nc in none_cats {
                     stripped = stripped.without(nc);
                 }
-                if !stems.contains(&stripped) {
-                    stems.push(stripped);
-                }
+                stems.insert(stripped);
             }
         }
 
@@ -470,7 +478,7 @@ impl Model {
                         BinOp::Sub => Ok(lv - rv),
                         BinOp::Mul => Ok(lv * rv),
                         BinOp::Div => {
-                            if rv == 0.0 {
+                            if rv.abs() < FLOAT_EQ_EPSILON {
                                 Err("div/0".into())
                             } else {
                                 Ok(lv / rv)
@@ -552,8 +560,8 @@ impl Model {
                     let lv = eval_expr_cached(l, context, model, target_category, none_cats)?;
                     let rv = eval_expr_cached(r, context, model, target_category, none_cats)?;
                     match op {
-                        BinOp::Eq => Ok((lv - rv).abs() < 1e-10),
-                        BinOp::Ne => Ok((lv - rv).abs() >= 1e-10),
+                        BinOp::Eq => Ok((lv - rv).abs() < FLOAT_EQ_EPSILON),
+                        BinOp::Ne => Ok((lv - rv).abs() >= FLOAT_EQ_EPSILON),
                         BinOp::Lt => Ok(lv < rv),
                         BinOp::Gt => Ok(lv > rv),
                         BinOp::Le => Ok(lv <= rv),
@@ -671,7 +679,7 @@ impl Model {
                         BinOp::Sub => Ok(lv - rv),
                         BinOp::Mul => Ok(lv * rv),
                         BinOp::Div => {
-                            if rv == 0.0 {
+                            if rv.abs() < FLOAT_EQ_EPSILON {
                                 Err("div/0".into())
                             } else {
                                 Ok(lv / rv)
@@ -746,8 +754,8 @@ impl Model {
                     let lv = eval_expr(l, context, model, target_category, depth)?;
                     let rv = eval_expr(r, context, model, target_category, depth)?;
                     match op {
-                        BinOp::Eq => Ok((lv - rv).abs() < 1e-10),
-                        BinOp::Ne => Ok((lv - rv).abs() >= 1e-10),
+                        BinOp::Eq => Ok((lv - rv).abs() < FLOAT_EQ_EPSILON),
+                        BinOp::Ne => Ok((lv - rv).abs() >= FLOAT_EQ_EPSILON),
                         BinOp::Lt => Ok(lv < rv),
                         BinOp::Gt => Ok(lv > rv),
                         BinOp::Le => Ok(lv <= rv),
@@ -1534,6 +1542,113 @@ mod formula_tests {
             Some(CellValue::Number(2.0))
         );
     }
+
+    /// Bug (improvise-0bf): `=`/`!=` compare with a 1e-10 epsilon, but the
+    /// division-by-zero guard checked `rv == 0.0` exactly. With X = 1e-11
+    /// the comparison `X = 0` is true, yet `100 / X` happily divided —
+    /// incoherent semantics for `IF(X = 0, a, y / X)`. Any value the
+    /// equality operator treats as zero must also trigger div/0.
+    /// This exercises the direct eval path (`eval_expr` in
+    /// `eval_formula_depth`).
+    #[test]
+    fn near_zero_divisor_consistent_with_equality_epsilon() {
+        let mut m = Model::new("Test");
+        m.add_category("_Measure").unwrap();
+        if let Some(cat) = m.category_mut("_Measure") {
+            cat.add_item("X");
+            cat.add_item("IsZero");
+            cat.add_item("Ratio");
+        }
+        m.set_cell(coord(&[("_Measure", "X")]), CellValue::Number(1e-11));
+        m.add_formula(parse_formula("IsZero = IF(X = 0, 1, 0)", "_Measure").unwrap());
+        m.add_formula(parse_formula("Ratio = 100 / X", "_Measure").unwrap());
+        // Equality says X is zero…
+        assert_eq!(
+            m.evaluate(&coord(&[("_Measure", "IsZero")])),
+            Some(CellValue::Number(1.0))
+        );
+        // …so dividing by X must be div/0, not 1e13.
+        // Bug: returns Number(1e13) — guard uses exact == 0.0.
+        assert_eq!(
+            m.evaluate(&coord(&[("_Measure", "Ratio")])),
+            Some(CellValue::Error("div/0".into()))
+        );
+    }
+
+    /// Same bug (improvise-0bf), fixed-point/cached eval path
+    /// (`eval_expr_cached` in `eval_formula_with_cache`): its division
+    /// guard also used exact `== 0.0` while `=` used the epsilon.
+    #[test]
+    fn near_zero_divisor_consistent_in_cached_eval_path() {
+        let mut m = Model::new("Test");
+        m.add_category("_Measure").unwrap();
+        m.add_category("Region").unwrap();
+        if let Some(cat) = m.category_mut("_Measure") {
+            cat.add_item("X");
+            cat.add_item("IsZero");
+            cat.add_item("Ratio");
+        }
+        if let Some(cat) = m.category_mut("Region") {
+            cat.add_item("East");
+        }
+        m.set_cell(
+            coord(&[("_Measure", "X"), ("Region", "East")]),
+            CellValue::Number(1e-11),
+        );
+        m.add_formula(parse_formula("IsZero = IF(X = 0, 1, 0)", "_Measure").unwrap());
+        m.add_formula(parse_formula("Ratio = 100 / X", "_Measure").unwrap());
+        let none_cats = vec!["Region".to_string()];
+        m.recompute_formulas(&none_cats);
+        // Equality says X is zero…
+        assert_eq!(
+            m.evaluate_aggregated(&coord(&[("_Measure", "IsZero")]), &none_cats),
+            Some(CellValue::Number(1.0))
+        );
+        // …so dividing by X must be div/0 in the cached path too.
+        assert_eq!(
+            m.evaluate_aggregated(&coord(&[("_Measure", "Ratio")]), &none_cats),
+            Some(CellValue::Error("div/0".into()))
+        );
+    }
+
+    /// Regression guard for improvise-6os: stem collection in
+    /// `recompute_formulas` deduplicated with `Vec::contains` (O(n²) over
+    /// all data cells per formula category). Replaced with an `IndexSet` —
+    /// a pure perf change. This pins the observable contract: recomputed
+    /// values for a multi-stem model (chained formulas, duplicate stems
+    /// from multiple measures per stem) are unchanged and stable across
+    /// repeated recomputes.
+    #[test]
+    fn recompute_formulas_multi_stem_values_stable() {
+        let mut m = revenue_cost_model();
+        m.add_formula(parse_formula("Profit = Revenue - Cost", "_Measure").unwrap());
+        m.add_formula(parse_formula("Margin = Profit / Revenue", "_Measure").unwrap());
+        m.recompute_formulas(&[]);
+
+        let cases = [
+            (("Profit", "East"), 400.0),
+            (("Profit", "West"), 300.0),
+            (("Margin", "East"), 0.4),
+            (("Margin", "West"), 0.375),
+        ];
+        for ((target, region), expected) in cases {
+            let key = coord(&[("_Measure", target), ("Region", region)]);
+            let val = m
+                .formula_cache
+                .get(&key)
+                .and_then(|v| v.as_f64())
+                .unwrap_or_else(|| panic!("missing cache entry for {target}/{region}"));
+            assert!(
+                approx_eq(val, expected),
+                "{target}/{region}: expected {expected}, got {val}"
+            );
+        }
+
+        // Recomputing again reproduces the exact same cache (determinism).
+        let snapshot = m.formula_cache.clone();
+        m.recompute_formulas(&[]);
+        assert_eq!(m.formula_cache, snapshot);
+    }
 }
 
 #[cfg(test)]