refactor(core): use epsilon for float comparison and IndexSet for stem collection

Use `FLOAT_EQ_EPSILON` for the `=`/`!=` operators and the division-by-zero guards in formula evaluation so both agree on which values count as zero. Replace `Vec` with `IndexSet` for stem collection in `recompute_formulas`, turning the O(n²) `Vec::contains` dedup into O(n) while preserving insertion order. Add regression tests for the epsilon-consistent div/0 guard (direct and cached eval paths) and for stable recomputed values after the stem-collection change. Co-Authored-By: fiddlerwoaroof/git-smart-commit (unsloth/gemma-4-26B-A4B-it-GGUF:UD-Q5_K_XL)
2026-06-09 21:43:13 -07:00
parent 0fe04de53e
commit 45bfe2c4c7
1 changed files with 127 additions and 12 deletions
@@ -1,7 +1,7 @@
 use std::collections::HashMap;

 use anyhow::{Result, anyhow};
-use indexmap::IndexMap;
+use indexmap::{IndexMap, IndexSet};
 use serde::{Deserialize, Serialize};

 use super::category::{Category, CategoryId};
@@ -10,6 +10,15 @@ use crate::formula::{AggFunc, Formula};

 const MAX_CATEGORIES: usize = 12;

+/// Tolerance for float comparison in formula evaluation. The `=`/`!=`
+/// operators and the division-by-zero guard share this epsilon so the
+/// semantics agree: any value that `x = 0` treats as zero is also rejected
+/// as a divisor (`div/0`). Otherwise a guard such as `IF(x = 0, a, y / x)`
+/// and a bare `y / x` would disagree on whether a tiny x counts as zero.
+/// 1e-10 is far below typical data magnitudes but absorbs accumulated
+/// f64 rounding noise from chained formulas.
+const FLOAT_EQ_EPSILON: f64 = 1e-10;
+
 /// Pure-data document model: categories, cells, and formulas.
 ///
 /// `Model` intentionally does **not** know about views. The view axes and
@@ -342,8 +351,9 @@ impl Model {
            .map(|f| (f.target_category.clone(), f.target.clone()))
            .collect();

-        // Gather all unique partial keys (stems) for each formula category
-        let mut stems: Vec<CellKey> = Vec::new();
+        // Gather all unique partial keys (stems) for each formula category.
+        // IndexSet dedupes in expected O(1) per cell while preserving insertion order.
+        let mut stems: IndexSet<CellKey> = IndexSet::new();
        for (target_cat, _) in &formula_cats {
            for (key, _) in self.data.iter_cells() {
                let stem = key.without(target_cat);
@@ -353,9 +363,7 @@ impl Model {
                for nc in none_cats {
                    stripped = stripped.without(nc);
                }
-                if !stems.contains(&stripped) {
-                    stems.push(stripped);
-                }
+                stems.insert(stripped);
            }
        }

@@ -470,7 +478,7 @@ impl Model {
                        BinOp::Sub => Ok(lv - rv),
                        BinOp::Mul => Ok(lv * rv),
                        BinOp::Div => {
-                            if rv == 0.0 {
+                            if rv.abs() < FLOAT_EQ_EPSILON {
                                Err("div/0".into())
                            } else {
                                Ok(lv / rv)
@@ -552,8 +560,8 @@ impl Model {
                    let lv = eval_expr_cached(l, context, model, target_category, none_cats)?;
                    let rv = eval_expr_cached(r, context, model, target_category, none_cats)?;
                    match op {
-                        BinOp::Eq => Ok((lv - rv).abs() < 1e-10),
-                        BinOp::Ne => Ok((lv - rv).abs() >= 1e-10),
+                        BinOp::Eq => Ok((lv - rv).abs() < FLOAT_EQ_EPSILON),
+                        BinOp::Ne => Ok((lv - rv).abs() >= FLOAT_EQ_EPSILON),
                        BinOp::Lt => Ok(lv < rv),
                        BinOp::Gt => Ok(lv > rv),
                        BinOp::Le => Ok(lv <= rv),
@@ -671,7 +679,7 @@ impl Model {
                        BinOp::Sub => Ok(lv - rv),
                        BinOp::Mul => Ok(lv * rv),
                        BinOp::Div => {
-                            if rv == 0.0 {
+                            if rv.abs() < FLOAT_EQ_EPSILON {
                                Err("div/0".into())
                            } else {
                                Ok(lv / rv)
@@ -746,8 +754,8 @@ impl Model {
                    let lv = eval_expr(l, context, model, target_category, depth)?;
                    let rv = eval_expr(r, context, model, target_category, depth)?;
                    match op {
-                        BinOp::Eq => Ok((lv - rv).abs() < 1e-10),
-                        BinOp::Ne => Ok((lv - rv).abs() >= 1e-10),
+                        BinOp::Eq => Ok((lv - rv).abs() < FLOAT_EQ_EPSILON),
+                        BinOp::Ne => Ok((lv - rv).abs() >= FLOAT_EQ_EPSILON),
                        BinOp::Lt => Ok(lv < rv),
                        BinOp::Gt => Ok(lv > rv),
                        BinOp::Le => Ok(lv <= rv),
@@ -1534,6 +1542,113 @@ mod formula_tests {
            Some(CellValue::Number(2.0))
        );
    }
+
+    /// Bug (improvise-0bf): `=`/`!=` compare with a 1e-10 epsilon, but the
+    /// division-by-zero guard checked `rv == 0.0` exactly. With X = 1e-11
+    /// the comparison `X = 0` is true, yet `100 / X` happily divided —
+    /// incoherent semantics for `IF(X = 0, a, y / X)`. Any value the
+    /// equality operator treats as zero must also trigger div/0.
+    /// This exercises the direct eval path (`eval_expr` in
+    /// `eval_formula_depth`).
+    #[test]
+    fn near_zero_divisor_consistent_with_equality_epsilon() {
+        let mut m = Model::new("Test");
+        m.add_category("_Measure").unwrap();
+        if let Some(cat) = m.category_mut("_Measure") {
+            cat.add_item("X");
+            cat.add_item("IsZero");
+            cat.add_item("Ratio");
+        }
+        m.set_cell(coord(&[("_Measure", "X")]), CellValue::Number(1e-11));
+        m.add_formula(parse_formula("IsZero = IF(X = 0, 1, 0)", "_Measure").unwrap());
+        m.add_formula(parse_formula("Ratio = 100 / X", "_Measure").unwrap());
+        // Equality says X is zero…
+        assert_eq!(
+            m.evaluate(&coord(&[("_Measure", "IsZero")])),
+            Some(CellValue::Number(1.0))
+        );
+        // …so dividing by X must be div/0, not 1e13.
+        // Before the fix this returned Number(1e13): the guard used exact `== 0.0`.
+        assert_eq!(
+            m.evaluate(&coord(&[("_Measure", "Ratio")])),
+            Some(CellValue::Error("div/0".into()))
+        );
+    }
+
+    /// Same bug (improvise-0bf), fixed-point/cached eval path
+    /// (`eval_expr_cached` in `eval_formula_with_cache`): its division
+    /// guard also used exact `== 0.0` while `=` used the epsilon.
+    #[test]
+    fn near_zero_divisor_consistent_in_cached_eval_path() {
+        let mut m = Model::new("Test");
+        m.add_category("_Measure").unwrap();
+        m.add_category("Region").unwrap();
+        if let Some(cat) = m.category_mut("_Measure") {
+            cat.add_item("X");
+            cat.add_item("IsZero");
+            cat.add_item("Ratio");
+        }
+        if let Some(cat) = m.category_mut("Region") {
+            cat.add_item("East");
+        }
+        m.set_cell(
+            coord(&[("_Measure", "X"), ("Region", "East")]),
+            CellValue::Number(1e-11),
+        );
+        m.add_formula(parse_formula("IsZero = IF(X = 0, 1, 0)", "_Measure").unwrap());
+        m.add_formula(parse_formula("Ratio = 100 / X", "_Measure").unwrap());
+        let none_cats = vec!["Region".to_string()];
+        m.recompute_formulas(&none_cats);
+        // Equality says X is zero…
+        assert_eq!(
+            m.evaluate_aggregated(&coord(&[("_Measure", "IsZero")]), &none_cats),
+            Some(CellValue::Number(1.0))
+        );
+        // …so dividing by X must be div/0 in the cached path too.
+        assert_eq!(
+            m.evaluate_aggregated(&coord(&[("_Measure", "Ratio")]), &none_cats),
+            Some(CellValue::Error("div/0".into()))
+        );
+    }
+
+    /// Regression guard for improvise-6os: stem collection in
+    /// `recompute_formulas` deduplicated with `Vec::contains` (O(n²) over
+    /// all data cells per formula category). Replaced with an `IndexSet` —
+    /// a pure perf change. This pins the observable contract: recomputed
+    /// values for a multi-stem model (chained formulas, duplicate stems
+    /// from multiple measures per stem) are unchanged and stable across
+    /// repeated recomputes.
+    #[test]
+    fn recompute_formulas_multi_stem_values_stable() {
+        let mut m = revenue_cost_model();
+        m.add_formula(parse_formula("Profit = Revenue - Cost", "_Measure").unwrap());
+        m.add_formula(parse_formula("Margin = Profit / Revenue", "_Measure").unwrap());
+        m.recompute_formulas(&[]);
+
+        let cases = [
+            (("Profit", "East"), 400.0),
+            (("Profit", "West"), 300.0),
+            (("Margin", "East"), 0.4),
+            (("Margin", "West"), 0.375),
+        ];
+        for ((target, region), expected) in cases {
+            let key = coord(&[("_Measure", target), ("Region", region)]);
+            let val = m
+                .formula_cache
+                .get(&key)
+                .and_then(|v| v.as_f64())
+                .unwrap_or_else(|| panic!("missing cache entry for {target}/{region}"));
+            assert!(
+                approx_eq(val, expected),
+                "{target}/{region}: expected {expected}, got {val}"
+            );
+        }
+
+        // Recomputing again reproduces the exact same cache (determinism).
+        let snapshot = m.formula_cache.clone();
+        m.recompute_formulas(&[]);
+        assert_eq!(m.formula_cache, snapshot);
+    }
 }

 #[cfg(test)]