refactor(core): use epsilon for float comparison and IndexSet for stem collection

Use `FLOAT_EQ_EPSILON` for equality/inequality operators and division-by-zero guards in formula evaluation to ensure consistent semantics. Replace `Vec` with `IndexSet` for stem collection in `recompute_formulas` to improve performance from O(n²) to O(n). Add regression tests for epsilon-based comparison and stem collection performance. Co-Authored-By: fiddlerwoaroof/git-smart-commit (unsloth/gemma-4-26B-A4B-it-GGUF:UD-Q5_K_XL)
2026-06-09 21:43:13 -07:00
parent 0fe04de53e
commit 45bfe2c4c7
1 changed files with 127 additions and 12 deletions
@@ -1,7 +1,7 @@
 use std::collections::HashMap;
 use anyhow::{Result, anyhow};
-use indexmap::IndexMap;
+use indexmap::{IndexMap, IndexSet};
 use serde::{Deserialize, Serialize};
 use super::category::{Category, CategoryId};
@@ -10,6 +10,15 @@ use crate::formula::{AggFunc, Formula};
 const MAX_CATEGORIES: usize = 12;
 /// Tolerance for float comparison in formula evaluation. The `=`/`!=`
 /// operators and the division-by-zero guard share this epsilon so the
 /// semantics agree: any value that `x = 0` treats as zero is also rejected
 /// as a divisor (`div/0`). Otherwise `IF(x = 0, a, y / x)` could take the
 /// else branch's division with an x its own condition called zero.
 /// 1e-10 is far below typical data magnitudes but absorbs accumulated
 /// f64 rounding noise from chained formulas.
 const FLOAT_EQ_EPSILON: f64 = 1e-10;
 /// Pure-data document model: categories, cells, and formulas.
 ///
 /// `Model` intentionally does **not** know about views. The view axes and
@@ -342,8 +351,9 @@ impl Model {
            .map(|f| (f.target_category.clone(), f.target.clone()))
            .collect();
-        // Gather all unique partial keys (stems) for each formula category
+        // Gather all unique partial keys (stems) for each formula category.
-        let mut stems: Vec<CellKey> = Vec::new();
+        // IndexSet dedupes in O(1) per cell while preserving insertion order.
        let mut stems: IndexSet<CellKey> = IndexSet::new();
        for (target_cat, _) in &formula_cats {
            for (key, _) in self.data.iter_cells() {
                let stem = key.without(target_cat);
@@ -353,9 +363,7 @@ impl Model {
                for nc in none_cats {
                    stripped = stripped.without(nc);
                }
-                if !stems.contains(&stripped) {
+                stems.insert(stripped);
                    stems.push(stripped);
                }
            }
        }
@@ -470,7 +478,7 @@ impl Model {
                        BinOp::Sub => Ok(lv - rv),
                        BinOp::Mul => Ok(lv * rv),
                        BinOp::Div => {
-                            if rv == 0.0 {
+                            if rv.abs() < FLOAT_EQ_EPSILON {
                                Err("div/0".into())
                            } else {
                                Ok(lv / rv)
@@ -552,8 +560,8 @@ impl Model {
                    let lv = eval_expr_cached(l, context, model, target_category, none_cats)?;
                    let rv = eval_expr_cached(r, context, model, target_category, none_cats)?;
                    match op {
-                        BinOp::Eq => Ok((lv - rv).abs() < 1e-10),
+                        BinOp::Eq => Ok((lv - rv).abs() < FLOAT_EQ_EPSILON),
-                        BinOp::Ne => Ok((lv - rv).abs() >= 1e-10),
+                        BinOp::Ne => Ok((lv - rv).abs() >= FLOAT_EQ_EPSILON),
                        BinOp::Lt => Ok(lv < rv),
                        BinOp::Gt => Ok(lv > rv),
                        BinOp::Le => Ok(lv <= rv),
@@ -671,7 +679,7 @@ impl Model {
                        BinOp::Sub => Ok(lv - rv),
                        BinOp::Mul => Ok(lv * rv),
                        BinOp::Div => {
-                            if rv == 0.0 {
+                            if rv.abs() < FLOAT_EQ_EPSILON {
                                Err("div/0".into())
                            } else {
                                Ok(lv / rv)
@@ -746,8 +754,8 @@ impl Model {
                    let lv = eval_expr(l, context, model, target_category, depth)?;
                    let rv = eval_expr(r, context, model, target_category, depth)?;
                    match op {
-                        BinOp::Eq => Ok((lv - rv).abs() < 1e-10),
+                        BinOp::Eq => Ok((lv - rv).abs() < FLOAT_EQ_EPSILON),
-                        BinOp::Ne => Ok((lv - rv).abs() >= 1e-10),
+                        BinOp::Ne => Ok((lv - rv).abs() >= FLOAT_EQ_EPSILON),
                        BinOp::Lt => Ok(lv < rv),
                        BinOp::Gt => Ok(lv > rv),
                        BinOp::Le => Ok(lv <= rv),
@@ -1534,6 +1542,113 @@ mod formula_tests {
            Some(CellValue::Number(2.0))
        );
    }
    /// Bug (improvise-0bf): `=`/`!=` compare with a 1e-10 epsilon, but the
    /// division-by-zero guard checked `rv == 0.0` exactly. With X = 1e-11
    /// the comparison `X = 0` is true, yet `100 / X` happily divided —
    /// incoherent semantics for `IF(X = 0, a, y / X)`. Any value the
    /// equality operator treats as zero must also trigger div/0.
    /// This exercises the direct eval path (`eval_expr` in
    /// `eval_formula_depth`).
    #[test]
    fn near_zero_divisor_consistent_with_equality_epsilon() {
        let mut m = Model::new("Test");
        m.add_category("_Measure").unwrap();
        if let Some(cat) = m.category_mut("_Measure") {
            cat.add_item("X");
            cat.add_item("IsZero");
            cat.add_item("Ratio");
        }
        m.set_cell(coord(&[("_Measure", "X")]), CellValue::Number(1e-11));
        m.add_formula(parse_formula("IsZero = IF(X = 0, 1, 0)", "_Measure").unwrap());
        m.add_formula(parse_formula("Ratio = 100 / X", "_Measure").unwrap());
        // Equality says X is zero…
        assert_eq!(
            m.evaluate(&coord(&[("_Measure", "IsZero")])),
            Some(CellValue::Number(1.0))
        );
        // …so dividing by X must be div/0, not 1e13.
        // Bug: returns Number(1e13) — guard uses exact == 0.0.
        assert_eq!(
            m.evaluate(&coord(&[("_Measure", "Ratio")])),
            Some(CellValue::Error("div/0".into()))
        );
    }
    /// Same bug (improvise-0bf), fixed-point/cached eval path
    /// (`eval_expr_cached` in `eval_formula_with_cache`): its division
    /// guard also used exact `== 0.0` while `=` used the epsilon.
    #[test]
    fn near_zero_divisor_consistent_in_cached_eval_path() {
        let mut m = Model::new("Test");
        m.add_category("_Measure").unwrap();
        m.add_category("Region").unwrap();
        if let Some(cat) = m.category_mut("_Measure") {
            cat.add_item("X");
            cat.add_item("IsZero");
            cat.add_item("Ratio");
        }
        if let Some(cat) = m.category_mut("Region") {
            cat.add_item("East");
        }
        m.set_cell(
            coord(&[("_Measure", "X"), ("Region", "East")]),
            CellValue::Number(1e-11),
        );
        m.add_formula(parse_formula("IsZero = IF(X = 0, 1, 0)", "_Measure").unwrap());
        m.add_formula(parse_formula("Ratio = 100 / X", "_Measure").unwrap());
        let none_cats = vec!["Region".to_string()];
        m.recompute_formulas(&none_cats);
        // Equality says X is zero…
        assert_eq!(
            m.evaluate_aggregated(&coord(&[("_Measure", "IsZero")]), &none_cats),
            Some(CellValue::Number(1.0))
        );
        // …so dividing by X must be div/0 in the cached path too.
        assert_eq!(
            m.evaluate_aggregated(&coord(&[("_Measure", "Ratio")]), &none_cats),
            Some(CellValue::Error("div/0".into()))
        );
    }
    /// Regression guard for improvise-6os: stem collection in
    /// `recompute_formulas` deduplicated with `Vec::contains` (O(n²) over
    /// all data cells per formula category). Replaced with an `IndexSet` —
    /// a pure perf change. This pins the observable contract: recomputed
    /// values for a multi-stem model (chained formulas, duplicate stems
    /// from multiple measures per stem) are unchanged and stable across
    /// repeated recomputes.
    #[test]
    fn recompute_formulas_multi_stem_values_stable() {
        let mut m = revenue_cost_model();
        m.add_formula(parse_formula("Profit = Revenue - Cost", "_Measure").unwrap());
        m.add_formula(parse_formula("Margin = Profit / Revenue", "_Measure").unwrap());
        m.recompute_formulas(&[]);
        let cases = [
            (("Profit", "East"), 400.0),
            (("Profit", "West"), 300.0),
            (("Margin", "East"), 0.4),
            (("Margin", "West"), 0.375),
        ];
        for ((target, region), expected) in cases {
            let key = coord(&[("_Measure", target), ("Region", region)]);
            let val = m
                .formula_cache
                .get(&key)
                .and_then(|v| v.as_f64())
                .unwrap_or_else(|| panic!("missing cache entry for {target}/{region}"));
            assert!(
                approx_eq(val, expected),
                "{target}/{region}: expected {expected}, got {val}"
            );
        }
        // Recomputing again reproduces the exact same cache (determinism).
        let snapshot = m.formula_cache.clone();
        m.recompute_formulas(&[]);
        assert_eq!(m.formula_cache, snapshot);
    }
 }
 #[cfg(test)]