refactor(core): use epsilon for float comparison and IndexSet for stem collection
Use `FLOAT_EQ_EPSILON` for equality/inequality operators and division-by-zero guards in formula evaluation to ensure consistent semantics. Replace `Vec` with `IndexSet` for stem collection in `recompute_formulas` to improve performance from O(n²) to O(n). Add regression tests for epsilon-based comparison and stem collection performance. Co-Authored-By: fiddlerwoaroof/git-smart-commit (unsloth/gemma-4-26B-A4B-it-GGUF:UD-Q5_K_XL)
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use anyhow::{Result, anyhow};
|
||||
use indexmap::IndexMap;
|
||||
use indexmap::{IndexMap, IndexSet};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::category::{Category, CategoryId};
|
||||
@@ -10,6 +10,15 @@ use crate::formula::{AggFunc, Formula};
|
||||
|
||||
const MAX_CATEGORIES: usize = 12;
|
||||
|
||||
/// Tolerance for float comparison in formula evaluation. The `=`/`!=`
|
||||
/// operators and the division-by-zero guard share this epsilon so the
|
||||
/// semantics agree: any value that `x = 0` treats as zero is also rejected
|
||||
/// as a divisor (`div/0`). Otherwise `IF(x = 0, a, y / x)` could take the
|
||||
/// else branch's division with an x its own condition called zero.
|
||||
/// 1e-10 is far below typical data magnitudes but absorbs accumulated
|
||||
/// f64 rounding noise from chained formulas.
|
||||
const FLOAT_EQ_EPSILON: f64 = 1e-10;
|
||||
|
||||
/// Pure-data document model: categories, cells, and formulas.
|
||||
///
|
||||
/// `Model` intentionally does **not** know about views. The view axes and
|
||||
@@ -342,8 +351,9 @@ impl Model {
|
||||
.map(|f| (f.target_category.clone(), f.target.clone()))
|
||||
.collect();
|
||||
|
||||
// Gather all unique partial keys (stems) for each formula category
|
||||
let mut stems: Vec<CellKey> = Vec::new();
|
||||
// Gather all unique partial keys (stems) for each formula category.
|
||||
// IndexSet dedupes in O(1) per cell while preserving insertion order.
|
||||
let mut stems: IndexSet<CellKey> = IndexSet::new();
|
||||
for (target_cat, _) in &formula_cats {
|
||||
for (key, _) in self.data.iter_cells() {
|
||||
let stem = key.without(target_cat);
|
||||
@@ -353,9 +363,7 @@ impl Model {
|
||||
for nc in none_cats {
|
||||
stripped = stripped.without(nc);
|
||||
}
|
||||
if !stems.contains(&stripped) {
|
||||
stems.push(stripped);
|
||||
}
|
||||
stems.insert(stripped);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -470,7 +478,7 @@ impl Model {
|
||||
BinOp::Sub => Ok(lv - rv),
|
||||
BinOp::Mul => Ok(lv * rv),
|
||||
BinOp::Div => {
|
||||
if rv == 0.0 {
|
||||
if rv.abs() < FLOAT_EQ_EPSILON {
|
||||
Err("div/0".into())
|
||||
} else {
|
||||
Ok(lv / rv)
|
||||
@@ -552,8 +560,8 @@ impl Model {
|
||||
let lv = eval_expr_cached(l, context, model, target_category, none_cats)?;
|
||||
let rv = eval_expr_cached(r, context, model, target_category, none_cats)?;
|
||||
match op {
|
||||
BinOp::Eq => Ok((lv - rv).abs() < 1e-10),
|
||||
BinOp::Ne => Ok((lv - rv).abs() >= 1e-10),
|
||||
BinOp::Eq => Ok((lv - rv).abs() < FLOAT_EQ_EPSILON),
|
||||
BinOp::Ne => Ok((lv - rv).abs() >= FLOAT_EQ_EPSILON),
|
||||
BinOp::Lt => Ok(lv < rv),
|
||||
BinOp::Gt => Ok(lv > rv),
|
||||
BinOp::Le => Ok(lv <= rv),
|
||||
@@ -671,7 +679,7 @@ impl Model {
|
||||
BinOp::Sub => Ok(lv - rv),
|
||||
BinOp::Mul => Ok(lv * rv),
|
||||
BinOp::Div => {
|
||||
if rv == 0.0 {
|
||||
if rv.abs() < FLOAT_EQ_EPSILON {
|
||||
Err("div/0".into())
|
||||
} else {
|
||||
Ok(lv / rv)
|
||||
@@ -746,8 +754,8 @@ impl Model {
|
||||
let lv = eval_expr(l, context, model, target_category, depth)?;
|
||||
let rv = eval_expr(r, context, model, target_category, depth)?;
|
||||
match op {
|
||||
BinOp::Eq => Ok((lv - rv).abs() < 1e-10),
|
||||
BinOp::Ne => Ok((lv - rv).abs() >= 1e-10),
|
||||
BinOp::Eq => Ok((lv - rv).abs() < FLOAT_EQ_EPSILON),
|
||||
BinOp::Ne => Ok((lv - rv).abs() >= FLOAT_EQ_EPSILON),
|
||||
BinOp::Lt => Ok(lv < rv),
|
||||
BinOp::Gt => Ok(lv > rv),
|
||||
BinOp::Le => Ok(lv <= rv),
|
||||
@@ -1534,6 +1542,113 @@ mod formula_tests {
|
||||
Some(CellValue::Number(2.0))
|
||||
);
|
||||
}
|
||||
|
||||
/// Bug (improvise-0bf): `=`/`!=` compare with a 1e-10 epsilon, but the
|
||||
/// division-by-zero guard checked `rv == 0.0` exactly. With X = 1e-11
|
||||
/// the comparison `X = 0` is true, yet `100 / X` happily divided —
|
||||
/// incoherent semantics for `IF(X = 0, a, y / X)`. Any value the
|
||||
/// equality operator treats as zero must also trigger div/0.
|
||||
/// This exercises the direct eval path (`eval_expr` in
|
||||
/// `eval_formula_depth`).
|
||||
#[test]
|
||||
fn near_zero_divisor_consistent_with_equality_epsilon() {
|
||||
let mut m = Model::new("Test");
|
||||
m.add_category("_Measure").unwrap();
|
||||
if let Some(cat) = m.category_mut("_Measure") {
|
||||
cat.add_item("X");
|
||||
cat.add_item("IsZero");
|
||||
cat.add_item("Ratio");
|
||||
}
|
||||
m.set_cell(coord(&[("_Measure", "X")]), CellValue::Number(1e-11));
|
||||
m.add_formula(parse_formula("IsZero = IF(X = 0, 1, 0)", "_Measure").unwrap());
|
||||
m.add_formula(parse_formula("Ratio = 100 / X", "_Measure").unwrap());
|
||||
// Equality says X is zero…
|
||||
assert_eq!(
|
||||
m.evaluate(&coord(&[("_Measure", "IsZero")])),
|
||||
Some(CellValue::Number(1.0))
|
||||
);
|
||||
// …so dividing by X must be div/0, not 1e13.
|
||||
// Bug: returns Number(1e13) — guard uses exact == 0.0.
|
||||
assert_eq!(
|
||||
m.evaluate(&coord(&[("_Measure", "Ratio")])),
|
||||
Some(CellValue::Error("div/0".into()))
|
||||
);
|
||||
}
|
||||
|
||||
/// Same bug (improvise-0bf), fixed-point/cached eval path
|
||||
/// (`eval_expr_cached` in `eval_formula_with_cache`): its division
|
||||
/// guard also used exact `== 0.0` while `=` used the epsilon.
|
||||
#[test]
|
||||
fn near_zero_divisor_consistent_in_cached_eval_path() {
|
||||
let mut m = Model::new("Test");
|
||||
m.add_category("_Measure").unwrap();
|
||||
m.add_category("Region").unwrap();
|
||||
if let Some(cat) = m.category_mut("_Measure") {
|
||||
cat.add_item("X");
|
||||
cat.add_item("IsZero");
|
||||
cat.add_item("Ratio");
|
||||
}
|
||||
if let Some(cat) = m.category_mut("Region") {
|
||||
cat.add_item("East");
|
||||
}
|
||||
m.set_cell(
|
||||
coord(&[("_Measure", "X"), ("Region", "East")]),
|
||||
CellValue::Number(1e-11),
|
||||
);
|
||||
m.add_formula(parse_formula("IsZero = IF(X = 0, 1, 0)", "_Measure").unwrap());
|
||||
m.add_formula(parse_formula("Ratio = 100 / X", "_Measure").unwrap());
|
||||
let none_cats = vec!["Region".to_string()];
|
||||
m.recompute_formulas(&none_cats);
|
||||
// Equality says X is zero…
|
||||
assert_eq!(
|
||||
m.evaluate_aggregated(&coord(&[("_Measure", "IsZero")]), &none_cats),
|
||||
Some(CellValue::Number(1.0))
|
||||
);
|
||||
// …so dividing by X must be div/0 in the cached path too.
|
||||
assert_eq!(
|
||||
m.evaluate_aggregated(&coord(&[("_Measure", "Ratio")]), &none_cats),
|
||||
Some(CellValue::Error("div/0".into()))
|
||||
);
|
||||
}
|
||||
|
||||
/// Regression guard for improvise-6os: stem collection in
|
||||
/// `recompute_formulas` deduplicated with `Vec::contains` (O(n²) over
|
||||
/// all data cells per formula category). Replaced with an `IndexSet` —
|
||||
/// a pure perf change. This pins the observable contract: recomputed
|
||||
/// values for a multi-stem model (chained formulas, duplicate stems
|
||||
/// from multiple measures per stem) are unchanged and stable across
|
||||
/// repeated recomputes.
|
||||
#[test]
|
||||
fn recompute_formulas_multi_stem_values_stable() {
|
||||
let mut m = revenue_cost_model();
|
||||
m.add_formula(parse_formula("Profit = Revenue - Cost", "_Measure").unwrap());
|
||||
m.add_formula(parse_formula("Margin = Profit / Revenue", "_Measure").unwrap());
|
||||
m.recompute_formulas(&[]);
|
||||
|
||||
let cases = [
|
||||
(("Profit", "East"), 400.0),
|
||||
(("Profit", "West"), 300.0),
|
||||
(("Margin", "East"), 0.4),
|
||||
(("Margin", "West"), 0.375),
|
||||
];
|
||||
for ((target, region), expected) in cases {
|
||||
let key = coord(&[("_Measure", target), ("Region", region)]);
|
||||
let val = m
|
||||
.formula_cache
|
||||
.get(&key)
|
||||
.and_then(|v| v.as_f64())
|
||||
.unwrap_or_else(|| panic!("missing cache entry for {target}/{region}"));
|
||||
assert!(
|
||||
approx_eq(val, expected),
|
||||
"{target}/{region}: expected {expected}, got {val}"
|
||||
);
|
||||
}
|
||||
|
||||
// Recomputing again reproduces the exact same cache (determinism).
|
||||
let snapshot = m.formula_cache.clone();
|
||||
m.recompute_formulas(&[]);
|
||||
assert_eq!(m.formula_cache, snapshot);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
Reference in New Issue
Block a user