skip hard_bounds that can't exclude any value

When a histogram's hard_bounds are wider than the column's value range, the per-doc `bounds.contains` check can never fail. Collapse such bounds to the unbounded sentinel in `normalize_histogram_req`, so both the general histogram hot loop and the fused term×histogram path skip the check — the latter then derives per-term counts from the grid (the ~17% win) instead of falling back to per-doc counting just because `bounds != [MIN, MAX]`. Only the collect-time filter is affected: empty-bucket emission reads `req.hard_bounds` directly, and hard_bounds only ever clips that range, so a wider-than-data bound leaves results unchanged. Covered by new tests on the general and fused paths, including mid-interval (bucket-splitting) bounds. Also tighten the fused-path u32-overflow guard to bound on `num_vals()` (the per-value increment count) rather than `num_docs()`, and document why the fused collector's hot-loop fields are hoisted into locals (re-reading them from memory each iteration measured ~15% slower).
2026-06-19 09:16:45 +00:00 · 2026-06-11 18:17:04 +02:00 · 2026-06-11 18:17:04 +02:00 · ef13489d63
commit ef13489d63
parent 9f7aea4765
2 changed files with 136 additions and 10 deletions
--- a/src/aggregation/bucket/histogram/histogram.rs
+++ b/src/aggregation/bucket/histogram/histogram.rs
@ -682,6 +682,22 @@ fn normalize_histogram_req(req_data: &mut HistogramAggReqData) -> crate::Result<
        max: f64::MAX,
    });
    req_data.offset = req_data.req.offset.unwrap_or(0.0);
+    // Drop `hard_bounds` that can't exclude any value (the column's range already sits inside
+    // them): the per-doc `bounds.contains` check is then a no-op, so collapsing to the unbounded
+    // sentinel lets the histogram hot loop skip it and the fused term×histogram path derive
+    // per-term counts from the grid. Only this collect-time filter is touched — empty-bucket
+    // emission reads `req.hard_bounds` directly (see `get_req_min_max`), and `hard_bounds` only
+    // ever clips that range, so a wider-than-data bound leaves the result unchanged.
+    if req_data.req.hard_bounds.is_some() {
+        let col_min = f64_from_fastfield_u64(req_data.accessor.min_value(), req_data.field_type);
+        let col_max = f64_from_fastfield_u64(req_data.accessor.max_value(), req_data.field_type);
+        if col_min >= req_data.bounds.min && col_max <= req_data.bounds.max {
+            req_data.bounds = HistogramBounds {
+                min: f64::MIN,
+                max: f64::MAX,
+            };
+        }
+    }
    Ok(())
 }

@ -1409,6 +1425,55 @@ mod tests {
        Ok(())
    }

+    #[test]
+    fn histogram_non_binding_hard_bounds_test_multi_segment() -> crate::Result<()> {
+        histogram_non_binding_hard_bounds_test_with_opt(false)
+    }
+    #[test]
+    fn histogram_non_binding_hard_bounds_test_single_segment() -> crate::Result<()> {
+        histogram_non_binding_hard_bounds_test_with_opt(true)
+    }
+    /// `hard_bounds` wider than the data (here with mid-interval edges, to cover the "bound cuts a
+    /// bucket" case) can't exclude any value, so the result must be identical to the same request
+    /// without bounds. Guards the normalization that collapses such bounds to the unbounded
+    /// sentinel so the hot loop / fused path can skip the per-doc bounds check.
+    fn histogram_non_binding_hard_bounds_test_with_opt(merge_segments: bool) -> crate::Result<()> {
+        let values = vec![10.0, 12.0, 14.0, 16.0, 10.0, 13.0, 10.0, 12.0];
+        let index = get_test_index_from_values(merge_segments, &values)?;
+
+        // Mid-interval edges, but wider than the data range [10, 16] -> they exclude nothing.
+        let with_bounds: Aggregations = serde_json::from_value(json!({
+            "histogram": {
+                "histogram": {
+                    "field": "score_f64",
+                    "interval": 1.0,
+                    "hard_bounds": { "min": 9.5, "max": 16.5 }
+                }
+            }
+        }))
+        .unwrap();
+        let no_bounds: Aggregations = serde_json::from_value(json!({
+            "histogram": {
+                "histogram": { "field": "score_f64", "interval": 1.0 }
+            }
+        }))
+        .unwrap();
+
+        let res_bounds = exec_request(with_bounds, &index)?;
+        let res_plain = exec_request(no_bounds, &index)?;
+        // Dropping a non-binding bound must not change anything.
+        assert_eq!(res_bounds, res_plain);
+
+        // Sanity: buckets span the data range with gaps filled (min_doc_count defaults to 0).
+        assert_eq!(res_bounds["histogram"]["buckets"][0]["key"], 10.0);
+        assert_eq!(res_bounds["histogram"]["buckets"][0]["doc_count"], 3);
+        assert_eq!(res_bounds["histogram"]["buckets"][6]["key"], 16.0);
+        assert_eq!(res_bounds["histogram"]["buckets"][6]["doc_count"], 1);
+        assert_eq!(res_bounds["histogram"]["buckets"][7], Value::Null);
+
+        Ok(())
+    }
+
    #[test]
    fn histogram_empty_result_behaviour_test_single_segment() -> crate::Result<()> {
        histogram_empty_result_behaviour_test_with_opt(true)
--- a/src/aggregation/bucket/term_agg/term_histogram.rs
+++ b/src/aggregation/bucket/term_agg/term_histogram.rs
@ -22,7 +22,9 @@ use crate::aggregation::{f64_from_fastfield_u64, BucketId};
 /// Maximum number of cells (`num_terms × num_time_buckets`) in the fused flat 2D grid. Above this
 /// the grid would be too large/cache-unfriendly, so we fall back to the general buffered path.
 /// `1 << 14` cells = 128 KB of `u64` counters, comfortably L2-resident.
-const MAX_FUSED_GRID_BUCKETS: usize = 1 << 14;
+///
+/// Since we are only at the top-level, this won't be multiplied by any parent buckets.
+const MAX_FUSED_GRID_BUCKETS: usize = 16384;

 /// Fused collector for `terms` (low cardinality) × a single `histogram`/`date_histogram` leaf with
 /// nothing nested below it, when the resulting `num_terms × num_time_buckets` grid is small (see
@ -87,10 +89,11 @@ impl SegmentAggregationCollector for SegmentTermHistogramCollector {
        if self.derive_term_counts_from_histogram {
            // `collect` skipped the per-doc total; recover it as the grid row-sum (every doc landed
            // in a bucket, since there are no hard bounds).
-            let nb = self.num_time_buckets;
-            let counts = &self.counts;
-            for (t, slot) in self.term_counts.iter_mut().enumerate() {
-                *slot = counts[t * nb..(t + 1) * nb].iter().sum();
+            for (term_id, term_count) in self.term_counts.iter_mut().enumerate() {
+                *term_count = self.counts
+                    [term_id * self.num_time_buckets..(term_id + 1) * self.num_time_buckets]
+                    .iter()
+                    .sum();
            }
        }
        let mut bucket_id_provider = BucketIdProvider::default();
@ -141,6 +144,11 @@ impl SegmentAggregationCollector for SegmentTermHistogramCollector {
        self.hist_block
            .fetch_block(docs, &self.hist_req_data.accessor);

+        // Hoist the loop-invariant fields into locals: the optimizer can't prove the
+        // `self.counts`/`self.term_counts` writes don't alias these `self` fields, so it can't keep
+        // them in registers and re-reads them from memory every iteration — ~15% slower on
+        // `terms_status_with_date_histogram` when read straight from `self`.
+        // Note: check which are actually relevant.
        let field_type = self.hist_req_data.field_type;
        let bounds = self.hist_req_data.bounds;
        let interval = self.hist_req_data.req.interval;
@ -155,10 +163,10 @@ impl SegmentAggregationCollector for SegmentTermHistogramCollector {
        if self.derive_term_counts_from_histogram {
            // No hard bounds: every doc lands in a bucket, so we skip the per-doc `term_counts`
            // increment (derived from the grid at flush) and the always-true bounds check.
-            for (term_id, hist_raw) in self.term_block.iter_vals().zip(self.hist_block.iter_vals())
+            for (term_id, hist_val) in self.term_block.iter_vals().zip(self.hist_block.iter_vals())
            {
                let term_id = term_id as usize;
-                let val = f64_from_fastfield_u64(hist_raw, field_type);
+                let val = f64_from_fastfield_u64(hist_val, field_type);
                let bucket = (get_bucket_pos_f64(val, interval, offset) as i64 - base_pos) as usize;
                debug_assert!(
                    bucket < num_time_buckets,
@ -233,9 +241,10 @@ pub(super) fn maybe_build_collector(
    let fuseable = is_top_level
        && terms_req_data.allowed_term_ids.is_none()
        && terms_req_data.accessor.get_cardinality().is_full()
-        // The flat counters are `u32`; a per-segment count can't exceed the doc count, so this
-        // guarantees no overflow (essentially always true, as `DocId` is `u32`).
-        && terms_req_data.accessor.num_docs() < u32::MAX
+        // The flat counters are `u32`, bumped once per value, so no count can exceed the column's
+        // value count. (Essentially always true here: the column is full, so its value count
+        // equals the doc count, and `DocId` is `u32`.)
+        && terms_req_data.accessor.values.num_vals() < u32::MAX
        && node.children.len() == 1
        && matches!(
            node.children[0].kind,
@ -476,4 +485,56 @@ mod tests {

        Ok(())
    }
+
+    /// Non-binding `hard_bounds` (wider than the data, with mid-interval edges) must still produce
+    /// exact results via the derive-from-grid path: since no doc is out of bounds, normalization
+    /// drops the bound, every doc lands in the dense range, and each term's total equals its
+    /// histogram row-sum. This is the case that previously fell back to the per-doc counter only
+    /// because `bounds != [MIN, MAX]`.
+    #[test]
+    fn fused_term_histogram_with_non_binding_hard_bounds() -> crate::Result<()> {
+        // 300 docs: term = {a, b, c} by i % 3, value = i % 20. Data values span [0, 19].
+        let docs: Vec<(f64, String)> = (0..300u64)
+            .map(|i| {
+                (
+                    (i % 20) as f64,
+                    ["a", "b", "c"][(i % 3) as usize].to_string(),
+                )
+            })
+            .collect();
+        let index = get_test_index_from_values_and_terms(true, &[docs])?;
+
+        // Bounds wider than [0, 19], with mid-interval edges -> they exclude nothing.
+        let agg_req: Aggregations = serde_json::from_value(serde_json::json!({
+            "by_term": {
+                "terms": { "field": "string_id", "order": { "_key": "asc" } },
+                "aggs": {
+                    "histo": {
+                        "histogram": {
+                            "field": "score_f64",
+                            "interval": 1.0,
+                            "hard_bounds": { "min": -0.5, "max": 19.5 }
+                        }
+                    }
+                }
+            }
+        }))
+        .unwrap();
+
+        let res = exec_request(agg_req, &index)?;
+
+        for (term_idx, term) in ["a", "b", "c"].iter().enumerate() {
+            assert_eq!(res["by_term"]["buckets"][term_idx]["key"], *term);
+            // Every doc is in-bounds, so the per-term total is the full 100 (as without bounds).
+            assert_eq!(res["by_term"]["buckets"][term_idx]["doc_count"], 100);
+            let histo = &res["by_term"]["buckets"][term_idx]["histo"]["buckets"];
+            for b in 0..20usize {
+                assert_eq!(histo[b]["key"], b as f64, "term {term} bucket {b}");
+                assert_eq!(histo[b]["doc_count"], 5, "term {term} bucket {b}");
+            }
+            assert_eq!(histo[20], serde_json::Value::Null);
+        }
+
+        Ok(())
+    }
 }