mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-19 09:16:45 +00:00
skip hard_bounds that can't exclude any value
When a histogram's hard_bounds are wider than the column's value range, the per-doc `bounds.contains` check can never fail. Collapse such bounds to the unbounded sentinel in `normalize_histogram_req`, so both the general histogram hot loop and the fused term×histogram path skip the check — the latter then derives per-term counts from the grid (the ~17% win) instead of falling back to per-doc counting just because `bounds != [MIN, MAX]`. Only the collect-time filter is affected: empty-bucket emission reads `req.hard_bounds` directly, and hard_bounds only ever clips that range, so a wider-than-data bound leaves results unchanged. Covered by new tests on the general and fused paths, including mid-interval (bucket-splitting) bounds. Also tighten the fused-path u32-overflow guard to bound on `num_vals()` (the per-value increment count) rather than `num_docs()`, and document why the fused collector's hot-loop fields are hoisted into locals (re-reading them from memory each iteration measured ~15% slower).
This commit is contained in:
parent
9f7aea4765
commit
ef13489d63
2 changed files with 136 additions and 10 deletions
|
|
@ -682,6 +682,22 @@ fn normalize_histogram_req(req_data: &mut HistogramAggReqData) -> crate::Result<
|
|||
max: f64::MAX,
|
||||
});
|
||||
req_data.offset = req_data.req.offset.unwrap_or(0.0);
|
||||
// Drop `hard_bounds` that can't exclude any value (the column's range already sits inside
|
||||
// them): the per-doc `bounds.contains` check is then a no-op, so collapsing to the unbounded
|
||||
// sentinel lets the histogram hot loop skip it and the fused term×histogram path derive
|
||||
// per-term counts from the grid. Only this collect-time filter is touched — empty-bucket
|
||||
// emission reads `req.hard_bounds` directly (see `get_req_min_max`), and `hard_bounds` only
|
||||
// ever clips that range, so a wider-than-data bound leaves the result unchanged.
|
||||
if req_data.req.hard_bounds.is_some() {
|
||||
let col_min = f64_from_fastfield_u64(req_data.accessor.min_value(), req_data.field_type);
|
||||
let col_max = f64_from_fastfield_u64(req_data.accessor.max_value(), req_data.field_type);
|
||||
if col_min >= req_data.bounds.min && col_max <= req_data.bounds.max {
|
||||
req_data.bounds = HistogramBounds {
|
||||
min: f64::MIN,
|
||||
max: f64::MAX,
|
||||
};
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
|
@ -1409,6 +1425,55 @@ mod tests {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn histogram_non_binding_hard_bounds_test_multi_segment() -> crate::Result<()> {
|
||||
histogram_non_binding_hard_bounds_test_with_opt(false)
|
||||
}
|
||||
#[test]
|
||||
fn histogram_non_binding_hard_bounds_test_single_segment() -> crate::Result<()> {
|
||||
histogram_non_binding_hard_bounds_test_with_opt(true)
|
||||
}
|
||||
/// `hard_bounds` wider than the data (here with mid-interval edges, to cover the "bound cuts a
|
||||
/// bucket" case) can't exclude any value, so the result must be identical to the same request
|
||||
/// without bounds. Guards the normalization that collapses such bounds to the unbounded
|
||||
/// sentinel so the hot loop / fused path can skip the per-doc bounds check.
|
||||
fn histogram_non_binding_hard_bounds_test_with_opt(merge_segments: bool) -> crate::Result<()> {
|
||||
let values = vec![10.0, 12.0, 14.0, 16.0, 10.0, 13.0, 10.0, 12.0];
|
||||
let index = get_test_index_from_values(merge_segments, &values)?;
|
||||
|
||||
// Mid-interval edges, but wider than the data range [10, 16] -> they exclude nothing.
|
||||
let with_bounds: Aggregations = serde_json::from_value(json!({
|
||||
"histogram": {
|
||||
"histogram": {
|
||||
"field": "score_f64",
|
||||
"interval": 1.0,
|
||||
"hard_bounds": { "min": 9.5, "max": 16.5 }
|
||||
}
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
let no_bounds: Aggregations = serde_json::from_value(json!({
|
||||
"histogram": {
|
||||
"histogram": { "field": "score_f64", "interval": 1.0 }
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let res_bounds = exec_request(with_bounds, &index)?;
|
||||
let res_plain = exec_request(no_bounds, &index)?;
|
||||
// Dropping a non-binding bound must not change anything.
|
||||
assert_eq!(res_bounds, res_plain);
|
||||
|
||||
// Sanity: buckets span the data range with gaps filled (min_doc_count defaults to 0).
|
||||
assert_eq!(res_bounds["histogram"]["buckets"][0]["key"], 10.0);
|
||||
assert_eq!(res_bounds["histogram"]["buckets"][0]["doc_count"], 3);
|
||||
assert_eq!(res_bounds["histogram"]["buckets"][6]["key"], 16.0);
|
||||
assert_eq!(res_bounds["histogram"]["buckets"][6]["doc_count"], 1);
|
||||
assert_eq!(res_bounds["histogram"]["buckets"][7], Value::Null);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn histogram_empty_result_behaviour_test_single_segment() -> crate::Result<()> {
|
||||
histogram_empty_result_behaviour_test_with_opt(true)
|
||||
|
|
|
|||
|
|
@ -22,7 +22,9 @@ use crate::aggregation::{f64_from_fastfield_u64, BucketId};
|
|||
/// Maximum number of cells (`num_terms × num_time_buckets`) in the fused flat 2D grid. Above this
|
||||
/// the grid would be too large/cache-unfriendly, so we fall back to the general buffered path.
|
||||
/// `1 << 14` cells = 128 KB of `u64` counters, comfortably L2-resident.
|
||||
const MAX_FUSED_GRID_BUCKETS: usize = 1 << 14;
|
||||
///
|
||||
/// Since we are only at the top-level, this won't be multiplied by any parent buckets.
|
||||
const MAX_FUSED_GRID_BUCKETS: usize = 16384;
|
||||
|
||||
/// Fused collector for `terms` (low cardinality) × a single `histogram`/`date_histogram` leaf with
|
||||
/// nothing nested below it, when the resulting `num_terms × num_time_buckets` grid is small (see
|
||||
|
|
@ -87,10 +89,11 @@ impl SegmentAggregationCollector for SegmentTermHistogramCollector {
|
|||
if self.derive_term_counts_from_histogram {
|
||||
// `collect` skipped the per-doc total; recover it as the grid row-sum (every doc landed
|
||||
// in a bucket, since there are no hard bounds).
|
||||
let nb = self.num_time_buckets;
|
||||
let counts = &self.counts;
|
||||
for (t, slot) in self.term_counts.iter_mut().enumerate() {
|
||||
*slot = counts[t * nb..(t + 1) * nb].iter().sum();
|
||||
for (term_id, term_count) in self.term_counts.iter_mut().enumerate() {
|
||||
*term_count = self.counts
|
||||
[term_id * self.num_time_buckets..(term_id + 1) * self.num_time_buckets]
|
||||
.iter()
|
||||
.sum();
|
||||
}
|
||||
}
|
||||
let mut bucket_id_provider = BucketIdProvider::default();
|
||||
|
|
@ -141,6 +144,11 @@ impl SegmentAggregationCollector for SegmentTermHistogramCollector {
|
|||
self.hist_block
|
||||
.fetch_block(docs, &self.hist_req_data.accessor);
|
||||
|
||||
// Hoist the loop-invariant fields into locals: the optimizer can't prove the
|
||||
// `self.counts`/`self.term_counts` writes don't alias these `self` fields, so it can't keep
|
||||
// them in registers and re-reads them from memory every iteration — ~15% slower on
|
||||
// `terms_status_with_date_histogram` when read straight from `self`.
|
||||
// Note: check which are actually relevant.
|
||||
let field_type = self.hist_req_data.field_type;
|
||||
let bounds = self.hist_req_data.bounds;
|
||||
let interval = self.hist_req_data.req.interval;
|
||||
|
|
@ -155,10 +163,10 @@ impl SegmentAggregationCollector for SegmentTermHistogramCollector {
|
|||
if self.derive_term_counts_from_histogram {
|
||||
// No hard bounds: every doc lands in a bucket, so we skip the per-doc `term_counts`
|
||||
// increment (derived from the grid at flush) and the always-true bounds check.
|
||||
for (term_id, hist_raw) in self.term_block.iter_vals().zip(self.hist_block.iter_vals())
|
||||
for (term_id, hist_val) in self.term_block.iter_vals().zip(self.hist_block.iter_vals())
|
||||
{
|
||||
let term_id = term_id as usize;
|
||||
let val = f64_from_fastfield_u64(hist_raw, field_type);
|
||||
let val = f64_from_fastfield_u64(hist_val, field_type);
|
||||
let bucket = (get_bucket_pos_f64(val, interval, offset) as i64 - base_pos) as usize;
|
||||
debug_assert!(
|
||||
bucket < num_time_buckets,
|
||||
|
|
@ -233,9 +241,10 @@ pub(super) fn maybe_build_collector(
|
|||
let fuseable = is_top_level
|
||||
&& terms_req_data.allowed_term_ids.is_none()
|
||||
&& terms_req_data.accessor.get_cardinality().is_full()
|
||||
// The flat counters are `u32`; a per-segment count can't exceed the doc count, so this
|
||||
// guarantees no overflow (essentially always true, as `DocId` is `u32`).
|
||||
&& terms_req_data.accessor.num_docs() < u32::MAX
|
||||
// The flat counters are `u32`, bumped once per value, so no count can exceed the column's
|
||||
// value count. (Essentially always true here: the column is full, so its value count
|
||||
// equals the doc count, and `DocId` is `u32`.)
|
||||
&& terms_req_data.accessor.values.num_vals() < u32::MAX
|
||||
&& node.children.len() == 1
|
||||
&& matches!(
|
||||
node.children[0].kind,
|
||||
|
|
@ -476,4 +485,56 @@ mod tests {
|
|||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Non-binding `hard_bounds` (wider than the data, with mid-interval edges) must still produce
|
||||
/// exact results via the derive-from-grid path: since no doc is out of bounds, normalization
|
||||
/// drops the bound, every doc lands in the dense range, and each term's total equals its
|
||||
/// histogram row-sum. This is the case that previously fell back to the per-doc counter only
|
||||
/// because `bounds != [MIN, MAX]`.
|
||||
#[test]
|
||||
fn fused_term_histogram_with_non_binding_hard_bounds() -> crate::Result<()> {
|
||||
// 300 docs: term = {a, b, c} by i % 3, value = i % 20. Data values span [0, 19].
|
||||
let docs: Vec<(f64, String)> = (0..300u64)
|
||||
.map(|i| {
|
||||
(
|
||||
(i % 20) as f64,
|
||||
["a", "b", "c"][(i % 3) as usize].to_string(),
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
let index = get_test_index_from_values_and_terms(true, &[docs])?;
|
||||
|
||||
// Bounds wider than [0, 19], with mid-interval edges -> they exclude nothing.
|
||||
let agg_req: Aggregations = serde_json::from_value(serde_json::json!({
|
||||
"by_term": {
|
||||
"terms": { "field": "string_id", "order": { "_key": "asc" } },
|
||||
"aggs": {
|
||||
"histo": {
|
||||
"histogram": {
|
||||
"field": "score_f64",
|
||||
"interval": 1.0,
|
||||
"hard_bounds": { "min": -0.5, "max": 19.5 }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let res = exec_request(agg_req, &index)?;
|
||||
|
||||
for (term_idx, term) in ["a", "b", "c"].iter().enumerate() {
|
||||
assert_eq!(res["by_term"]["buckets"][term_idx]["key"], *term);
|
||||
// Every doc is in-bounds, so the per-term total is the full 100 (as without bounds).
|
||||
assert_eq!(res["by_term"]["buckets"][term_idx]["doc_count"], 100);
|
||||
let histo = &res["by_term"]["buckets"][term_idx]["histo"]["buckets"];
|
||||
for b in 0..20usize {
|
||||
assert_eq!(histo[b]["key"], b as f64, "term {term} bucket {b}");
|
||||
assert_eq!(histo[b]["doc_count"], 5, "term {term} bucket {b}");
|
||||
}
|
||||
assert_eq!(histo[20], serde_json::Value::Null);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue