Add BlockSegmentPostings::rank() for skip-list-based positional counting

Add a public rank(target) method on BlockSegmentPostings that returns the
number of docs with a doc id strictly smaller than target. It jumps to the
candidate block through the skip list and decodes a single block, so the cost
is O(skip-list entries) + one block decode rather than O(doc_freq).

This is a useful primitive for range counting over a posting list (e.g. number
of matches in a [lo, hi) doc-id window) without iterating every matched doc.

To support it, expose SkipReader::remaining_docs() (pub(crate)). Like seek(),
rank() advances the cursor forward only and must be called with non-decreasing,
valid (<= TERMINATED) targets. Adds a unit test covering multi-block lists and
the below-first / above-last / empty edge cases.
This commit is contained in:
Windforce17 2026-06-13 02:42:04 +08:00 committed by PSeitz
commit 1d06328cb3
2 changed files with 67 additions and 0 deletions

View file

@ -287,6 +287,33 @@ impl BlockSegmentPostings {
doc
}
/// Returns the number of documents with a doc id strictly smaller than `target`
/// (i.e. the *rank* of `target` in this posting list).
///
/// This jumps to the block that may contain `target` through the skip list, so no
/// skipped block is decoded; a single block is then decoded to locate `target`
/// within it. The cost is therefore `O(number_of_skip_list_entries)` plus one block
/// decode, rather than `O(doc_freq)`.
///
/// Like [`Self::seek`], the underlying cursor only ever moves forward. This method
/// must be called with **non-decreasing** `target` values (galloping); calling it
/// with a `target` smaller than a previous one yields an incorrect result. `target`
/// must be a valid doc id (i.e. `target <= TERMINATED`), exactly as for `seek`.
///
/// Edge cases: returns `0` when `target` is smaller than every doc id, and
/// `doc_freq()` when `target` is larger than every doc id.
pub fn rank(&mut self, target: DocId) -> u32 {
if self.doc_freq == 0 {
return 0;
}
// `within` = number of docs in the landed block with a doc id < target.
let within = self.seek(target);
// `remaining_docs` counts the landed block and everything after it, so the
// difference is the number of docs in all blocks strictly before it.
let docs_before_block = self.doc_freq - self.skip_reader.remaining_docs();
docs_before_block + within as u32
}
pub(crate) fn position_offset(&self) -> u64 {
self.skip_reader.position_offset()
}
@ -568,4 +595,38 @@ mod tests {
assert_eq!(block_segments.docs(), &[1, 3, 5]);
Ok(())
}
#[test]
fn test_block_segment_postings_rank() -> crate::Result<()> {
// ~8 blocks worth of docs so the skip list is actually exercised.
let docs: Vec<DocId> = (0..1000u32).map(|i| i * 3).collect();
let mut block_postings = build_block_postings(&docs[..])?;
let doc_freq = block_postings.doc_freq();
// rank(target) must equal the number of docs strictly below target.
// Targets are queried in non-decreasing order, as the API requires.
// `target` values must be a valid doc id (<= TERMINATED) and non-decreasing.
let targets = [
0u32, 1, 2, 3, 4, 299, 300, 301, 1500, 2996, 2997, 3000, 10_000,
];
for &target in &targets {
let expected = docs.iter().filter(|&&d| d < target).count() as u32;
assert_eq!(
block_postings.rank(target),
expected,
"rank({target}) mismatch"
);
}
// Edge cases: below the first doc -> 0, above the last doc -> doc_freq.
let mut fresh = build_block_postings(&docs[..])?;
assert_eq!(fresh.rank(0), 0);
let mut fresh = build_block_postings(&docs[..])?;
assert_eq!(fresh.rank(1_000_000), doc_freq);
// Empty postings: rank is always 0.
let mut empty = BlockSegmentPostings::empty();
assert_eq!(empty.rank(42), 0);
Ok(())
}
}

View file

@ -187,6 +187,12 @@ impl SkipReader {
self.last_doc_in_block
}
/// Number of docs from the start of the current block to the end of the postings
/// (i.e. the current block plus every block after it).
pub(crate) fn remaining_docs(&self) -> u32 {
self.remaining_docs
}
pub fn position_offset(&self) -> u64 {
self.position_offset
}