This commit is contained in:
Paul Masurel 2026-06-12 15:25:58 +02:00
commit ddcb2bc226
No known key found for this signature in database
7 changed files with 48 additions and 9 deletions

View file

@ -18,7 +18,7 @@ use crate::postings::{Postings, TermInfo};
use crate::query::score_combiner::DoNothingCombiner;
use crate::query::term_query::TermScorer;
use crate::query::{box_scorer, Bm25Weight, BufferedUnionScorer, Scorer, SumCombiner};
use crate::schema::IndexRecordOption;
use crate::schema::{Field, IndexRecordOption};
use crate::{DocId, InvertedIndexReader, Score};
/// Codecs describe how data is laid out on disk.
@ -44,6 +44,18 @@ pub trait Codec: Clone + std::fmt::Debug + Send + Sync + 'static {
/// Returns the positions codec.
fn positions_codec(&self) -> &Self::PositionsCodec;
/// Hook that lets a codec transform the position deltas of a single document
/// before they are handed to the positions serializer.
///
/// `position_deltas` holds the deltas for one `(field, doc_id)` posting, and the
/// encoded values are appended to `output`.
///
/// The field serializer asserts that the number of values appended to `output`
/// equals the term frequency of the document, so an implementation must emit
/// exactly one value per input delta.
///
/// The default implementation is the identity: it ignores `_field` and `_doc_id`
/// and copies the deltas over unchanged.
fn encode_position_deltas(
    &self,
    _field: Field,
    _doc_id: DocId,
    position_deltas: &[u32],
    output: &mut Vec<u32>,
) {
    // Identity encoding: append every delta as-is.
    output.extend(position_deltas.iter().copied());
}
}
/// Object-safe codec is a Codec that can be used in a trait object.

View file

@ -8,7 +8,7 @@ use crate::codec::positions::PositionsReader;
use crate::fieldnorm::FieldNormReader;
use crate::postings::Postings;
use crate::query::{Bm25Weight, Scorer};
use crate::schema::IndexRecordOption;
use crate::schema::{Field, IndexRecordOption};
use crate::{DocId, Score};
/// Postings codec.
@ -41,6 +41,7 @@ pub trait PostingsCodec: Send + Sync + 'static {
/// It is already opened by the caller via the codec's `PositionsCodec`.
fn load_postings(
&self,
field: Field,
doc_freq: u32,
postings_data: OwnedBytes,
record_option: IndexRecordOption,

View file

@ -8,7 +8,7 @@ pub use crate::codec::standard::postings::segment_postings::SegmentPostings;
use crate::fieldnorm::FieldNormReader;
use crate::query::term_query::TermScorer;
use crate::query::{BufferedUnionScorer, Scorer, SumCombiner};
use crate::schema::IndexRecordOption;
use crate::schema::{Field, IndexRecordOption};
use crate::{DocSet as _, Score, TERMINATED};
mod block;
@ -46,6 +46,7 @@ impl PostingsCodec for StandardPostingsCodec {
fn load_postings(
&self,
_field: Field,
doc_freq: u32,
postings_data: common::OwnedBytes,
record_option: IndexRecordOption,
@ -126,6 +127,7 @@ mod tests {
.unwrap();
StandardPostingsCodec
.load_postings(
Field::from_field_id(0),
num_docs,
OwnedBytes::new(buffer),
IndexRecordOption::WithFreqs,

View file

@ -18,7 +18,7 @@ use crate::fieldnorm::FieldNormReader;
use crate::postings::{Postings, TermInfo};
use crate::query::term_query::TermScorer;
use crate::query::{Bm25Weight, PhraseScorer, Scorer};
use crate::schema::{IndexRecordOption, Term, Type};
use crate::schema::{Field, IndexRecordOption, Term, Type};
use crate::termdict::TermDictionary;
/// The inverted index reader is in charge of accessing
@ -34,6 +34,7 @@ use crate::termdict::TermDictionary;
/// `InvertedIndexReader` are created by calling
/// [`SegmentReader::inverted_index()`](crate::SegmentReader::inverted_index).
pub struct InvertedIndexReader {
field: Field,
termdict: TermDictionary,
postings_file_slice: FileSlice,
positions_file_slice: FileSlice,
@ -71,6 +72,7 @@ impl InvertedIndexFieldSpace {
impl InvertedIndexReader {
pub(crate) fn new(
field: Field,
termdict: TermDictionary,
postings_file_slice: FileSlice,
positions_file_slice: FileSlice,
@ -80,6 +82,7 @@ impl InvertedIndexReader {
let (total_num_tokens_slice, postings_body) = postings_file_slice.split(8);
let total_num_tokens = u64::deserialize(&mut total_num_tokens_slice.read_bytes()?)?;
Ok(InvertedIndexReader {
field,
termdict,
postings_file_slice: postings_body,
positions_file_slice,
@ -91,8 +94,9 @@ impl InvertedIndexReader {
/// Creates an empty `InvertedIndexReader` object, which
/// contains no terms at all.
pub fn empty(record_option: IndexRecordOption) -> InvertedIndexReader {
pub fn empty(field: Field, record_option: IndexRecordOption) -> InvertedIndexReader {
InvertedIndexReader {
field,
termdict: TermDictionary::empty(),
postings_file_slice: FileSlice::empty(),
positions_file_slice: FileSlice::empty(),
@ -256,6 +260,7 @@ impl InvertedIndexReader {
};
let postings: <<C as Codec>::PostingsCodec as PostingsCodec>::Postings =
codec.postings_codec().load_postings(
self.field,
term_info.doc_freq,
postings_data,
self.record_option,

View file

@ -252,7 +252,7 @@ impl SegmentReader {
//
// Returns an empty inverted index.
let record_option = record_option_opt.unwrap_or(IndexRecordOption::Basic);
return Ok(Arc::new(InvertedIndexReader::empty(record_option)));
return Ok(Arc::new(InvertedIndexReader::empty(field, record_option)));
}
let record_option = record_option_opt.unwrap();
@ -277,6 +277,7 @@ impl SegmentReader {
})?;
let inv_idx_reader = Arc::new(InvertedIndexReader::new(
field,
TermDictionary::open(termdict_file)?,
postings_file,
positions_file,

View file

@ -119,7 +119,7 @@ mod tests {
};
use crate::codec::Codec;
use crate::fieldnorm::FieldNormReader;
use crate::schema::{IndexRecordOption, Schema, Type, STRING};
use crate::schema::{Field, IndexRecordOption, Schema, Type, STRING};
use crate::{DocId, Score, Term};
// The codec is round-tripped through `from_json_props` when the index is
@ -186,6 +186,7 @@ mod tests {
fn load_postings(
&self,
field: Field,
doc_freq: u32,
postings_data: common::OwnedBytes,
record_option: IndexRecordOption,
@ -193,6 +194,7 @@ mod tests {
position_reader: Option<Box<dyn PositionsReader>>,
) -> io::Result<Self::Postings> {
StandardPostingsCodec.load_postings(
field,
doc_freq,
postings_data,
record_option,

View file

@ -84,6 +84,7 @@ impl<C: Codec> InvertedIndexSerializer<C> {
let postings_write = self.postings_write.for_field(field);
let positions_write = self.positions_write.for_field(field);
FieldSerializer::create(
field,
field_entry.field_type(),
total_num_tokens,
term_dictionary_write,
@ -106,10 +107,13 @@ impl<C: Codec> InvertedIndexSerializer<C> {
/// The field serializer is in charge of
/// the serialization of a specific field.
pub struct FieldSerializer<'a, C: Codec> {
field: Field,
term_dictionary_builder: TermDictionaryBuilder<&'a mut CountingWriter<WritePtr>>,
postings_serializer: <C::PostingsCodec as PostingsCodec>::PostingsSerializer,
positions_serializer_opt:
Option<<C::PositionsCodec as PositionsCodec>::Serializer<&'a mut CountingWriter<WritePtr>>>,
codec: &'a C,
position_delta_buffer: Vec<u32>,
current_term_info: TermInfo,
term_open: bool,
postings_write: &'a mut CountingWriter<WritePtr>,
@ -118,13 +122,14 @@ pub struct FieldSerializer<'a, C: Codec> {
impl<'a, C: Codec> FieldSerializer<'a, C> {
fn create(
field: Field,
field_type: &FieldType,
total_num_tokens: u64,
term_dictionary_write: &'a mut CountingWriter<WritePtr>,
postings_write: &'a mut CountingWriter<WritePtr>,
positions_write: &'a mut CountingWriter<WritePtr>,
fieldnorm_reader: Option<FieldNormReader>,
codec: &C,
codec: &'a C,
) -> io::Result<FieldSerializer<'a, C>> {
let index_record_option = field_type
.index_record_option()
@ -148,9 +153,12 @@ impl<'a, C: Codec> FieldSerializer<'a, C> {
let postings_start_offset = postings_write.written_bytes();
Ok(FieldSerializer {
field,
term_dictionary_builder,
postings_serializer,
positions_serializer_opt,
codec,
position_delta_buffer: Vec::new(),
current_term_info: TermInfo::default(),
term_open: false,
postings_write,
@ -225,7 +233,15 @@ impl<'a, C: Codec> FieldSerializer<'a, C> {
self.postings_serializer.write_doc(doc_id, term_freq);
if let Some(ref mut positions_serializer) = self.positions_serializer_opt.as_mut() {
assert_eq!(term_freq as usize, position_deltas.len());
positions_serializer.write_positions_delta(position_deltas);
self.position_delta_buffer.clear();
self.codec.encode_position_deltas(
self.field,
doc_id,
position_deltas,
&mut self.position_delta_buffer,
);
assert_eq!(term_freq as usize, self.position_delta_buffer.len());
positions_serializer.write_positions_delta(&self.position_delta_buffer);
}
}