mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-19 09:16:45 +00:00
blop
This commit is contained in:
parent
d472110333
commit
ddcb2bc226
7 changed files with 48 additions and 9 deletions
|
|
@ -18,7 +18,7 @@ use crate::postings::{Postings, TermInfo};
|
|||
use crate::query::score_combiner::DoNothingCombiner;
|
||||
use crate::query::term_query::TermScorer;
|
||||
use crate::query::{box_scorer, Bm25Weight, BufferedUnionScorer, Scorer, SumCombiner};
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::schema::{Field, IndexRecordOption};
|
||||
use crate::{DocId, InvertedIndexReader, Score};
|
||||
|
||||
/// A codec describes how data is laid out on disk.
|
||||
|
|
@ -44,6 +44,18 @@ pub trait Codec: Clone + std::fmt::Debug + Send + Sync + 'static {
|
|||
|
||||
/// Returns the positions codec.
|
||||
fn positions_codec(&self) -> &Self::PositionsCodec;
|
||||
|
||||
/// Encodes per-document position deltas before they are written to the positions
|
||||
/// file. The default leaves positions unchanged.
|
||||
///
/// The encoded values are appended to `output`; nothing already in `output` is
/// removed by this method. The default implementation ignores `_field` and
/// `_doc_id` and copies `position_deltas` into `output` verbatim.
///
/// NOTE(review): the field serializer shown in this change clears its buffer
/// before the call and afterwards asserts that the buffer length equals the
/// term frequency (which it also asserts equals `position_deltas.len()`), so an
/// overriding implementation apparently must emit exactly one value per input
/// delta — confirm against the serializer before relying on this.
fn encode_position_deltas(
|
||||
&self,
|
||||
_field: Field,
|
||||
_doc_id: DocId,
|
||||
position_deltas: &[u32],
|
||||
output: &mut Vec<u32>,
|
||||
) {
|
||||
// Identity encoding: append the deltas unchanged.
output.extend_from_slice(position_deltas);
|
||||
}
|
||||
}
|
||||
|
||||
/// Object-safe codec is a Codec that can be used in a trait object.
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ use crate::codec::positions::PositionsReader;
|
|||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::postings::Postings;
|
||||
use crate::query::{Bm25Weight, Scorer};
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::schema::{Field, IndexRecordOption};
|
||||
use crate::{DocId, Score};
|
||||
|
||||
/// Postings codec.
|
||||
|
|
@ -41,6 +41,7 @@ pub trait PostingsCodec: Send + Sync + 'static {
|
|||
/// It is already opened by the caller via the codec's `PositionsCodec`.
|
||||
fn load_postings(
|
||||
&self,
|
||||
field: Field,
|
||||
doc_freq: u32,
|
||||
postings_data: OwnedBytes,
|
||||
record_option: IndexRecordOption,
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ pub use crate::codec::standard::postings::segment_postings::SegmentPostings;
|
|||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::query::term_query::TermScorer;
|
||||
use crate::query::{BufferedUnionScorer, Scorer, SumCombiner};
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::schema::{Field, IndexRecordOption};
|
||||
use crate::{DocSet as _, Score, TERMINATED};
|
||||
|
||||
mod block;
|
||||
|
|
@ -46,6 +46,7 @@ impl PostingsCodec for StandardPostingsCodec {
|
|||
|
||||
fn load_postings(
|
||||
&self,
|
||||
_field: Field,
|
||||
doc_freq: u32,
|
||||
postings_data: common::OwnedBytes,
|
||||
record_option: IndexRecordOption,
|
||||
|
|
@ -126,6 +127,7 @@ mod tests {
|
|||
.unwrap();
|
||||
StandardPostingsCodec
|
||||
.load_postings(
|
||||
Field::from_field_id(0),
|
||||
num_docs,
|
||||
OwnedBytes::new(buffer),
|
||||
IndexRecordOption::WithFreqs,
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ use crate::fieldnorm::FieldNormReader;
|
|||
use crate::postings::{Postings, TermInfo};
|
||||
use crate::query::term_query::TermScorer;
|
||||
use crate::query::{Bm25Weight, PhraseScorer, Scorer};
|
||||
use crate::schema::{IndexRecordOption, Term, Type};
|
||||
use crate::schema::{Field, IndexRecordOption, Term, Type};
|
||||
use crate::termdict::TermDictionary;
|
||||
|
||||
/// The inverted index reader is in charge of accessing
|
||||
|
|
@ -34,6 +34,7 @@ use crate::termdict::TermDictionary;
|
|||
/// `InvertedIndexReader` are created by calling
|
||||
/// [`SegmentReader::inverted_index()`](crate::SegmentReader::inverted_index).
|
||||
pub struct InvertedIndexReader {
|
||||
field: Field,
|
||||
termdict: TermDictionary,
|
||||
postings_file_slice: FileSlice,
|
||||
positions_file_slice: FileSlice,
|
||||
|
|
@ -71,6 +72,7 @@ impl InvertedIndexFieldSpace {
|
|||
|
||||
impl InvertedIndexReader {
|
||||
pub(crate) fn new(
|
||||
field: Field,
|
||||
termdict: TermDictionary,
|
||||
postings_file_slice: FileSlice,
|
||||
positions_file_slice: FileSlice,
|
||||
|
|
@ -80,6 +82,7 @@ impl InvertedIndexReader {
|
|||
let (total_num_tokens_slice, postings_body) = postings_file_slice.split(8);
|
||||
let total_num_tokens = u64::deserialize(&mut total_num_tokens_slice.read_bytes()?)?;
|
||||
Ok(InvertedIndexReader {
|
||||
field,
|
||||
termdict,
|
||||
postings_file_slice: postings_body,
|
||||
positions_file_slice,
|
||||
|
|
@ -91,8 +94,9 @@ impl InvertedIndexReader {
|
|||
|
||||
/// Creates an empty `InvertedIndexReader` object, which
|
||||
/// contains no terms at all.
|
||||
pub fn empty(record_option: IndexRecordOption) -> InvertedIndexReader {
|
||||
pub fn empty(field: Field, record_option: IndexRecordOption) -> InvertedIndexReader {
|
||||
InvertedIndexReader {
|
||||
field,
|
||||
termdict: TermDictionary::empty(),
|
||||
postings_file_slice: FileSlice::empty(),
|
||||
positions_file_slice: FileSlice::empty(),
|
||||
|
|
@ -256,6 +260,7 @@ impl InvertedIndexReader {
|
|||
};
|
||||
let postings: <<C as Codec>::PostingsCodec as PostingsCodec>::Postings =
|
||||
codec.postings_codec().load_postings(
|
||||
self.field,
|
||||
term_info.doc_freq,
|
||||
postings_data,
|
||||
self.record_option,
|
||||
|
|
|
|||
|
|
@ -252,7 +252,7 @@ impl SegmentReader {
|
|||
//
|
||||
// Returns an empty inverted index.
|
||||
let record_option = record_option_opt.unwrap_or(IndexRecordOption::Basic);
|
||||
return Ok(Arc::new(InvertedIndexReader::empty(record_option)));
|
||||
return Ok(Arc::new(InvertedIndexReader::empty(field, record_option)));
|
||||
}
|
||||
|
||||
let record_option = record_option_opt.unwrap();
|
||||
|
|
@ -277,6 +277,7 @@ impl SegmentReader {
|
|||
})?;
|
||||
|
||||
let inv_idx_reader = Arc::new(InvertedIndexReader::new(
|
||||
field,
|
||||
TermDictionary::open(termdict_file)?,
|
||||
postings_file,
|
||||
positions_file,
|
||||
|
|
|
|||
|
|
@ -119,7 +119,7 @@ mod tests {
|
|||
};
|
||||
use crate::codec::Codec;
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::schema::{IndexRecordOption, Schema, Type, STRING};
|
||||
use crate::schema::{Field, IndexRecordOption, Schema, Type, STRING};
|
||||
use crate::{DocId, Score, Term};
|
||||
|
||||
// The codec is round-tripped through `from_json_props` when the index is
|
||||
|
|
@ -186,6 +186,7 @@ mod tests {
|
|||
|
||||
fn load_postings(
|
||||
&self,
|
||||
field: Field,
|
||||
doc_freq: u32,
|
||||
postings_data: common::OwnedBytes,
|
||||
record_option: IndexRecordOption,
|
||||
|
|
@ -193,6 +194,7 @@ mod tests {
|
|||
position_reader: Option<Box<dyn PositionsReader>>,
|
||||
) -> io::Result<Self::Postings> {
|
||||
StandardPostingsCodec.load_postings(
|
||||
field,
|
||||
doc_freq,
|
||||
postings_data,
|
||||
record_option,
|
||||
|
|
|
|||
|
|
@ -84,6 +84,7 @@ impl<C: Codec> InvertedIndexSerializer<C> {
|
|||
let postings_write = self.postings_write.for_field(field);
|
||||
let positions_write = self.positions_write.for_field(field);
|
||||
FieldSerializer::create(
|
||||
field,
|
||||
field_entry.field_type(),
|
||||
total_num_tokens,
|
||||
term_dictionary_write,
|
||||
|
|
@ -106,10 +107,13 @@ impl<C: Codec> InvertedIndexSerializer<C> {
|
|||
/// The field serializer is in charge of
|
||||
/// the serialization of a specific field.
|
||||
pub struct FieldSerializer<'a, C: Codec> {
|
||||
field: Field,
|
||||
term_dictionary_builder: TermDictionaryBuilder<&'a mut CountingWriter<WritePtr>>,
|
||||
postings_serializer: <C::PostingsCodec as PostingsCodec>::PostingsSerializer,
|
||||
positions_serializer_opt:
|
||||
Option<<C::PositionsCodec as PositionsCodec>::Serializer<&'a mut CountingWriter<WritePtr>>>,
|
||||
codec: &'a C,
|
||||
position_delta_buffer: Vec<u32>,
|
||||
current_term_info: TermInfo,
|
||||
term_open: bool,
|
||||
postings_write: &'a mut CountingWriter<WritePtr>,
|
||||
|
|
@ -118,13 +122,14 @@ pub struct FieldSerializer<'a, C: Codec> {
|
|||
|
||||
impl<'a, C: Codec> FieldSerializer<'a, C> {
|
||||
fn create(
|
||||
field: Field,
|
||||
field_type: &FieldType,
|
||||
total_num_tokens: u64,
|
||||
term_dictionary_write: &'a mut CountingWriter<WritePtr>,
|
||||
postings_write: &'a mut CountingWriter<WritePtr>,
|
||||
positions_write: &'a mut CountingWriter<WritePtr>,
|
||||
fieldnorm_reader: Option<FieldNormReader>,
|
||||
codec: &C,
|
||||
codec: &'a C,
|
||||
) -> io::Result<FieldSerializer<'a, C>> {
|
||||
let index_record_option = field_type
|
||||
.index_record_option()
|
||||
|
|
@ -148,9 +153,12 @@ impl<'a, C: Codec> FieldSerializer<'a, C> {
|
|||
|
||||
let postings_start_offset = postings_write.written_bytes();
|
||||
Ok(FieldSerializer {
|
||||
field,
|
||||
term_dictionary_builder,
|
||||
postings_serializer,
|
||||
positions_serializer_opt,
|
||||
codec,
|
||||
position_delta_buffer: Vec::new(),
|
||||
current_term_info: TermInfo::default(),
|
||||
term_open: false,
|
||||
postings_write,
|
||||
|
|
@ -225,7 +233,15 @@ impl<'a, C: Codec> FieldSerializer<'a, C> {
|
|||
self.postings_serializer.write_doc(doc_id, term_freq);
|
||||
if let Some(ref mut positions_serializer) = self.positions_serializer_opt.as_mut() {
|
||||
assert_eq!(term_freq as usize, position_deltas.len());
|
||||
positions_serializer.write_positions_delta(position_deltas);
|
||||
self.position_delta_buffer.clear();
|
||||
self.codec.encode_position_deltas(
|
||||
self.field,
|
||||
doc_id,
|
||||
position_deltas,
|
||||
&mut self.position_delta_buffer,
|
||||
);
|
||||
assert_eq!(term_freq as usize, self.position_delta_buffer.len());
|
||||
positions_serializer.write_positions_delta(&self.position_delta_buffer);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue