This commit is contained in:
Ibiyemi Abiodun 2026-06-19 16:10:45 +08:00 committed by GitHub
commit 4eeddca37f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 174 additions and 57 deletions

View file

@ -59,8 +59,11 @@ fn main() -> tantivy::Result<()> {
let snippet = snippet_generator.snippet_from_doc(&doc);
println!("Document score {score}:");
println!("title: {}", doc.get_first(title).unwrap().as_str().unwrap());
println!("snippet: {}", snippet.to_html());
println!("custom highlighting: {}", highlight(snippet));
if let Some(snippet) = snippet {
println!("snippet: {}", snippet.to_html());
println!("custom highlighting: {}", highlight(snippet));
}
}
Ok(())

View file

@ -42,7 +42,7 @@
//! # let searcher = reader.searcher();
//! let mut snippet_generator = SnippetGenerator::create(&searcher, &*query, text_field)?;
//! snippet_generator.set_max_num_chars(100);
//! let snippet = snippet_generator.snippet_from_doc(&doc);
//! let snippet = snippet_generator.snippet_from_doc(&doc).unwrap();
//! let snippet_html: String = snippet.to_html();
//! assert_eq!(snippet_html, "Comme je descendais des Fleuves impassibles,\n Je ne me sentis plus guidé par les <b>haleurs</b> :\n Des");
//! # Ok(())
@ -115,6 +115,7 @@ impl FragmentCandidate {
#[derive(Debug)]
pub struct Snippet {
fragment: String,
fragment_range: Range<usize>,
highlighted: Vec<Range<usize>>,
snippet_prefix: String,
snippet_postfix: String,
@ -122,30 +123,16 @@ pub struct Snippet {
impl Snippet {
/// Create a new `Snippet`.
fn new(fragment: &str, highlighted: Vec<Range<usize>>) -> Self {
fn new(source_str: &str, source_range: Range<usize>, highlighted: Vec<Range<usize>>) -> Self {
Self {
fragment: fragment.to_string(),
fragment: source_str[source_range.clone()].to_string(),
fragment_range: source_range,
highlighted,
snippet_prefix: DEFAULT_SNIPPET_PREFIX.to_string(),
snippet_postfix: DEFAULT_SNIPPET_POSTFIX.to_string(),
}
}
/// Create a new, empty, `Snippet`.
pub fn empty() -> Snippet {
Snippet {
fragment: String::new(),
highlighted: Vec::new(),
snippet_prefix: String::new(),
snippet_postfix: String::new(),
}
}
/// Returns `true` if the snippet is empty.
pub fn is_empty(&self) -> bool {
self.highlighted.len() == 0
}
/// Returns a highlighted html from the `Snippet`.
pub fn to_html(&self) -> String {
let mut html = String::new();
@ -169,6 +156,12 @@ impl Snippet {
&self.fragment
}
/// Returns the byte range of the original text that this snippet's fragment
/// was cut from.
///
/// The ranges returned by `highlighted()` are relative to the fragment; add
/// `range().start` to them to obtain offsets into the original text.
pub fn range(&self) -> Range<usize> {
    self.fragment_range.start..self.fragment_range.end
}
/// Returns a list of highlighted positions from the `Snippet`.
pub fn highlighted(&self) -> &[Range<usize>] {
&self.highlighted
@ -231,7 +224,10 @@ fn search_fragments(
///
/// Takes a vector of `FragmentCandidate`s and the text.
/// Figures out the best fragment from it and creates a snippet.
fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str) -> Snippet {
fn select_best_fragment_combination(
fragments: &[FragmentCandidate],
text: &str,
) -> Option<Snippet> {
let best_fragment_opt = fragments.iter().max_by(|left, right| {
let cmp_score = left
.score
@ -243,18 +239,21 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
cmp_score
}
});
if let Some(fragment) = best_fragment_opt {
let fragment_text = &text[fragment.start_offset..fragment.stop_offset];
let highlighted = fragment
.highlighted
.iter()
.map(|item| item.start - fragment.start_offset..item.end - fragment.start_offset)
.collect();
Snippet::new(fragment_text, highlighted)
} else {
// When there are no fragments to chose from,
// for now create an empty snippet.
Snippet::empty()
match best_fragment_opt {
Some(fragment) => {
let highlighted = fragment
.highlighted
.iter()
.map(|item| item.start - fragment.start_offset..item.end - fragment.start_offset)
.collect();
Some(Snippet::new(
text,
fragment.start_offset..fragment.stop_offset,
highlighted,
))
}
None => None,
}
}
@ -368,7 +367,7 @@ fn is_sorted(mut it: impl Iterator<Item = usize>) -> bool {
/// # let searcher = reader.searcher();
/// let mut snippet_generator = SnippetGenerator::create(&searcher, &*query, text_field)?;
/// snippet_generator.set_max_num_chars(100);
/// let snippet = snippet_generator.snippet_from_doc(&doc);
/// let snippet = snippet_generator.snippet_from_doc(&doc).unwrap();
/// let snippet_html: String = snippet.to_html();
/// assert_eq!(snippet_html, "Comme je descendais des Fleuves impassibles,\n Je ne me sentis plus guidé par les <b>haleurs</b> :\n Des");
/// # Ok(())
@ -441,11 +440,7 @@ impl SnippetGenerator {
&self.terms_text
}
/// Generates a snippet for the given `Document`.
///
/// This method extract the text associated with the `SnippetGenerator`'s field
/// and computes a snippet.
pub fn snippet_from_doc<D: Document>(&self, doc: &D) -> Snippet {
fn text_from_doc<D: Document>(&self, doc: &D) -> String {
let mut text = String::new();
for (field, value) in doc.iter_fields_and_values() {
let value = value as D::Value<'_>;
@ -459,19 +454,68 @@ impl SnippetGenerator {
}
}
self.snippet(text.trim())
text
}
/// Builds a single snippet for the given `Document`.
///
/// This method extracts the text associated with the `SnippetGenerator`'s
/// field, trims it, and returns whatever `snippet` computes for that text.
pub fn snippet_from_doc<D: Document>(&self, doc: &D) -> Option<Snippet> {
    let text = self.text_from_doc(doc);
    self.snippet(text.trim())
}
/// Builds all snippets for the given `Document`.
///
/// This method extracts the text associated with the `SnippetGenerator`'s
/// field, trims it, and returns whatever `snippets` computes for that text.
pub fn snippets_from_doc<D: Document>(&self, doc: &D) -> Vec<Snippet> {
    let text = self.text_from_doc(doc);
    self.snippets(text.trim())
}
/// Generates a snippet for the given text.
pub fn snippet(&self, text: &str) -> Snippet {
pub fn snippet(&self, text: &str) -> Option<Snippet> {
    // `search_fragments` takes the tokenizer mutably, so run it on a clone.
    let mut tokenizer = self.tokenizer.clone();
    let candidates = search_fragments(
        &mut tokenizer,
        text,
        &self.terms_text,
        self.max_num_chars,
    );
    // Yields `None` when there is no candidate to pick from.
    select_best_fragment_combination(&candidates[..], text)
}
/// Generates one snippet per matching fragment of the given text.
///
/// The text is split into fragment candidates by `search_fragments`. Every
/// candidate whose score is greater than zero becomes a `Snippet`, in the
/// order the candidates were produced; all other candidates are dropped, so
/// the result is empty when no candidate scores.
///
/// The highlighted ranges of each snippet are relative to the start of its
/// fragment, not to `text`.
pub fn snippets(&self, text: &str) -> Vec<Snippet> {
    let fragment_candidates = search_fragments(
        &mut self.tokenizer.clone(),
        text,
        &self.terms_text,
        self.max_num_chars,
    );

    fragment_candidates
        .iter()
        .filter(|fragment| fragment.score > 0.0)
        .map(|fragment| {
            let offset = fragment.start_offset;
            // Candidate highlights are offsets into `text`; rebase them onto
            // the fragment.
            let highlighted = fragment
                .highlighted
                .iter()
                .map(|item| item.start - offset..item.end - offset)
                .collect();
            Snippet::new(text, offset..fragment.stop_offset, highlighted)
        })
        .collect()
}
}
#[cfg(test)]
@ -520,7 +564,7 @@ Survey in 2016, 2017, and 2018."#;
assert_eq!(first.score, 1.9);
assert_eq!(first.stop_offset, 89);
}
let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT);
let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT).unwrap();
assert_eq!(
snippet.fragment,
"Rust is a systems programming language sponsored by\nMozilla which describes it as a \
@ -551,7 +595,7 @@ Survey in 2016, 2017, and 2018."#;
assert_eq!(first.score, 1.0);
assert_eq!(first.stop_offset, 17);
}
let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT);
let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT).unwrap();
assert_eq!(snippet.to_html(), "<b>Rust</b> is a systems")
}
{
@ -571,7 +615,7 @@ Survey in 2016, 2017, and 2018."#;
assert_eq!(first.score, 0.9);
assert_eq!(first.stop_offset, 17);
}
let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT);
let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT).unwrap();
assert_eq!(snippet.to_html(), "programming <b>language</b>")
}
}
@ -594,7 +638,7 @@ Survey in 2016, 2017, and 2018."#;
assert_eq!(first.stop_offset, 7);
}
let snippet = select_best_fragment_combination(&fragments[..], text);
let snippet = select_best_fragment_combination(&fragments[..], text).unwrap();
assert_eq!(snippet.fragment, "c d");
assert_eq!(snippet.to_html(), "<b>c</b> d");
}
@ -617,7 +661,7 @@ Survey in 2016, 2017, and 2018."#;
assert_eq!(first.start_offset, 8);
}
let snippet = select_best_fragment_combination(&fragments[..], text);
let snippet = select_best_fragment_combination(&fragments[..], text).unwrap();
assert_eq!(snippet.fragment, "e f");
assert_eq!(snippet.to_html(), "e <b>f</b>");
}
@ -641,7 +685,7 @@ Survey in 2016, 2017, and 2018."#;
assert_eq!(first.start_offset, 0);
}
let snippet = select_best_fragment_combination(&fragments[..], text);
let snippet = select_best_fragment_combination(&fragments[..], text).unwrap();
assert_eq!(snippet.fragment, "e f g");
assert_eq!(snippet.to_html(), "e <b>f</b> g");
}
@ -659,9 +703,7 @@ Survey in 2016, 2017, and 2018."#;
assert_eq!(fragments.len(), 0);
let snippet = select_best_fragment_combination(&fragments[..], text);
assert_eq!(snippet.fragment, "");
assert_eq!(snippet.to_html(), "");
assert!(snippet.is_empty());
assert!(snippet.is_none());
}
#[test]
@ -674,9 +716,7 @@ Survey in 2016, 2017, and 2018."#;
assert_eq!(fragments.len(), 0);
let snippet = select_best_fragment_combination(&fragments[..], text);
assert_eq!(snippet.fragment, "");
assert_eq!(snippet.to_html(), "");
assert!(snippet.is_empty());
assert!(snippet.is_none());
}
#[test]
@ -753,7 +793,7 @@ Survey in 2016, 2017, and 2018."#;
let mut snippet_generator =
SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
{
let snippet = snippet_generator.snippet(TEST_TEXT);
let snippet = snippet_generator.snippet(TEST_TEXT).unwrap();
assert_eq!(
snippet.to_html(),
"imperative-procedural paradigms. <b>Rust</b> is syntactically similar to \
@ -763,7 +803,7 @@ Survey in 2016, 2017, and 2018."#;
}
{
snippet_generator.set_max_num_chars(90);
let snippet = snippet_generator.snippet(TEST_TEXT);
let snippet = snippet_generator.snippet(TEST_TEXT).unwrap();
assert_eq!(
snippet.to_html(),
"<b>Rust</b> is syntactically similar to C++[according to whom?],\nbut its \
@ -796,7 +836,7 @@ Survey in 2016, 2017, and 2018."#;
assert_eq!(first.stop_offset, 3);
}
let snippet = select_best_fragment_combination(&fragments[..], text);
let snippet = select_best_fragment_combination(&fragments[..], text).unwrap();
assert_eq!(snippet.fragment, "abc");
assert_eq!(snippet.to_html(), "<b>abc</b>");
}
@ -810,7 +850,7 @@ Survey in 2016, 2017, and 2018."#;
&terms,
100,
);
let mut snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT);
let mut snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT).unwrap();
assert_eq!(
snippet.to_html(),
"<b>Rust</b> is a systems programming <b>language</b> sponsored by\nMozilla which \
@ -824,6 +864,80 @@ Survey in 2016, 2017, and 2018."#;
);
}
#[test]
fn test_snippet_absolute_offsets() {
    let text = "First sentence. The quick brown fox jumps over the lazy dog. Last sentence.";
    let terms = btreemap! {
        String::from("fox") => 1.0,
        String::from("dog") => 0.9
    };
    let fragments = search_fragments(
        &mut From::from(SimpleTokenizer::default()),
        text,
        &terms,
        100,
    );
    let snippet = select_best_fragment_combination(&fragments[..], text).unwrap();

    // The text is shorter than the 100-char limit, so the selected fragment
    // spans it; the expected range stops one byte before `text.len()`,
    // leaving out the final '.'.
    assert_eq!(snippet.fragment_range, 0..text.len() - 1);
    // The stored range must slice the original text back into the fragment.
    assert_eq!(&text[snippet.fragment_range.clone()], snippet.fragment);

    // Highlights are fragment-relative; shifting them by the fragment start
    // must land on the matched terms in the original text.
    let base = snippet.fragment_range.start;
    let absolute_highlights: Vec<&str> = snippet
        .highlighted
        .iter()
        .map(|highlight| &text[highlight.start + base..highlight.end + base])
        .collect();
    for expected in ["fox", "dog"] {
        assert!(absolute_highlights.contains(&expected));
    }
}
#[test]
fn test_snippet_absolute_offsets_with_truncation() {
    let text = "Intro text. The quick brown fox jumps over the lazy dog. The quick brown fox \
                jumps again. End text.";
    let terms = btreemap! {
        String::from("fox") => 1.0,
        String::from("quick") => 0.9
    };
    let fragments = search_fragments(
        &mut From::from(SimpleTokenizer::default()),
        text,
        &terms,
        30, // short max chars to force truncation
    );
    let snippet = select_best_fragment_combination(&fragments[..], text).unwrap();

    // The stored range must slice the original text back into the fragment.
    assert_eq!(&text[snippet.fragment_range.clone()], snippet.fragment);

    // Highlights are fragment-relative; shifting them by the fragment start
    // must land on the matched terms in the original text.
    let base = snippet.fragment_range.start;
    let absolute_highlights: Vec<&str> = snippet
        .highlighted
        .iter()
        .map(|highlight| &text[highlight.start + base..highlight.end + base])
        .collect();
    for expected in ["quick", "fox"] {
        assert!(absolute_highlights.contains(&expected));
    }
}
#[test]
fn test_collapse_overlapped_ranges() {
#![allow(clippy::single_range_in_vec_init)]