mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-19 09:16:45 +00:00
Merge a2f489799f into 1e859fd78d
This commit is contained in:
commit
4eeddca37f
2 changed files with 174 additions and 57 deletions
|
|
@ -59,8 +59,11 @@ fn main() -> tantivy::Result<()> {
|
|||
let snippet = snippet_generator.snippet_from_doc(&doc);
|
||||
println!("Document score {score}:");
|
||||
println!("title: {}", doc.get_first(title).unwrap().as_str().unwrap());
|
||||
println!("snippet: {}", snippet.to_html());
|
||||
println!("custom highlighting: {}", highlight(snippet));
|
||||
|
||||
if let Some(snippet) = snippet {
|
||||
println!("snippet: {}", snippet.to_html());
|
||||
println!("custom highlighting: {}", highlight(snippet));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
|
|
|||
|
|
@ -42,7 +42,7 @@
|
|||
//! # let searcher = reader.searcher();
|
||||
//! let mut snippet_generator = SnippetGenerator::create(&searcher, &*query, text_field)?;
|
||||
//! snippet_generator.set_max_num_chars(100);
|
||||
//! let snippet = snippet_generator.snippet_from_doc(&doc);
|
||||
//! let snippet = snippet_generator.snippet_from_doc(&doc).unwrap();
|
||||
//! let snippet_html: String = snippet.to_html();
|
||||
//! assert_eq!(snippet_html, "Comme je descendais des Fleuves impassibles,\n Je ne me sentis plus guidé par les <b>haleurs</b> :\n Des");
|
||||
//! # Ok(())
|
||||
|
|
@ -115,6 +115,7 @@ impl FragmentCandidate {
|
|||
#[derive(Debug)]
|
||||
pub struct Snippet {
|
||||
fragment: String,
|
||||
fragment_range: Range<usize>,
|
||||
highlighted: Vec<Range<usize>>,
|
||||
snippet_prefix: String,
|
||||
snippet_postfix: String,
|
||||
|
|
@ -122,30 +123,16 @@ pub struct Snippet {
|
|||
|
||||
impl Snippet {
|
||||
/// Create a new `Snippet`.
|
||||
fn new(fragment: &str, highlighted: Vec<Range<usize>>) -> Self {
|
||||
fn new(source_str: &str, source_range: Range<usize>, highlighted: Vec<Range<usize>>) -> Self {
|
||||
Self {
|
||||
fragment: fragment.to_string(),
|
||||
fragment: source_str[source_range.clone()].to_string(),
|
||||
fragment_range: source_range,
|
||||
highlighted,
|
||||
snippet_prefix: DEFAULT_SNIPPET_PREFIX.to_string(),
|
||||
snippet_postfix: DEFAULT_SNIPPET_POSTFIX.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new, empty, `Snippet`.
|
||||
pub fn empty() -> Snippet {
|
||||
Snippet {
|
||||
fragment: String::new(),
|
||||
highlighted: Vec::new(),
|
||||
snippet_prefix: String::new(),
|
||||
snippet_postfix: String::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `true` if the snippet is empty.
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.highlighted.len() == 0
|
||||
}
|
||||
|
||||
/// Returns a highlighted html from the `Snippet`.
|
||||
pub fn to_html(&self) -> String {
|
||||
let mut html = String::new();
|
||||
|
|
@ -169,6 +156,12 @@ impl Snippet {
|
|||
&self.fragment
|
||||
}
|
||||
|
||||
/// Returns the range of the original text that the fragment was extracted
|
||||
/// from.
|
||||
pub fn range(&self) -> Range<usize> {
|
||||
self.fragment_range.clone()
|
||||
}
|
||||
|
||||
/// Returns a list of highlighted positions from the `Snippet`.
|
||||
pub fn highlighted(&self) -> &[Range<usize>] {
|
||||
&self.highlighted
|
||||
|
|
@ -231,7 +224,10 @@ fn search_fragments(
|
|||
///
|
||||
/// Takes a vector of `FragmentCandidate`s and the text.
|
||||
/// Figures out the best fragment from it and creates a snippet.
|
||||
fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str) -> Snippet {
|
||||
fn select_best_fragment_combination(
|
||||
fragments: &[FragmentCandidate],
|
||||
text: &str,
|
||||
) -> Option<Snippet> {
|
||||
let best_fragment_opt = fragments.iter().max_by(|left, right| {
|
||||
let cmp_score = left
|
||||
.score
|
||||
|
|
@ -243,18 +239,21 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
|
|||
cmp_score
|
||||
}
|
||||
});
|
||||
if let Some(fragment) = best_fragment_opt {
|
||||
let fragment_text = &text[fragment.start_offset..fragment.stop_offset];
|
||||
let highlighted = fragment
|
||||
.highlighted
|
||||
.iter()
|
||||
.map(|item| item.start - fragment.start_offset..item.end - fragment.start_offset)
|
||||
.collect();
|
||||
Snippet::new(fragment_text, highlighted)
|
||||
} else {
|
||||
// When there are no fragments to chose from,
|
||||
// for now create an empty snippet.
|
||||
Snippet::empty()
|
||||
match best_fragment_opt {
|
||||
Some(fragment) => {
|
||||
let highlighted = fragment
|
||||
.highlighted
|
||||
.iter()
|
||||
.map(|item| item.start - fragment.start_offset..item.end - fragment.start_offset)
|
||||
.collect();
|
||||
|
||||
Some(Snippet::new(
|
||||
text,
|
||||
fragment.start_offset..fragment.stop_offset,
|
||||
highlighted,
|
||||
))
|
||||
}
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -368,7 +367,7 @@ fn is_sorted(mut it: impl Iterator<Item = usize>) -> bool {
|
|||
/// # let searcher = reader.searcher();
|
||||
/// let mut snippet_generator = SnippetGenerator::create(&searcher, &*query, text_field)?;
|
||||
/// snippet_generator.set_max_num_chars(100);
|
||||
/// let snippet = snippet_generator.snippet_from_doc(&doc);
|
||||
/// let snippet = snippet_generator.snippet_from_doc(&doc).unwrap();
|
||||
/// let snippet_html: String = snippet.to_html();
|
||||
/// assert_eq!(snippet_html, "Comme je descendais des Fleuves impassibles,\n Je ne me sentis plus guidé par les <b>haleurs</b> :\n Des");
|
||||
/// # Ok(())
|
||||
|
|
@ -441,11 +440,7 @@ impl SnippetGenerator {
|
|||
&self.terms_text
|
||||
}
|
||||
|
||||
/// Generates a snippet for the given `Document`.
|
||||
///
|
||||
/// This method extract the text associated with the `SnippetGenerator`'s field
|
||||
/// and computes a snippet.
|
||||
pub fn snippet_from_doc<D: Document>(&self, doc: &D) -> Snippet {
|
||||
fn text_from_doc<D: Document>(&self, doc: &D) -> String {
|
||||
let mut text = String::new();
|
||||
for (field, value) in doc.iter_fields_and_values() {
|
||||
let value = value as D::Value<'_>;
|
||||
|
|
@ -459,19 +454,68 @@ impl SnippetGenerator {
|
|||
}
|
||||
}
|
||||
|
||||
self.snippet(text.trim())
|
||||
text
|
||||
}
|
||||
|
||||
/// Generates a snippet for the given `Document`.
|
||||
///
|
||||
/// This method extract the text associated with the `SnippetGenerator`'s field
|
||||
/// and computes a snippet.
|
||||
pub fn snippet_from_doc<D: Document>(&self, doc: &D) -> Option<Snippet> {
|
||||
self.snippet(self.text_from_doc(doc).trim())
|
||||
}
|
||||
|
||||
/// Generates snippets for the given `Document`.
|
||||
///
|
||||
/// This method extract the text associated with the `SnippetGenerator`'s field
|
||||
/// and computes snippets.
|
||||
pub fn snippets_from_doc<D: Document>(&self, doc: &D) -> Vec<Snippet> {
|
||||
self.snippets(self.text_from_doc(doc).trim())
|
||||
}
|
||||
|
||||
/// Generates a snippet for the given text.
|
||||
pub fn snippet(&self, text: &str) -> Snippet {
|
||||
pub fn snippet(&self, text: &str) -> Option<Snippet> {
|
||||
let fragment_candidates = search_fragments(
|
||||
&mut self.tokenizer.clone(),
|
||||
text,
|
||||
&self.terms_text,
|
||||
self.max_num_chars,
|
||||
);
|
||||
|
||||
select_best_fragment_combination(&fragment_candidates[..], text)
|
||||
}
|
||||
|
||||
/// Generates a snippet for the given text.
|
||||
pub fn snippets(&self, text: &str) -> Vec<Snippet> {
|
||||
let fragment_candidates = search_fragments(
|
||||
&mut self.tokenizer.clone(),
|
||||
text,
|
||||
&self.terms_text,
|
||||
self.max_num_chars,
|
||||
);
|
||||
|
||||
let snippets = fragment_candidates
|
||||
.iter()
|
||||
.filter(|f| f.score > 0.0)
|
||||
.map(|fragment| {
|
||||
let highlighted = fragment
|
||||
.highlighted
|
||||
.iter()
|
||||
.map(|item| {
|
||||
item.start - fragment.start_offset..item.end - fragment.start_offset
|
||||
})
|
||||
.collect();
|
||||
|
||||
Snippet::new(
|
||||
text,
|
||||
fragment.start_offset..fragment.stop_offset,
|
||||
highlighted,
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
snippets
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
|
@ -520,7 +564,7 @@ Survey in 2016, 2017, and 2018."#;
|
|||
assert_eq!(first.score, 1.9);
|
||||
assert_eq!(first.stop_offset, 89);
|
||||
}
|
||||
let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT);
|
||||
let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT).unwrap();
|
||||
assert_eq!(
|
||||
snippet.fragment,
|
||||
"Rust is a systems programming language sponsored by\nMozilla which describes it as a \
|
||||
|
|
@ -551,7 +595,7 @@ Survey in 2016, 2017, and 2018."#;
|
|||
assert_eq!(first.score, 1.0);
|
||||
assert_eq!(first.stop_offset, 17);
|
||||
}
|
||||
let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT);
|
||||
let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT).unwrap();
|
||||
assert_eq!(snippet.to_html(), "<b>Rust</b> is a systems")
|
||||
}
|
||||
{
|
||||
|
|
@ -571,7 +615,7 @@ Survey in 2016, 2017, and 2018."#;
|
|||
assert_eq!(first.score, 0.9);
|
||||
assert_eq!(first.stop_offset, 17);
|
||||
}
|
||||
let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT);
|
||||
let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT).unwrap();
|
||||
assert_eq!(snippet.to_html(), "programming <b>language</b>")
|
||||
}
|
||||
}
|
||||
|
|
@ -594,7 +638,7 @@ Survey in 2016, 2017, and 2018."#;
|
|||
assert_eq!(first.stop_offset, 7);
|
||||
}
|
||||
|
||||
let snippet = select_best_fragment_combination(&fragments[..], text);
|
||||
let snippet = select_best_fragment_combination(&fragments[..], text).unwrap();
|
||||
assert_eq!(snippet.fragment, "c d");
|
||||
assert_eq!(snippet.to_html(), "<b>c</b> d");
|
||||
}
|
||||
|
|
@ -617,7 +661,7 @@ Survey in 2016, 2017, and 2018."#;
|
|||
assert_eq!(first.start_offset, 8);
|
||||
}
|
||||
|
||||
let snippet = select_best_fragment_combination(&fragments[..], text);
|
||||
let snippet = select_best_fragment_combination(&fragments[..], text).unwrap();
|
||||
assert_eq!(snippet.fragment, "e f");
|
||||
assert_eq!(snippet.to_html(), "e <b>f</b>");
|
||||
}
|
||||
|
|
@ -641,7 +685,7 @@ Survey in 2016, 2017, and 2018."#;
|
|||
assert_eq!(first.start_offset, 0);
|
||||
}
|
||||
|
||||
let snippet = select_best_fragment_combination(&fragments[..], text);
|
||||
let snippet = select_best_fragment_combination(&fragments[..], text).unwrap();
|
||||
assert_eq!(snippet.fragment, "e f g");
|
||||
assert_eq!(snippet.to_html(), "e <b>f</b> g");
|
||||
}
|
||||
|
|
@ -659,9 +703,7 @@ Survey in 2016, 2017, and 2018."#;
|
|||
assert_eq!(fragments.len(), 0);
|
||||
|
||||
let snippet = select_best_fragment_combination(&fragments[..], text);
|
||||
assert_eq!(snippet.fragment, "");
|
||||
assert_eq!(snippet.to_html(), "");
|
||||
assert!(snippet.is_empty());
|
||||
assert!(snippet.is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -674,9 +716,7 @@ Survey in 2016, 2017, and 2018."#;
|
|||
assert_eq!(fragments.len(), 0);
|
||||
|
||||
let snippet = select_best_fragment_combination(&fragments[..], text);
|
||||
assert_eq!(snippet.fragment, "");
|
||||
assert_eq!(snippet.to_html(), "");
|
||||
assert!(snippet.is_empty());
|
||||
assert!(snippet.is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -753,7 +793,7 @@ Survey in 2016, 2017, and 2018."#;
|
|||
let mut snippet_generator =
|
||||
SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
|
||||
{
|
||||
let snippet = snippet_generator.snippet(TEST_TEXT);
|
||||
let snippet = snippet_generator.snippet(TEST_TEXT).unwrap();
|
||||
assert_eq!(
|
||||
snippet.to_html(),
|
||||
"imperative-procedural paradigms. <b>Rust</b> is syntactically similar to \
|
||||
|
|
@ -763,7 +803,7 @@ Survey in 2016, 2017, and 2018."#;
|
|||
}
|
||||
{
|
||||
snippet_generator.set_max_num_chars(90);
|
||||
let snippet = snippet_generator.snippet(TEST_TEXT);
|
||||
let snippet = snippet_generator.snippet(TEST_TEXT).unwrap();
|
||||
assert_eq!(
|
||||
snippet.to_html(),
|
||||
"<b>Rust</b> is syntactically similar to C++[according to whom?],\nbut its \
|
||||
|
|
@ -796,7 +836,7 @@ Survey in 2016, 2017, and 2018."#;
|
|||
assert_eq!(first.stop_offset, 3);
|
||||
}
|
||||
|
||||
let snippet = select_best_fragment_combination(&fragments[..], text);
|
||||
let snippet = select_best_fragment_combination(&fragments[..], text).unwrap();
|
||||
assert_eq!(snippet.fragment, "abc");
|
||||
assert_eq!(snippet.to_html(), "<b>abc</b>");
|
||||
}
|
||||
|
|
@ -810,7 +850,7 @@ Survey in 2016, 2017, and 2018."#;
|
|||
&terms,
|
||||
100,
|
||||
);
|
||||
let mut snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT);
|
||||
let mut snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT).unwrap();
|
||||
assert_eq!(
|
||||
snippet.to_html(),
|
||||
"<b>Rust</b> is a systems programming <b>language</b> sponsored by\nMozilla which \
|
||||
|
|
@ -824,6 +864,80 @@ Survey in 2016, 2017, and 2018."#;
|
|||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_snippet_absolute_offsets() {
|
||||
let text = "First sentence. The quick brown fox jumps over the lazy dog. Last sentence.";
|
||||
let terms = btreemap! {
|
||||
String::from("fox") => 1.0,
|
||||
String::from("dog") => 0.9
|
||||
};
|
||||
|
||||
let fragments = search_fragments(
|
||||
&mut From::from(SimpleTokenizer::default()),
|
||||
text,
|
||||
&terms,
|
||||
100,
|
||||
);
|
||||
|
||||
let snippet = select_best_fragment_combination(&fragments[..], text).unwrap();
|
||||
|
||||
// verify fragment range points to correct substring
|
||||
// max_num_chars is 100, so our fragment should be the entire text
|
||||
assert_eq!(snippet.fragment_range, 0..text.len() - 1);
|
||||
assert_eq!(&text[snippet.fragment_range.clone()], snippet.fragment);
|
||||
|
||||
// verify highlighted ranges are correct relative to original text
|
||||
let absolute_highlights: Vec<&str> = snippet
|
||||
.highlighted
|
||||
.iter()
|
||||
.map(|highlight| {
|
||||
(highlight.start + snippet.fragment_range.start)
|
||||
..(highlight.end + snippet.fragment_range.start)
|
||||
})
|
||||
.map(|range| &text[range])
|
||||
.collect();
|
||||
|
||||
// "fox" and "dog" positions in original text
|
||||
assert!(absolute_highlights.contains(&"fox")); // "fox"
|
||||
assert!(absolute_highlights.contains(&"dog")); // "dog"
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_snippet_absolute_offsets_with_truncation() {
|
||||
let text = "Intro text. The quick brown fox jumps over the lazy dog. The quick brown fox \
|
||||
jumps again. End text.";
|
||||
let terms = btreemap! {
|
||||
String::from("fox") => 1.0,
|
||||
String::from("quick") => 0.9
|
||||
};
|
||||
|
||||
let fragments = search_fragments(
|
||||
&mut From::from(SimpleTokenizer::default()),
|
||||
text,
|
||||
&terms,
|
||||
30, // short max chars to force truncation
|
||||
);
|
||||
|
||||
let snippet = select_best_fragment_combination(&fragments[..], text).unwrap();
|
||||
|
||||
// verify fragment range points to correct substring
|
||||
assert_eq!(&text[snippet.fragment_range.clone()], snippet.fragment);
|
||||
|
||||
// verify highlighted ranges are correct relative to original text
|
||||
let absolute_highlights: Vec<&str> = snippet
|
||||
.highlighted
|
||||
.iter()
|
||||
.map(|range| {
|
||||
(range.start + snippet.fragment_range.start)
|
||||
..(range.end + snippet.fragment_range.start)
|
||||
})
|
||||
.map(|range| &text[range])
|
||||
.collect();
|
||||
|
||||
assert!(absolute_highlights.contains(&"quick")); // "quick"
|
||||
assert!(absolute_highlights.contains(&"fox")); // "fox"
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_collapse_overlapped_ranges() {
|
||||
#![allow(clippy::single_range_in_vec_init)]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue