mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-30 23:20:40 +00:00
feat: SnippetGenerator accepts limit/offset (#66)
This commit is contained in:
@@ -74,10 +74,9 @@ const DEFAULT_SNIPPET_POSTFIX: &str = "</b>";
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct FragmentCandidate {
|
||||
score: Score,
|
||||
start_offset: usize,
|
||||
stop_offset: usize,
|
||||
highlighted: Vec<Range<usize>>,
|
||||
highlighted: Vec<(Range<usize>, Score)>,
|
||||
}
|
||||
|
||||
impl FragmentCandidate {
|
||||
@@ -88,7 +87,6 @@ impl FragmentCandidate {
|
||||
/// stop_offset is set to start_offset, which is taken as a param.
|
||||
fn new(start_offset: usize) -> FragmentCandidate {
|
||||
FragmentCandidate {
|
||||
score: 0.0,
|
||||
start_offset,
|
||||
stop_offset: start_offset,
|
||||
highlighted: vec![],
|
||||
@@ -104,10 +102,22 @@ impl FragmentCandidate {
|
||||
self.stop_offset = token.offset_to;
|
||||
|
||||
if let Some(&score) = terms.get(&token.text.to_lowercase()) {
|
||||
self.score += score;
|
||||
self.highlighted.push(token.offset_from..token.offset_to);
|
||||
self.highlighted
|
||||
.push((token.offset_from..token.offset_to, score));
|
||||
}
|
||||
}
|
||||
|
||||
fn score(&self) -> Score {
|
||||
self.highlighted.iter().map(|(_, score)| score).sum()
|
||||
}
|
||||
|
||||
fn len(&self) -> usize {
|
||||
self.highlighted.len()
|
||||
}
|
||||
|
||||
fn is_empty(&self) -> bool {
|
||||
self.highlighted.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
/// `Snippet`
|
||||
@@ -143,7 +153,7 @@ impl Snippet {
|
||||
|
||||
/// Returns `true` if the snippet is empty.
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.highlighted.len() == 0
|
||||
self.highlighted.is_empty()
|
||||
}
|
||||
|
||||
/// Returns a highlighted html from the `Snippet`.
|
||||
@@ -207,24 +217,73 @@ fn search_fragments(
|
||||
text: &str,
|
||||
terms: &BTreeMap<String, Score>,
|
||||
max_num_chars: usize,
|
||||
fragment_limit: Option<usize>,
|
||||
fragment_offset: Option<usize>,
|
||||
) -> Vec<FragmentCandidate> {
|
||||
let mut token_stream = tokenizer.token_stream(text);
|
||||
let mut fragment = FragmentCandidate::new(0);
|
||||
let mut fragments: Vec<FragmentCandidate> = vec![];
|
||||
|
||||
// Process all fragments first, without applying offset/limit to token stream
|
||||
while let Some(next) = token_stream.next() {
|
||||
if (next.offset_to - fragment.start_offset) > max_num_chars {
|
||||
if fragment.score > 0.0 {
|
||||
if fragment.score() > 0.0 {
|
||||
fragments.push(fragment)
|
||||
};
|
||||
fragment = FragmentCandidate::new(next.offset_from);
|
||||
}
|
||||
|
||||
fragment.try_add_token(next, terms);
|
||||
}
|
||||
if fragment.score > 0.0 {
|
||||
if fragment.score() > 0.0 {
|
||||
fragments.push(fragment)
|
||||
}
|
||||
|
||||
fragments
|
||||
if fragment_offset.is_none() && fragment_limit.is_none() {
|
||||
return fragments;
|
||||
}
|
||||
|
||||
// Skip the first offset snippets, and take the next limit snippets
|
||||
// across all FragmentCandidates
|
||||
let offset_count = fragment_offset.unwrap_or(0);
|
||||
let limit_count = fragment_limit.unwrap_or(fragments.len());
|
||||
|
||||
let mut remaining_offset = offset_count;
|
||||
let mut remaining_limit = limit_count;
|
||||
let mut filtered_fragments = Vec::new();
|
||||
|
||||
for mut fragment in fragments {
|
||||
if remaining_limit == 0 {
|
||||
break;
|
||||
}
|
||||
|
||||
let num_snippets = fragment.len();
|
||||
|
||||
if remaining_offset >= num_snippets {
|
||||
remaining_offset -= num_snippets;
|
||||
continue;
|
||||
}
|
||||
|
||||
let skip_from_this_fragment = remaining_offset;
|
||||
let take_from_this_fragment =
|
||||
std::cmp::min(num_snippets - skip_from_this_fragment, remaining_limit);
|
||||
|
||||
fragment.highlighted = fragment
|
||||
.highlighted
|
||||
.into_iter()
|
||||
.skip(skip_from_this_fragment)
|
||||
.take(take_from_this_fragment)
|
||||
.collect();
|
||||
|
||||
remaining_offset = 0;
|
||||
remaining_limit -= take_from_this_fragment;
|
||||
|
||||
if !fragment.is_empty() {
|
||||
filtered_fragments.push(fragment);
|
||||
}
|
||||
}
|
||||
|
||||
filtered_fragments
|
||||
}
|
||||
|
||||
/// Returns a Snippet
|
||||
@@ -234,8 +293,8 @@ fn search_fragments(
|
||||
fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str) -> Snippet {
|
||||
let best_fragment_opt = fragments.iter().max_by(|left, right| {
|
||||
let cmp_score = left
|
||||
.score
|
||||
.partial_cmp(&right.score)
|
||||
.score()
|
||||
.partial_cmp(&right.score())
|
||||
.unwrap_or(Ordering::Equal);
|
||||
if cmp_score == Ordering::Equal {
|
||||
(right.start_offset, right.stop_offset).cmp(&(left.start_offset, left.stop_offset))
|
||||
@@ -248,7 +307,7 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
|
||||
let highlighted = fragment
|
||||
.highlighted
|
||||
.iter()
|
||||
.map(|item| item.start - fragment.start_offset..item.end - fragment.start_offset)
|
||||
.map(|(item, _)| item.start - fragment.start_offset..item.end - fragment.start_offset)
|
||||
.collect();
|
||||
Snippet::new(fragment_text, highlighted)
|
||||
} else {
|
||||
@@ -379,6 +438,8 @@ pub struct SnippetGenerator {
|
||||
tokenizer: TextAnalyzer,
|
||||
field: Field,
|
||||
max_num_chars: usize,
|
||||
fragment_limit: Option<usize>,
|
||||
fragment_offset: Option<usize>,
|
||||
}
|
||||
|
||||
impl SnippetGenerator {
|
||||
@@ -394,8 +455,19 @@ impl SnippetGenerator {
|
||||
tokenizer,
|
||||
field,
|
||||
max_num_chars,
|
||||
fragment_limit: None,
|
||||
fragment_offset: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_limit(&mut self, limit: usize) {
|
||||
self.fragment_limit = Some(limit);
|
||||
}
|
||||
|
||||
pub fn set_offset(&mut self, offset: usize) {
|
||||
self.fragment_offset = Some(offset);
|
||||
}
|
||||
|
||||
/// Creates a new snippet generator
|
||||
pub fn create(
|
||||
searcher: &Searcher,
|
||||
@@ -438,6 +510,8 @@ impl SnippetGenerator {
|
||||
tokenizer,
|
||||
field,
|
||||
max_num_chars: DEFAULT_MAX_NUM_CHARS,
|
||||
fragment_limit: None,
|
||||
fragment_offset: None,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -479,6 +553,8 @@ impl SnippetGenerator {
|
||||
text,
|
||||
&self.terms_text,
|
||||
self.max_num_chars,
|
||||
self.fragment_limit,
|
||||
self.fragment_offset,
|
||||
);
|
||||
select_best_fragment_combination(&fragment_candidates[..], text)
|
||||
}
|
||||
@@ -523,11 +599,13 @@ Survey in 2016, 2017, and 2018."#;
|
||||
TEST_TEXT,
|
||||
&terms,
|
||||
100,
|
||||
None,
|
||||
None,
|
||||
);
|
||||
assert_eq!(fragments.len(), 7);
|
||||
{
|
||||
let first = &fragments[0];
|
||||
assert_eq!(first.score, 1.9);
|
||||
assert_eq!(first.score(), 1.9);
|
||||
assert_eq!(first.stop_offset, 89);
|
||||
}
|
||||
let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT);
|
||||
@@ -555,10 +633,12 @@ Survey in 2016, 2017, and 2018."#;
|
||||
TEST_TEXT,
|
||||
&terms,
|
||||
20,
|
||||
None,
|
||||
None,
|
||||
);
|
||||
{
|
||||
let first = &fragments[0];
|
||||
assert_eq!(first.score, 1.0);
|
||||
assert_eq!(first.score(), 1.0);
|
||||
assert_eq!(first.stop_offset, 17);
|
||||
}
|
||||
let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT);
|
||||
@@ -574,11 +654,13 @@ Survey in 2016, 2017, and 2018."#;
|
||||
TEST_TEXT,
|
||||
&terms,
|
||||
20,
|
||||
None,
|
||||
None,
|
||||
);
|
||||
// assert_eq!(fragments.len(), 7);
|
||||
{
|
||||
let first = &fragments[0];
|
||||
assert_eq!(first.score, 0.9);
|
||||
assert_eq!(first.score(), 0.9);
|
||||
assert_eq!(first.stop_offset, 17);
|
||||
}
|
||||
let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT);
|
||||
@@ -593,13 +675,19 @@ Survey in 2016, 2017, and 2018."#;
|
||||
let mut terms = BTreeMap::new();
|
||||
terms.insert(String::from("c"), 1.0);
|
||||
|
||||
let fragments =
|
||||
search_fragments(&mut From::from(SimpleTokenizer::default()), text, &terms, 3);
|
||||
let fragments = search_fragments(
|
||||
&mut From::from(SimpleTokenizer::default()),
|
||||
text,
|
||||
&terms,
|
||||
3,
|
||||
None,
|
||||
None,
|
||||
);
|
||||
|
||||
assert_eq!(fragments.len(), 1);
|
||||
{
|
||||
let first = &fragments[0];
|
||||
assert_eq!(first.score, 1.0);
|
||||
assert_eq!(first.score(), 1.0);
|
||||
assert_eq!(first.start_offset, 4);
|
||||
assert_eq!(first.stop_offset, 7);
|
||||
}
|
||||
@@ -616,13 +704,19 @@ Survey in 2016, 2017, and 2018."#;
|
||||
let mut terms = BTreeMap::new();
|
||||
terms.insert(String::from("f"), 1.0);
|
||||
|
||||
let fragments =
|
||||
search_fragments(&mut From::from(SimpleTokenizer::default()), text, &terms, 3);
|
||||
let fragments = search_fragments(
|
||||
&mut From::from(SimpleTokenizer::default()),
|
||||
text,
|
||||
&terms,
|
||||
3,
|
||||
None,
|
||||
None,
|
||||
);
|
||||
|
||||
assert_eq!(fragments.len(), 2);
|
||||
{
|
||||
let first = &fragments[0];
|
||||
assert_eq!(first.score, 1.0);
|
||||
assert_eq!(first.score(), 1.0);
|
||||
assert_eq!(first.stop_offset, 11);
|
||||
assert_eq!(first.start_offset, 8);
|
||||
}
|
||||
@@ -640,13 +734,19 @@ Survey in 2016, 2017, and 2018."#;
|
||||
terms.insert(String::from("f"), 1.0);
|
||||
terms.insert(String::from("a"), 0.9);
|
||||
|
||||
let fragments =
|
||||
search_fragments(&mut From::from(SimpleTokenizer::default()), text, &terms, 7);
|
||||
let fragments = search_fragments(
|
||||
&mut From::from(SimpleTokenizer::default()),
|
||||
text,
|
||||
&terms,
|
||||
7,
|
||||
None,
|
||||
None,
|
||||
);
|
||||
|
||||
assert_eq!(fragments.len(), 2);
|
||||
{
|
||||
let first = &fragments[0];
|
||||
assert_eq!(first.score, 0.9);
|
||||
assert_eq!(first.score(), 0.9);
|
||||
assert_eq!(first.stop_offset, 7);
|
||||
assert_eq!(first.start_offset, 0);
|
||||
}
|
||||
@@ -663,8 +763,14 @@ Survey in 2016, 2017, and 2018."#;
|
||||
let mut terms = BTreeMap::new();
|
||||
terms.insert(String::from("z"), 1.0);
|
||||
|
||||
let fragments =
|
||||
search_fragments(&mut From::from(SimpleTokenizer::default()), text, &terms, 3);
|
||||
let fragments = search_fragments(
|
||||
&mut From::from(SimpleTokenizer::default()),
|
||||
text,
|
||||
&terms,
|
||||
3,
|
||||
None,
|
||||
None,
|
||||
);
|
||||
|
||||
assert_eq!(fragments.len(), 0);
|
||||
|
||||
@@ -679,8 +785,14 @@ Survey in 2016, 2017, and 2018."#;
|
||||
let text = "a b c d";
|
||||
|
||||
let terms = BTreeMap::new();
|
||||
let fragments =
|
||||
search_fragments(&mut From::from(SimpleTokenizer::default()), text, &terms, 3);
|
||||
let fragments = search_fragments(
|
||||
&mut From::from(SimpleTokenizer::default()),
|
||||
text,
|
||||
&terms,
|
||||
3,
|
||||
None,
|
||||
None,
|
||||
);
|
||||
assert_eq!(fragments.len(), 0);
|
||||
|
||||
let snippet = select_best_fragment_combination(&fragments[..], text);
|
||||
@@ -794,12 +906,14 @@ Survey in 2016, 2017, and 2018."#;
|
||||
text,
|
||||
&terms,
|
||||
3,
|
||||
None,
|
||||
None,
|
||||
);
|
||||
|
||||
assert_eq!(fragments.len(), 1);
|
||||
{
|
||||
let first = &fragments[0];
|
||||
assert_eq!(first.score, 1.9);
|
||||
assert_eq!(first.score(), 1.9);
|
||||
assert_eq!(first.start_offset, 0);
|
||||
assert_eq!(first.stop_offset, 3);
|
||||
}
|
||||
@@ -817,6 +931,8 @@ Survey in 2016, 2017, and 2018."#;
|
||||
TEST_TEXT,
|
||||
&terms,
|
||||
100,
|
||||
None,
|
||||
None,
|
||||
);
|
||||
let mut snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT);
|
||||
assert_eq!(
|
||||
@@ -832,6 +948,44 @@ Survey in 2016, 2017, and 2018."#;
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_snippet_with_limit_and_offset() {
|
||||
let terms = btreemap! {
|
||||
String::from("rust") => 1.0,
|
||||
String::from("language") => 0.9,
|
||||
};
|
||||
let fragments = search_fragments(
|
||||
&mut From::from(SimpleTokenizer::default()),
|
||||
TEST_TEXT,
|
||||
&terms,
|
||||
100,
|
||||
Some(2),
|
||||
Some(1),
|
||||
);
|
||||
assert_eq!(fragments.len(), 2);
|
||||
{
|
||||
let first = &fragments[0];
|
||||
assert_eq!(first.score(), 0.9);
|
||||
assert_eq!(first.stop_offset, 89);
|
||||
}
|
||||
{
|
||||
let second = &fragments[1];
|
||||
assert_eq!(second.score(), 0.9);
|
||||
assert_eq!(second.stop_offset, 190);
|
||||
}
|
||||
let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT);
|
||||
assert_eq!(
|
||||
snippet.fragment,
|
||||
"Rust is a systems programming language sponsored by\nMozilla which describes it as a \
|
||||
\"safe"
|
||||
);
|
||||
assert_eq!(
|
||||
snippet.to_html(),
|
||||
"Rust is a systems programming <b>language</b> sponsored by\nMozilla which describes \
|
||||
it as a "safe"
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_collapse_overlapped_ranges() {
|
||||
#![allow(clippy::single_range_in_vec_init)]
|
||||
|
||||
Reference in New Issue
Block a user