From 19756bb7d6ae9414468059943edc6d8c0f45600c Mon Sep 17 00:00:00 2001
From: Paul Masurel
Date: Wed, 15 Aug 2018 10:52:50 +0900
Subject: [PATCH 01/20] Getting started on #368

---
 src/lib.rs         |  2 ++
 src/snippet/mod.rs | 88 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+)
 create mode 100644 src/snippet/mod.rs

diff --git a/src/lib.rs b/src/lib.rs
index 985d68a84..0d64752d9 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -209,6 +209,8 @@ pub mod schema;
 pub mod store;
 pub mod termdict;
 
+mod snippet;
+
 mod docset;
 pub use self::docset::{DocSet, SkipResult};
 
diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs
new file mode 100644
index 000000000..4cc1e41b9
--- /dev/null
+++ b/src/snippet/mod.rs
@@ -0,0 +1,88 @@
+use tokenizer::{TokenStream, Tokenizer};
+use std::collections::BTreeMap;
+use Term;
+use Document;
+use Index;
+use schema::FieldValue;
+use schema::Value;
+use tokenizer::BoxedTokenizer;
+
+pub struct HighlightSection {
+    start: usize,
+    stop: usize,
+}
+
+impl HighlightSection {
+    fn new(start: usize, stop: usize) -> HighlightSection {
+        HighlightSection {
+            start,
+            stop
+        }
+    }
+}
+
+pub struct FragmentCandidate {
+    score: f32,
+    start_offset: usize,
+    stop_offset: usize,
+    num_chars: usize,
+    highlighted: Vec<HighlightSection>,
+}
+
+pub struct Snippet {
+    fragments: Vec<FragmentCandidate>,
+}
+
+impl Snippet {
+    pub fn to_html() -> String {
+        unimplemented!();
+    }
+}
+
+/// Returns a list of "good" fragments.
+///
+/// If no target term is within the text, then the function
+/// should return an empty Vec.
+///
+/// If a target term is within the text, then the returned
+/// list is required to be non-empty.
+///
+/// The returned list is non-empty and contains fewer
+/// than 12 possibly overlapping fragments.
+///
+/// All fragments should contain at least one target term
+/// and have at most `max_num_chars` characters (not bytes).
+///
+/// It is ok to emit overlapping fragments, for instance,
+/// one short and one long containing the same keyword, in order
+/// to leave an optimization opportunity to the fragment selector
+/// upstream.
+///
+/// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`
+/// has to be a valid string.
+fn search_fragments<'a>(
+    tokenizer: &BoxedTokenizer,
+    text: &'a str,
+    terms: BTreeMap<String, f32>,
+    max_num_chars: usize) -> Vec<FragmentCandidate> {
+    unimplemented!();
+}
+
+fn select_best_fragment_combination(fragments_candidate: Vec<(&str, Vec<FragmentCandidate>)>, max_num_chars: usize) -> Snippet {
+    unimplemented!();
+}
+
+pub fn generate_snippet<'a>(
+    doc: &'a [FieldValue],
+    index: &Index,
+    terms: Vec<Term>,
+    max_num_chars: usize) -> Snippet {
+    unimplemented!();
+}
+
+
+#[cfg(test)]
+mod tests {
+    #[test]
+    fn test_snippet() {}
+}
\ No newline at end of file
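[Editorial note] The contract above can be made concrete with a small, self-contained sketch before the real implementation lands in the next patch. It uses plain `str::match_indices` rather than tantivy's tokenizer, and it measures bytes where the contract above counts characters; `toy_fragments` is a hypothetical name for illustration only:

    fn toy_fragments(text: &str, term: &str, max_num_chars: usize) -> Vec<(usize, usize)> {
        text.match_indices(term)
            .filter(|(_, hit)| hit.len() <= max_num_chars)
            .map(|(pos, hit)| {
                // Center a window of at most `max_num_chars` on the hit.
                let margin = (max_num_chars - hit.len()) / 2;
                let mut start = pos.saturating_sub(margin);
                let mut stop = (pos + hit.len() + margin).min(text.len());
                // Snap to char boundaries so `&text[start..stop]` is a valid str.
                while !text.is_char_boundary(start) { start += 1; }
                while !text.is_char_boundary(stop) { stop -= 1; }
                (start, stop)
            })
            .collect()
    }

    fn main() {
        let text = "Rust is a systems programming language";
        // Every returned range contains the term and spans at most 20 bytes.
        for (start, stop) in toy_fragments(text, "language", 20) {
            println!("{}", &text[start..stop]);
        }
        // No occurrence of the term means an empty Vec, as the doc comment requires.
        assert!(toy_fragments(text, "python", 20).is_empty());
    }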
From 835cdc2fe8f5ecceaaa1c65619af96fd8114b3c0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Vignesh=20Sarma=20K=20=28=E0=B4=B5=E0=B4=BF=E0=B4=98?=
 =?UTF-8?q?=E0=B5=8D=E0=B4=A8=E0=B5=87=E0=B4=B7=E0=B5=8D=20=E0=B4=B6?=
 =?UTF-8?q?=E0=B5=AA=E0=B4=AE=20=E0=B4=95=E0=B5=86=29?=
Date: Mon, 27 Aug 2018 22:14:59 +0530
Subject: [PATCH 02/20] Initial version of snippet refer #368

---
 src/snippet/mod.rs         | 141 ++++++++++++++++++++++++++++++++++---
 src/tokenizer/mod.rs       |   2 +-
 src/tokenizer/tokenizer.rs |   2 +-
 3 files changed, 132 insertions(+), 13 deletions(-)

diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs
index 4cc1e41b9..4356e0a80 100644
--- a/src/snippet/mod.rs
+++ b/src/snippet/mod.rs
@@ -1,4 +1,4 @@
-use tokenizer::{TokenStream, Tokenizer};
+use tokenizer::{TokenStream, Tokenizer, Token};
 use std::collections::BTreeMap;
 use Term;
 use Document;
@@ -7,6 +7,7 @@ use schema::FieldValue;
 use schema::Value;
 use tokenizer::BoxedTokenizer;
 
+#[derive(Debug)]
 pub struct HighlightSection {
     start: usize,
     stop: usize,
@@ -21,6 +22,7 @@ impl HighlightSection {
     }
 }
 
+#[derive(Debug)]
 pub struct FragmentCandidate {
     score: f32,
     start_offset: usize,
@@ -29,13 +31,53 @@ pub struct FragmentCandidate {
     highlighted: Vec<HighlightSection>,
 }
 
-pub struct Snippet {
-    fragments: Vec<FragmentCandidate>,
+impl FragmentCandidate {
+
+    fn new(start_offset: usize, end_offset: usize) -> FragmentCandidate {
+        FragmentCandidate{score: 0.0,
+                          start_offset: start_offset,
+                          stop_offset: end_offset,
+                          num_chars: 0,
+                          highlighted: vec![]}
+    }
+
+    /// Updates the `score` and `highlighted` fields of the object.
+    ///
+    ///
+    fn calculate_score(&mut self, token: &Token, terms: &BTreeMap<String, f32>) {
+        if let Some(score) = terms.get(&token.text.to_lowercase()) {
+            self.score += score;
+            self.highlighted.push(HighlightSection{start: token.offset_from,
+                                                   stop: token.offset_to});
+        }
+    }
 }
 
+#[derive(Debug)]
+pub struct Snippet {
+    fragments: String,
+    highlighted: Vec<HighlightSection>,
+}
+
+const HIGHLIGHTEN_PREFIX:&str = "<b>";
+const HIGHLIGHTEN_POSTFIX:&str = "</b>";
+
 impl Snippet {
-    pub fn to_html() -> String {
-        unimplemented!();
+
+    /// Returns a highlighted HTML string from the `Snippet`.
+    pub fn to_html(&self) -> String {
+        let mut html = String::new();
+        let mut start_from: usize = 0;
+
+        for item in self.highlighted.iter() {
+            html.push_str(&self.fragments[start_from..item.start]);
+            html.push_str(HIGHLIGHTEN_PREFIX);
+            html.push_str(&self.fragments[item.start..item.stop]);
+            html.push_str(HIGHLIGHTEN_POSTFIX);
+            start_from = item.stop;
+        }
+        html.push_str(&self.fragments[start_from..self.fragments.len()]);
+        html
     }
 }
 
 /// Returns a list of "good" fragments.
 ///
 /// If no target term is within the text, then the function
 /// should return an empty Vec.
 ///
 /// If a target term is within the text, then the returned
 /// list is required to be non-empty.
 ///
 /// The returned list is non-empty and contains fewer
 /// than 12 possibly overlapping fragments.
 ///
 /// All fragments should contain at least one target term
 /// and have at most `max_num_chars` characters (not bytes).
 ///
 /// It is ok to emit overlapping fragments, for instance,
 /// one short and one long containing the same keyword, in order
 /// to leave an optimization opportunity to the fragment selector
 /// upstream.
 ///
 /// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`
 /// has to be a valid string.
 fn search_fragments<'a>(
-    tokenizer: &BoxedTokenizer,
+    tokenizer: Box<BoxedTokenizer>,
     text: &'a str,
     terms: BTreeMap<String, f32>,
     max_num_chars: usize) -> Vec<FragmentCandidate> {
-    unimplemented!();
+    let mut token_stream = tokenizer.token_stream(text);
+    let mut fragment = FragmentCandidate::new(0, 0);
+    let mut fragments:Vec<FragmentCandidate> = vec![];
+
+    loop {
+        if let Some(next) = token_stream.next() {
+            if (next.offset_to - fragment.start_offset) > max_num_chars {
+                let txt = &text[fragment.start_offset..fragment.stop_offset];
+                if fragment.score > 0.0 {
+                    fragments.push(fragment)
+                };
+                fragment = FragmentCandidate::new(next.offset_from, next.offset_to);
+            } else {
+                fragment.calculate_score(next, &terms);
+                fragment.stop_offset = next.offset_to;
+            }
+        } else {
+            let txt = &text[fragment.start_offset..fragment.stop_offset];
+            if fragment.score > 0.0 {
+                fragments.push(fragment)
+            };
+            break;
+        }
+    }
+
+    fragments
 }
 
-fn select_best_fragment_combination(fragments_candidate: Vec<(&str, Vec<FragmentCandidate>)>, max_num_chars: usize) -> Snippet {
-    unimplemented!();
+/// Returns a Snippet
+///
+/// Takes a vector of `FragmentCandidate`s and the text.
+/// Figures out the best fragment from it and creates a snippet.
+fn select_best_fragment_combination<'a>(fragments: Vec<FragmentCandidate>,
+                                        text: &'a str,) -> Snippet {
+    if let Some(init) = fragments.iter().nth(0) {
+        let fragment = fragments.iter().skip(1).fold(init, |acc, item| {
+            if item.score > init.score { item } else { init }
+        });
+        let fragment_text = &text[fragment.start_offset..fragment.stop_offset];
+        let highlighted = fragment.highlighted.iter().map(|item| {
+            HighlightSection{start: item.start-fragment.start_offset,
+                             stop: item.stop-fragment.start_offset}
+        }).collect();
+        Snippet{fragments: fragment_text.to_owned(),
+                highlighted: highlighted}
+    } else {
+        // when there are no fragments to choose from,
+        // create an empty snippet for now
+        Snippet{fragments: String::new(),
+                highlighted: vec![]}
+    }
 }
 
 pub fn generate_snippet<'a>(
     doc: &'a [FieldValue],
     index: &Index,
     terms: Vec<Term>,
     max_num_chars: usize) -> Snippet {
     unimplemented!();
 }
 
 
 #[cfg(test)]
 mod tests {
+    use tokenizer::{SimpleTokenizer, box_tokenizer};
+    use std::iter::Iterator;
+    use std::collections::BTreeMap;
+    use super::{search_fragments, select_best_fragment_combination};
+
     #[test]
-    fn test_snippet() {}
-}
\ No newline at end of file
+    fn test_snippet() {
+        let tokenizer = SimpleTokenizer;
+
+        let t = box_tokenizer(tokenizer);
+
+        let text = "Rust is a systems programming language sponsored by Mozilla which describes it as a \"safe, concurrent, practical language\", supporting functional and imperative-procedural paradigms. Rust is syntactically similar to C++[according to whom?], but its designers intend it to provide better memory safety while still maintaining performance.
+
+Rust is free and open-source software, released under an MIT License, or Apache License 2.0. Its designers have refined the language through the experiences of writing the Servo web browser layout engine[14] and the Rust compiler. A large proportion of current commits to the project are from community members.[15]
+
+Rust won first place for \"most loved programming language\" in the Stack Overflow Developer Survey in 2016, 2017, and 2018.
+";
+
+        let mut terms = BTreeMap::new();
+        terms.insert(String::from("rust"), 1.0);
+        terms.insert(String::from("language"), 0.9);
+
+        let fragments = search_fragments(t, &text, terms, 100);
+        assert_eq!(fragments.len(), 7);
+        {
+            let first = fragments.iter().nth(0).unwrap();
+            assert_eq!(first.score, 1.9);
+            assert_eq!(first.stop_offset, 89);
+        }
+        let snippet = select_best_fragment_combination(fragments, &text);
+        assert_eq!(snippet.fragments, "Rust is a systems programming language sponsored by Mozilla which describes it as a \"safe".to_owned());
+        assert_eq!(snippet.to_html(), "<b>Rust</b> is a systems programming <b>language</b> sponsored by Mozilla which describes it as a \"safe".to_owned())
+    }
+}
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index fd0bfbbde..d4a735bd2 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -153,7 +153,7 @@ pub use self::simple_tokenizer::SimpleTokenizer;
 pub use self::stemmer::Stemmer;
 pub use self::stop_word_filter::StopWordFilter;
 pub(crate) use self::token_stream_chain::TokenStreamChain;
-pub use self::tokenizer::BoxedTokenizer;
+pub use self::tokenizer::{BoxedTokenizer, box_tokenizer};
 pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
 pub use self::tokenizer_manager::TokenizerManager;
 
diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs
index fcdf8f21b..e806b70d8 100644
--- a/src/tokenizer/tokenizer.rs
+++ b/src/tokenizer/tokenizer.rs
@@ -130,7 +130,7 @@ where
     }
 }
 
-pub(crate) fn box_tokenizer<A>(a: A) -> Box<BoxedTokenizer>
+pub fn box_tokenizer<A>(a: A) -> Box<BoxedTokenizer>
 where
     A: 'static + Send + Sync + for<'a> Tokenizer<'a>,
 {
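[Editorial note] The `to_html` introduced above walks the highlight sections left to right, copying the unhighlighted gap before each section, the section itself wrapped in `<b>`/`</b>`, and finally the tail after the last section. This only works because `select_best_fragment_combination` rebases every `HighlightSection` by `fragment.start_offset`, so the offsets index into the extracted fragment rather than the whole document. A reduced, self-contained sketch of that interleaving (illustrative names, not the patch's API):

    struct Section {
        start: usize, // byte offset into the fragment
        stop: usize,
    }

    // Sections must be sorted and non-overlapping, which the token stream guarantees.
    fn render(fragment: &str, highlighted: &[Section]) -> String {
        let mut html = String::new();
        let mut cursor = 0;
        for s in highlighted {
            html.push_str(&fragment[cursor..s.start]); // gap before the highlight
            html.push_str("<b>");
            html.push_str(&fragment[s.start..s.stop]);
            html.push_str("</b>");
            cursor = s.stop;
        }
        html.push_str(&fragment[cursor..]); // tail after the last highlight
        html
    }

    fn main() {
        // "language" occupies bytes 10..18 of this fragment.
        let fragment = "a systems language";
        let sections = [Section { start: 10, stop: 18 }];
        assert_eq!(render(fragment, &sections), "a systems <b>language</b>");
    }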
From 46decdb0ea60cc7c2e274b6bfaed180d1ecad0dc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Vignesh=20Sarma=20K=20=28=E0=B4=B5=E0=B4=BF=E0=B4=98?=
 =?UTF-8?q?=E0=B5=8D=E0=B4=A8=E0=B5=87=E0=B4=B7=E0=B5=8D=20=E0=B4=B6?=
 =?UTF-8?q?=E0=B5=AA=E0=B4=AE=20=E0=B4=95=E0=B5=86=29?=
Date: Mon, 27 Aug 2018 22:16:47 +0530
Subject: [PATCH 03/20] compare against accumulator rather than init value

---
 src/snippet/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs
index 4356e0a80..2429ac2e0 100644
--- a/src/snippet/mod.rs
+++ b/src/snippet/mod.rs
@@ -143,7 +143,7 @@ fn select_best_fragment_combination<'a>(fragments: Vec<FragmentCandidate>,
                                         text: &'a str,) -> Snippet {
     if let Some(init) = fragments.iter().nth(0) {
         let fragment = fragments.iter().skip(1).fold(init, |acc, item| {
-            if item.score > init.score { item } else { init }
+            if item.score > acc.score { item } else { acc }
         });
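[Editorial note] The one-word fix above is the classic fold-for-maximum pitfall: each step has to compare against the accumulator `acc`, otherwise every candidate is compared with the first element and the best fragment found so far is never carried forward. The same selection can be cross-checked with `Iterator::max_by` — a sketch, noting that the `f32` scores need an explicit comparator because `f32` is only `PartialOrd`:

    fn main() {
        let scores = [0.5_f32, 1.9, 0.8];

        // Correct fold: compare against the accumulator at every step.
        let best_fold = scores
            .iter()
            .fold(scores[0], |acc, &s| if s > acc { s } else { acc });

        // Equivalent selection with max_by.
        let best_max = scores
            .iter()
            .cloned()
            .max_by(|a, b| a.partial_cmp(b).expect("scores must not be NaN"))
            .unwrap();

        assert_eq!(best_fold, best_max); // both pick 1.9
    }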
+"; + + let mut terms = BTreeMap::new(); + terms.insert(String::from("rust"), 1.0); + terms.insert(String::from("language"), 0.9); + + let fragments = search_fragments(t, &text, terms, 100); + assert_eq!(fragments.len(), 7); + { + let first = fragments.iter().nth(0).unwrap(); + assert_eq!(first.score, 1.9); + assert_eq!(first.stop_offset, 89); + } + let snippet = select_best_fragment_combination(fragments, &text); + assert_eq!(snippet.fragments, "Rust is a systems programming language sponsored by Mozilla which describes it as a \"safe".to_owned()); + assert_eq!(snippet.to_html(), "Rust is a systems programming language sponsored by Mozilla which describes it as a \"safe".to_owned()) + } +} diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index fd0bfbbde..d4a735bd2 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -153,7 +153,7 @@ pub use self::simple_tokenizer::SimpleTokenizer; pub use self::stemmer::Stemmer; pub use self::stop_word_filter::StopWordFilter; pub(crate) use self::token_stream_chain::TokenStreamChain; -pub use self::tokenizer::BoxedTokenizer; +pub use self::tokenizer::{BoxedTokenizer, box_tokenizer}; pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer}; pub use self::tokenizer_manager::TokenizerManager; diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs index fcdf8f21b..e806b70d8 100644 --- a/src/tokenizer/tokenizer.rs +++ b/src/tokenizer/tokenizer.rs @@ -130,7 +130,7 @@ where } } -pub(crate) fn box_tokenizer(a: A) -> Box +pub fn box_tokenizer(a: A) -> Box where A: 'static + Send + Sync + for<'a> Tokenizer<'a>, { From 46decdb0ea60cc7c2e274b6bfaed180d1ecad0dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vignesh=20Sarma=20K=20=28=E0=B4=B5=E0=B4=BF=E0=B4=98?= =?UTF-8?q?=E0=B5=8D=E0=B4=A8=E0=B5=87=E0=B4=B7=E0=B5=8D=20=E0=B4=B6?= =?UTF-8?q?=E0=B5=AA=E0=B4=AE=20=E0=B4=95=E0=B5=86=29?= Date: Mon, 27 Aug 2018 22:16:47 +0530 Subject: [PATCH 03/20] compare against accumulator rather than init value --- src/snippet/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 4356e0a80..2429ac2e0 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -143,7 +143,7 @@ fn select_best_fragment_combination<'a>(fragments: Vec, text: &'a str,) -> Snippet { if let Some(init) = fragments.iter().nth(0) { let fragment = fragments.iter().skip(1).fold(init, |acc, item| { - if item.score > init.score { item } else { init } + if item.score > acc.score { item } else { acc } }); let fragment_text = &text[fragment.start_offset..fragment.stop_offset]; let highlighted = fragment.highlighted.iter().map(|item| { From b373f0084089cfd75b0967a5ef90d8feb0a6f5ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vignesh=20Sarma=20K=20=28=E0=B4=B5=E0=B4=BF=E0=B4=98?= =?UTF-8?q?=E0=B5=8D=E0=B4=A8=E0=B5=87=E0=B4=B7=E0=B5=8D=20=E0=B4=B6?= =?UTF-8?q?=E0=B5=AA=E0=B4=AE=20=E0=B4=95=E0=B5=86=29?= Date: Tue, 28 Aug 2018 15:06:09 +0530 Subject: [PATCH 04/20] add htmlescape and update to_html fn to use it. tests and imports also updated. 
From 8438eda01a05792a29c22b2f1771285659bd25b5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Vignesh=20Sarma=20K=20=28=E0=B4=B5=E0=B4=BF=E0=B4=98?=
 =?UTF-8?q?=E0=B5=8D=E0=B4=A8=E0=B5=87=E0=B4=B7=E0=B5=8D=20=E0=B4=B6?=
 =?UTF-8?q?=E0=B5=AA=E0=B4=AE=20=E0=B4=95=E0=B5=86=29?=
Date: Tue, 28 Aug 2018 15:11:50 +0530
Subject: [PATCH 05/20] use while let instead of loop and if.

as per CR comment
---
 src/snippet/mod.rs | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs
index 7413b8bb8..a8a7bb194 100644
--- a/src/snippet/mod.rs
+++ b/src/snippet/mod.rs
@@ -112,26 +112,20 @@ fn search_fragments<'a>(
     let mut fragment = FragmentCandidate::new(0, 0);
     let mut fragments:Vec<FragmentCandidate> = vec![];
 
-    loop {
-        if let Some(next) = token_stream.next() {
-            if (next.offset_to - fragment.start_offset) > max_num_chars {
-                let txt = &text[fragment.start_offset..fragment.stop_offset];
-                if fragment.score > 0.0 {
-                    fragments.push(fragment)
-                };
-                fragment = FragmentCandidate::new(next.offset_from, next.offset_to);
-            } else {
-                fragment.calculate_score(next, &terms);
-                fragment.stop_offset = next.offset_to;
-            }
-        } else {
-            let txt = &text[fragment.start_offset..fragment.stop_offset];
+    while let Some(next) = token_stream.next() {
+        if (next.offset_to - fragment.start_offset) > max_num_chars {
             if fragment.score > 0.0 {
                 fragments.push(fragment)
             };
-            break;
+            fragment = FragmentCandidate::new(next.offset_from, next.offset_to);
+        } else {
+            fragment.calculate_score(next, &terms);
+            fragment.stop_offset = next.offset_to;
         }
     }
+    if fragment.score > 0.0 {
+        fragments.push(fragment)
+    }
 
     fragments
 }
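[Editorial note] Besides being shorter, the `while let` version makes the end-of-stream handling explicit: the trailing, still-open fragment is flushed exactly once after the loop instead of inside a duplicated `else` branch. That push-after-loop shape is the standard pattern for any chunking scan; a generic sketch under that framing (`chunks` is a hypothetical helper, not part of the patch):

    // Group items into chunks of at most `n`, flushing the trailing partial chunk.
    fn chunks<I: Iterator<Item = u32>>(mut it: I, n: usize) -> Vec<Vec<u32>> {
        let mut out = Vec::new();
        let mut cur = Vec::new();
        while let Some(x) = it.next() {
            if cur.len() == n {
                out.push(std::mem::replace(&mut cur, Vec::new()));
            }
            cur.push(x);
        }
        if !cur.is_empty() {
            out.push(cur); // the same final flush `search_fragments` now performs
        }
        out
    }

    fn main() {
        assert_eq!(chunks(1..=5u32, 2), vec![vec![1, 2], vec![3, 4], vec![5]]);
    }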
From e1bca6db9d80e9609eefdd6b2f66ff729d80bcfa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Vignesh=20Sarma=20K=20=28=E0=B4=B5=E0=B4=BF=E0=B4=98?=
 =?UTF-8?q?=E0=B5=8D=E0=B4=A8=E0=B5=87=E0=B4=B7=E0=B5=8D=20=E0=B4=B6?=
 =?UTF-8?q?=E0=B5=AA=E0=B4=AE=20=E0=B4=95=E0=B5=86=29?=
Date: Tue, 28 Aug 2018 15:24:25 +0530
Subject: [PATCH 06/20] update `calculate_score` to `try_add_token`

`try_add_token` will now update the stop_offset as well.
`FragmentCandidate::new` now just takes `start_offset`; it expects
`try_add_token` to be called to add a token.
---
 src/snippet/mod.rs | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs
index a8a7bb194..64d661acb 100644
--- a/src/snippet/mod.rs
+++ b/src/snippet/mod.rs
@@ -34,18 +34,27 @@ pub struct FragmentCandidate {
 
 impl FragmentCandidate {
 
-    fn new(start_offset: usize, end_offset: usize) -> FragmentCandidate {
+    /// Create a basic `FragmentCandidate`
+    ///
+    /// `score` and `num_chars` are set to 0,
+    /// `highlighted` is set to an empty vec,
+    /// and `stop_offset` is set to `start_offset`, which is taken as a param.
+    fn new(start_offset: usize) -> FragmentCandidate {
         FragmentCandidate{score: 0.0,
                           start_offset: start_offset,
-                          stop_offset: end_offset,
+                          stop_offset: start_offset,
                           num_chars: 0,
                           highlighted: vec![]}
     }
 
     /// Updates the `score` and `highlighted` fields of the object.
     ///
-    ///
-    fn calculate_score(&mut self, token: &Token, terms: &BTreeMap<String, f32>) {
+    /// Taking the token and terms, the token is added to the fragment.
+    /// If the token is one of the terms, the score
+    /// and highlighted fields are updated in the fragment.
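[Editorial note, placed after the patch below] After this refactoring a fragment starts empty at a token boundary and grows one token at a time: `try_add_token` unconditionally extends `stop_offset`, and additionally accumulates the score and records a highlight when the token text matches a term. A condensed trace of that state machine with a toy token type (not tantivy's `Token`):

    use std::collections::BTreeMap;

    struct Frag { stop: usize, score: f32 }

    struct Tok<'a> { text: &'a str, to: usize }

    impl Frag {
        fn try_add_token(&mut self, tok: &Tok, terms: &BTreeMap<String, f32>) {
            self.stop = tok.to; // the fragment now covers this token
            if let Some(score) = terms.get(&tok.text.to_lowercase()) {
                self.score += score; // matching token contributes to the score
            }
        }
    }

    fn main() {
        let mut terms = BTreeMap::new();
        terms.insert("rust".to_string(), 1.0_f32);

        let mut frag = Frag { stop: 0, score: 0.0 };
        let tokens = [Tok { text: "Rust", to: 4 }, Tok { text: "rocks", to: 10 }];
        for tok in tokens.iter() {
            frag.try_add_token(tok, &terms);
        }
        // stop_offset followed both tokens; only "Rust" matched a term.
        assert_eq!((frag.stop, frag.score), (10, 1.0));
    }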
+    fn try_add_token(&mut self, token: &Token, terms: &BTreeMap<String, f32>) {
+        self.stop_offset = token.offset_to;
+
         if let Some(score) = terms.get(&token.text.to_lowercase()) {
             self.score += score;
             self.highlighted.push(HighlightSection{start: token.offset_from,
@@ -109,7 +118,7 @@ fn search_fragments<'a>(
     terms: BTreeMap<String, f32>,
     max_num_chars: usize) -> Vec<FragmentCandidate> {
     let mut token_stream = tokenizer.token_stream(text);
-    let mut fragment = FragmentCandidate::new(0, 0);
+    let mut fragment = FragmentCandidate::new(0);
     let mut fragments:Vec<FragmentCandidate> = vec![];
 
     while let Some(next) = token_stream.next() {
@@ -117,10 +126,9 @@ fn search_fragments<'a>(
             if fragment.score > 0.0 {
                 fragments.push(fragment)
             };
-            fragment = FragmentCandidate::new(next.offset_from, next.offset_to);
+            fragment = FragmentCandidate::new(next.offset_from);
         } else {
-            fragment.calculate_score(next, &terms);
-            fragment.stop_offset = next.offset_to;
+            fragment.try_add_token(next, &terms);
         }
     }
     if fragment.score > 0.0 {

From fb9b1c1f41549e889f33c99cbee9d585b5ef555c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Vignesh=20Sarma=20K=20=28=E0=B4=B5=E0=B4=BF=E0=B4=98?=
 =?UTF-8?q?=E0=B5=8D=E0=B4=A8=E0=B5=87=E0=B4=B7=E0=B5=8D=20=E0=B4=B6?=
 =?UTF-8?q?=E0=B5=AA=E0=B4=AE=20=E0=B4=95=E0=B5=86=29?=
Date: Tue, 28 Aug 2018 15:40:12 +0530
Subject: [PATCH 07/20] add a test and fix the bug of not calculating first
 token

---
 src/snippet/mod.rs | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs
index 64d661acb..8f94a0a40 100644
--- a/src/snippet/mod.rs
+++ b/src/snippet/mod.rs
@@ -127,9 +127,8 @@ fn search_fragments<'a>(
                 fragments.push(fragment)
             };
             fragment = FragmentCandidate::new(next.offset_from);
-        } else {
-            fragment.try_add_token(next, &terms);
         }
+        fragment.try_add_token(next, &terms);
     }
     if fragment.score > 0.0 {
         fragments.push(fragment)
@@ -183,7 +182,7 @@ mod tests {
     fn test_snippet() {
         let tokenizer = SimpleTokenizer;
 
-        let t = box_tokenizer(tokenizer);
+        let boxed_tokenizer = box_tokenizer(tokenizer);
 
         let text = "Rust is a systems programming language sponsored by Mozilla which describes it as a \"safe, concurrent, practical language\", supporting functional and imperative-procedural paradigms. Rust is syntactically similar to C++[according to whom?], but its designers intend it to provide better memory safety while still maintaining performance.
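[Editorial note, placed after the patch below] The one-line `search_fragments` fix in this patch is easy to misread: before it, the token that overflowed the window only *started* the new fragment and was never handed to `try_add_token`, so a matching term that landed exactly on a window boundary was silently dropped; the new `test_snippet_in_second_fragment` pins that case. A reduced model of the before/after behavior (a hypothetical windowed match counter standing in for the fragment scan):

    // Count matching tokens while cutting windows of `n` tokens.
    // With `buggy == true`, the token on a window boundary is skipped,
    // mirroring the old `else` branch.
    fn count_matches(tokens: &[&str], term: &str, n: usize, buggy: bool) -> usize {
        let mut in_window = 0;
        let mut matches = 0;
        for tok in tokens {
            if in_window == n {
                in_window = 0; // a new window starts at this token
                if buggy {
                    continue; // old code: the boundary token was never scored
                }
            }
            in_window += 1;
            if *tok == term {
                matches += 1;
            }
        }
        matches
    }

    fn main() {
        let tokens = ["a", "b", "c", "d"];
        assert_eq!(count_matches(&tokens, "c", 2, true), 0); // "c" sits on the boundary and is lost
        assert_eq!(count_matches(&tokens, "c", 2, false), 1);
    }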
@@ -196,7 +195,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl
         terms.insert(String::from("rust"), 1.0);
         terms.insert(String::from("language"), 0.9);
 
-        let fragments = search_fragments(t, &text, terms, 100);
+        let fragments = search_fragments(boxed_tokenizer, &text, terms, 100);
         assert_eq!(fragments.len(), 7);
         {
             let first = fragments.iter().nth(0).unwrap();
@@ -207,4 +206,30 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl
         assert_eq!(snippet.fragments, "Rust is a systems programming language sponsored by Mozilla which describes it as a \"safe".to_owned());
         assert_eq!(snippet.to_html(), "<b>Rust</b> is a systems programming <b>language</b> sponsored by Mozilla which describes it as a &quot;safe".to_owned())
     }
+
+    #[test]
+    fn test_snippet_in_second_fragment() {
+        let tokenizer = SimpleTokenizer;
+
+        let boxed_tokenizer = box_tokenizer(tokenizer);
+
+        let text = "a b c d e f g";
+
+        let mut terms = BTreeMap::new();
+        terms.insert(String::from("c"), 1.0);
+
+        let fragments = search_fragments(boxed_tokenizer, &text, terms, 3);
+
+        assert_eq!(fragments.len(), 1);
+        {
+            let first = fragments.iter().nth(0).unwrap();
+            assert_eq!(first.score, 1.0);
+            assert_eq!(first.start_offset, 4);
+            assert_eq!(first.stop_offset, 6);
+        }
+
+        let snippet = select_best_fragment_combination(fragments, &text);
+        assert_eq!(snippet.fragments, "c d");
+        assert_eq!(snippet.to_html(), "<b>c</b> d");
+    }
 }

From 96a313c6dd2540b6620f5285a15a8d250dae0403 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Vignesh=20Sarma=20K=20=28=E0=B4=B5=E0=B4=BF=E0=B4=98?=
 =?UTF-8?q?=E0=B5=8D=E0=B4=A8=E0=B5=87=E0=B4=B7=E0=B5=8D=20=E0=B4=B6?=
 =?UTF-8?q?=E0=B5=AA=E0=B4=AE=20=E0=B4=95=E0=B5=86=29?=
Date: Tue, 28 Aug 2018 20:26:45 +0530
Subject: [PATCH 08/20] add more tests

---
 src/snippet/mod.rs | 68 +++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 61 insertions(+), 7 deletions(-)

diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs
index 8f94a0a40..344c44d82 100644
--- a/src/snippet/mod.rs
+++ b/src/snippet/mod.rs
@@ -178,11 +178,11 @@ mod tests {
     use std::collections::BTreeMap;
     use super::{search_fragments, select_best_fragment_combination};
 
+    const TOKENIZER:SimpleTokenizer = SimpleTokenizer;
+
     #[test]
     fn test_snippet() {
-        let tokenizer = SimpleTokenizer;
-
-        let boxed_tokenizer = box_tokenizer(tokenizer);
+        let boxed_tokenizer = box_tokenizer(TOKENIZER);
 
         let text = "Rust is a systems programming language sponsored by Mozilla which describes it as a \"safe, concurrent, practical language\", supporting functional and imperative-procedural paradigms. Rust is syntactically similar to C++[according to whom?], but its designers intend it to provide better memory safety while still maintaining performance.
@@ -209,9 +209,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl
 
     #[test]
     fn test_snippet_in_second_fragment() {
-        let tokenizer = SimpleTokenizer;
-
-        let boxed_tokenizer = box_tokenizer(tokenizer);
+        let boxed_tokenizer = box_tokenizer(TOKENIZER);
 
         let text = "a b c d e f g";
 
@@ -225,11 +223,67 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl
             let first = fragments.iter().nth(0).unwrap();
             assert_eq!(first.score, 1.0);
             assert_eq!(first.start_offset, 4);
-            assert_eq!(first.stop_offset, 6);
+            assert_eq!(first.stop_offset, 7);
         }
 
         let snippet = select_best_fragment_combination(fragments, &text);
         assert_eq!(snippet.fragments, "c d");
         assert_eq!(snippet.to_html(), "<b>c</b> d");
     }
+
+    #[test]
+    fn test_snippet_with_term_at_the_end_of_fragment() {
+        let boxed_tokenizer = box_tokenizer(TOKENIZER);
+
+        let text = "a b c d e f f g";
+
+        let mut terms = BTreeMap::new();
+        terms.insert(String::from("f"), 1.0);
+
+        let fragments = search_fragments(boxed_tokenizer, &text, terms, 3);
+
+        assert_eq!(fragments.len(), 2);
+        {
+            let first = fragments.iter().nth(0).unwrap();
+            assert_eq!(first.score, 1.0);
+            assert_eq!(first.stop_offset, 11);
+            assert_eq!(first.start_offset, 8);
+        }
+
+        let snippet = select_best_fragment_combination(fragments, &text);
+        assert_eq!(snippet.fragments, "e f");
+        assert_eq!(snippet.to_html(), "e <b>f</b>");
+    }
+
+    #[test]
+    fn test_snippet_with_term_not_in_text() {
+        let boxed_tokenizer = box_tokenizer(TOKENIZER);
+
+        let text = "a b c d";
+
+        let mut terms = BTreeMap::new();
+        terms.insert(String::from("z"), 1.0);
+
+        let fragments = search_fragments(boxed_tokenizer, &text, terms, 3);
+
+        assert_eq!(fragments.len(), 0);
+
+        let snippet = select_best_fragment_combination(fragments, &text);
+        assert_eq!(snippet.fragments, "");
+        assert_eq!(snippet.to_html(), "");
+    }
+
+    fn test_snippet_with_no_terms() {
+        let boxed_tokenizer = box_tokenizer(TOKENIZER);
+
+        let text = "a b c d";
+
+        let mut terms = BTreeMap::new();
+        let fragments = search_fragments(boxed_tokenizer, &text, terms, 3);
+        assert_eq!(fragments.len(), 0);
+
+        let snippet = select_best_fragment_combination(fragments, &text);
+        assert_eq!(snippet.fragments, "");
+        assert_eq!(snippet.to_html(), "");
+    }
 }
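[Editorial note] All the offset assertions in these tests are byte positions into the source string: in "a b c d e f g" each letter occupies one byte at an even index, so the "c d" fragment spans bytes 4..7 (hence the corrected `stop_offset` above), and "e f" in the longer string spans 8..11. A quick way to sanity-check such expectations (a throwaway sketch using whitespace splitting, which matches `SimpleTokenizer` on this input):

    fn main() {
        let text = "a b c d e f g";
        // Print each token with the byte span a tokenizer would report for it.
        for (span, tok) in text.split(' ').scan(0, |off, w| {
            let span = (*off, *off + w.len());
            *off += w.len() + 1; // advance past the token and one space
            Some((span, w))
        }) {
            println!("{:?} {}", span, tok);
        }
        assert_eq!(&text[4..7], "c d");
    }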
From 6a197e023e3ec77f4b4a52bdbe75e8005ca29f38 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Vignesh=20Sarma=20K=20=28=E0=B4=B5=E0=B4=BF=E0=B4=98?=
 =?UTF-8?q?=E0=B5=8D=E0=B4=A8=E0=B5=87=E0=B4=B7=E0=B5=8D=20=E0=B4=B6?=
 =?UTF-8?q?=E0=B5=AA=E0=B4=AE=20=E0=B4=95=E0=B5=86=29?=
Date: Tue, 28 Aug 2018 20:34:55 +0530
Subject: [PATCH 09/20] ran rustfmt

---
 src/snippet/mod.rs | 105 ++++++++++++++++++++++++++-------------------
 1 file changed, 62 insertions(+), 43 deletions(-)

diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs
index 344c44d82..26b6be0f6 100644
--- a/src/snippet/mod.rs
+++ b/src/snippet/mod.rs
@@ -1,12 +1,12 @@
-use tokenizer::{TokenStream, Tokenizer, Token};
-use std::collections::BTreeMap;
-use Term;
-use Document;
-use Index;
+use htmlescape::encode_minimal;
 use schema::FieldValue;
 use schema::Value;
+use std::collections::BTreeMap;
 use tokenizer::BoxedTokenizer;
-use htmlescape::encode_minimal;
+use tokenizer::{Token, TokenStream, Tokenizer};
+use Document;
+use Index;
+use Term;
 
 #[derive(Debug)]
 pub struct HighlightSection {
@@ -16,10 +16,7 @@ pub struct HighlightSection {
 
 impl HighlightSection {
     fn new(start: usize, stop: usize) -> HighlightSection {
-        HighlightSection {
-            start,
-            stop
-        }
+        HighlightSection { start, stop }
     }
 }
 
@@ -33,18 +30,19 @@ pub struct
FragmentCandidate { } impl FragmentCandidate { - /// Create a basic `FragmentCandidate` /// /// `score`, `num_chars` are set to 0 /// and `highlighted` is set to empty vec /// stop_offset is set to start_offset, which is taken as a param. fn new(start_offset: usize) -> FragmentCandidate { - FragmentCandidate{score: 0.0, - start_offset: start_offset, - stop_offset: start_offset, - num_chars: 0, - highlighted: vec![]} + FragmentCandidate { + score: 0.0, + start_offset: start_offset, + stop_offset: start_offset, + num_chars: 0, + highlighted: vec![], + } } /// Updates `score` and `highlighted` fields of the objects. @@ -57,8 +55,10 @@ impl FragmentCandidate { if let Some(score) = terms.get(&token.text.to_lowercase()) { self.score += score; - self.highlighted.push(HighlightSection{start: token.offset_from, - stop: token.offset_to}); + self.highlighted.push(HighlightSection { + start: token.offset_from, + stop: token.offset_to, + }); } } } @@ -69,11 +69,10 @@ pub struct Snippet { highlighted: Vec, } -const HIGHLIGHTEN_PREFIX:&str = ""; -const HIGHLIGHTEN_POSTFIX:&str = ""; +const HIGHLIGHTEN_PREFIX: &str = ""; +const HIGHLIGHTEN_POSTFIX: &str = ""; impl Snippet { - /// Returns a hignlightned html from the `Snippet`. pub fn to_html(&self) -> String { let mut html = String::new(); @@ -86,7 +85,9 @@ impl Snippet { html.push_str(HIGHLIGHTEN_POSTFIX); start_from = item.stop; } - html.push_str(&encode_minimal(&self.fragments[start_from..self.fragments.len()])); + html.push_str(&encode_minimal( + &self.fragments[start_from..self.fragments.len()], + )); html } } @@ -116,10 +117,11 @@ fn search_fragments<'a>( tokenizer: Box, text: &'a str, terms: BTreeMap, - max_num_chars: usize) -> Vec { + max_num_chars: usize, +) -> Vec { let mut token_stream = tokenizer.token_stream(text); let mut fragment = FragmentCandidate::new(0); - let mut fragments:Vec = vec![]; + let mut fragments: Vec = vec![]; while let Some(next) = token_stream.next() { if (next.offset_to - fragment.start_offset) > max_num_chars { @@ -141,24 +143,41 @@ fn search_fragments<'a>( /// /// Takes a vector of `FragmentCandidate`s and the text. /// Figures out the best fragment from it and creates a snippet. 
-fn select_best_fragment_combination<'a>(fragments: Vec, - text: &'a str,) -> Snippet { +fn select_best_fragment_combination<'a>( + fragments: Vec, + text: &'a str, +) -> Snippet { if let Some(init) = fragments.iter().nth(0) { - let fragment = fragments.iter().skip(1).fold(init, |acc, item| { - if item.score > acc.score { item } else { acc } - }); + let fragment = + fragments.iter().skip(1).fold( + init, + |acc, item| { + if item.score > acc.score { + item + } else { + acc + } + }, + ); let fragment_text = &text[fragment.start_offset..fragment.stop_offset]; - let highlighted = fragment.highlighted.iter().map(|item| { - HighlightSection{start: item.start-fragment.start_offset, - stop: item.stop-fragment.start_offset} - }).collect(); - Snippet{fragments: fragment_text.to_owned(), - highlighted: highlighted} + let highlighted = fragment + .highlighted + .iter() + .map(|item| HighlightSection { + start: item.start - fragment.start_offset, + stop: item.stop - fragment.start_offset, + }).collect(); + Snippet { + fragments: fragment_text.to_owned(), + highlighted: highlighted, + } } else { // when there no fragments to chose from, // for now create a empty snippet - Snippet{fragments: String::new(), - highlighted: vec![]} + Snippet { + fragments: String::new(), + highlighted: vec![], + } } } @@ -166,19 +185,19 @@ pub fn generate_snippet<'a>( doc: &'a [FieldValue], index: &Index, terms: Vec, - max_num_chars: usize) -> Snippet { + max_num_chars: usize, +) -> Snippet { unimplemented!(); } - #[cfg(test)] mod tests { - use tokenizer::{SimpleTokenizer, box_tokenizer}; - use std::iter::Iterator; - use std::collections::BTreeMap; use super::{search_fragments, select_best_fragment_combination}; + use std::collections::BTreeMap; + use std::iter::Iterator; + use tokenizer::{box_tokenizer, SimpleTokenizer}; - const TOKENIZER:SimpleTokenizer = SimpleTokenizer; + const TOKENIZER: SimpleTokenizer = SimpleTokenizer; #[test] fn test_snippet() { From f247935bb9a84abc7ffa5ee60756501d8bb6a3f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vignesh=20Sarma=20K=20=28=E0=B4=B5=E0=B4=BF=E0=B4=98?= =?UTF-8?q?=E0=B5=8D=E0=B4=A8=E0=B5=87=E0=B4=B7=E0=B5=8D=20=E0=B4=B6?= =?UTF-8?q?=E0=B5=AA=E0=B4=AE=20=E0=B4=95=E0=B5=86=29?= Date: Tue, 28 Aug 2018 22:16:22 +0530 Subject: [PATCH 10/20] Use HighlightSection::new rather than just directly creating the object --- src/snippet/mod.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 26b6be0f6..cee3e79ab 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -55,10 +55,8 @@ impl FragmentCandidate { if let Some(score) = terms.get(&token.text.to_lowercase()) { self.score += score; - self.highlighted.push(HighlightSection { - start: token.offset_from, - stop: token.offset_to, - }); + self.highlighted + .push(HighlightSection::new(token.offset_from, token.offset_to)); } } } @@ -163,9 +161,11 @@ fn select_best_fragment_combination<'a>( let highlighted = fragment .highlighted .iter() - .map(|item| HighlightSection { - start: item.start - fragment.start_offset, - stop: item.stop - fragment.start_offset, + .map(|item| { + HighlightSection::new( + item.start - fragment.start_offset, + item.stop - fragment.start_offset, + ) }).collect(); Snippet { fragments: fragment_text.to_owned(), From 18814ba0c15e72dd2db09c589e647b863dbbea51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vignesh=20Sarma=20K=20=28=E0=B4=B5=E0=B4=BF=E0=B4=98?= =?UTF-8?q?=E0=B5=8D=E0=B4=A8=E0=B5=87=E0=B4=B7=E0=B5=8D=20=E0=B4=B6?= 
=?UTF-8?q?=E0=B5=AA=E0=B4=AE=20=E0=B4=95=E0=B5=86=29?= Date: Tue, 28 Aug 2018 22:27:56 +0530 Subject: [PATCH 11/20] add a test for second fragment having higher score --- src/snippet/mod.rs | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index cee3e79ab..8142c54a0 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -274,6 +274,31 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl assert_eq!(snippet.to_html(), "e f"); } + #[test] + fn test_snippet_with_second_fragment_has_the_highest_score() { + let boxed_tokenizer = box_tokenizer(TOKENIZER); + + let text = "a b c d e f g"; + + let mut terms = BTreeMap::new(); + terms.insert(String::from("f"), 1.0); + terms.insert(String::from("a"), 0.9); + + let fragments = search_fragments(boxed_tokenizer, &text, terms, 7); + + assert_eq!(fragments.len(), 2); + { + let first = fragments.iter().nth(0).unwrap(); + assert_eq!(first.score, 0.9); + assert_eq!(first.stop_offset, 7); + assert_eq!(first.start_offset, 0); + } + + let snippet = select_best_fragment_combination(fragments, &text); + assert_eq!(snippet.fragments, "e f g"); + assert_eq!(snippet.to_html(), "e f g"); + } + #[test] fn test_snippet_with_term_not_in_text() { let boxed_tokenizer = box_tokenizer(TOKENIZER); @@ -292,6 +317,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl assert_eq!(snippet.to_html(), ""); } + #[test] fn test_snippet_with_no_terms() { let boxed_tokenizer = box_tokenizer(TOKENIZER); From a12d211330657931de2c972030504762cdbb8432 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 30 Aug 2018 09:23:34 +0900 Subject: [PATCH 12/20] Extracting terms matching query in the document --- examples/snippet.rs | 75 +++++++++++++++++++++++++++++ src/common/mod.rs | 21 +++++++- src/query/automaton_weight.rs | 47 ++++++++++++++++++ src/query/term_query/term_weight.rs | 22 +++++++++ src/query/weight.rs | 35 ++++++++++++++ src/snippet/mod.rs | 6 +-- 6 files changed, 201 insertions(+), 5 deletions(-) create mode 100644 examples/snippet.rs diff --git a/examples/snippet.rs b/examples/snippet.rs new file mode 100644 index 000000000..35e9e76bd --- /dev/null +++ b/examples/snippet.rs @@ -0,0 +1,75 @@ +// # Snippet example +// +// This example shows how to return a representative snippet of +// your hit result. +// Snippet are an extracted of a target document, and returned in HTML format. +// The keyword searched by the user are highlighted with a `` tag. +extern crate tempdir; + +// --- +// Importing tantivy... +#[macro_use] +extern crate tantivy; +use tantivy::collector::TopCollector; +use tantivy::query::QueryParser; +use tantivy::schema::*; +use tantivy::Index; + +fn main() -> tantivy::Result<()> { + // Let's create a temporary directory for the + // sake of this example + let index_path = TempDir::new("tantivy_example_dir")?; + + // # Defining the schema + let mut schema_builder = SchemaBuilder::default(); + schema_builder.add_text_field("body", TEXT); + let schema = schema_builder.build(); + + // # Indexing documents + let index = Index::create_in_dir(&index_path, schema.clone())?; + + let mut index_writer = index.writer(50_000_000)?; + + let title = schema.get_field("title").unwrap(); + let body = schema.get_field("body").unwrap(); + + let mut old_man_doc = Document::default(); + // we'll only need one doc for this example. 
+ index_writer.add_document(doc!( + title => "Of Mice and Men", + body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \ + bank and runs deep and green. The water is warm too, for it has slipped twinkling \ + over the yellow sands in the sunlight before reaching the narrow pool. On one \ + side of the river the golden foothill slopes curve up to the strong and rocky \ + Gabilan Mountains, but on the valley side the water is lined with trees—willows \ + fresh and green with every spring, carrying in their lower leaf junctures the \ + debris of the winter’s flooding; and sycamores with mottled, white, recumbent \ + limbs and branches that arch over the pool" + )); + // ... + index_writer.commit()?; + + index.load_searchers()?; + + let searcher = index.searcher(); + let query_parser = QueryParser::for_index(&index, vec![title, body]); + + let query = query_parser.parse_query("sycamore spring")?; + + let mut top_collector = TopCollector::with_limit(10); + + searcher.search(&*query, &mut top_collector)?; + + let doc_addresses = top_collector.docs(); + + for doc_address in doc_addresses { + let retrieved_doc = searcher.doc(&doc_address)?; + generate_snippet(&retrieved_doc, query + } + + + Ok(()) +} + + +use tempdir::TempDir; diff --git a/src/common/mod.rs b/src/common/mod.rs index 2942438b4..778f0476a 100644 --- a/src/common/mod.rs +++ b/src/common/mod.rs @@ -68,6 +68,17 @@ pub trait HasLen { } } + +pub fn is_stricly_sorted(els: &[T]) -> bool { + if els.is_empty() { + true + } else { + els.iter() + .zip(els[1..].iter()) + .all(|(left, right)| left < right) + } +} + const HIGHEST_BIT: u64 = 1 << 63; /// Maps a `i64` to `u64` @@ -105,12 +116,20 @@ pub fn u64_to_i64(val: u64) -> i64 { pub(crate) mod test { pub use super::serialize::test::fixed_size_test; - use super::{compute_num_bits, i64_to_u64, u64_to_i64}; + use super::{compute_num_bits, i64_to_u64, u64_to_i64, is_stricly_sorted}; fn test_i64_converter_helper(val: i64) { assert_eq!(u64_to_i64(i64_to_u64(val)), val); } + + #[test] + fn test_is_strictly_sorted() { + assert!(is_stricly_sorted::(&[])); + assert!(is_stricly_sorted(&[1])); + assert!(is_stricly_sorted(&[1, 2, 3])); + assert!(!is_stricly_sorted(&[1, 3, 2])); + } #[test] fn test_i64_converter() { assert_eq!(i64_to_u64(i64::min_value()), u64::min_value()); diff --git a/src/query/automaton_weight.rs b/src/query/automaton_weight.rs index b38e6592d..d1040eb85 100644 --- a/src/query/automaton_weight.rs +++ b/src/query/automaton_weight.rs @@ -7,6 +7,11 @@ use query::{Scorer, Weight}; use schema::{Field, IndexRecordOption}; use termdict::{TermDictionary, TermStreamer}; use Result; +use query::weight::MatchingTerms; +use SkipResult; +use Term; +use DocId; +use DocSet; /// A weight struct for Fuzzy Term and Regex Queries pub struct AutomatonWeight @@ -36,6 +41,48 @@ impl Weight for AutomatonWeight where A: Automaton, { + + fn matching_terms(&self, + reader: &SegmentReader, + matching_terms: &mut MatchingTerms) -> Result<()> { + let max_doc = reader.max_doc(); + let mut doc_bitset = BitSet::with_max_value(max_doc); + + let inverted_index = reader.inverted_index(self.field); + let term_dict = inverted_index.terms(); + let mut term_stream = self.automaton_stream(term_dict); + + let doc_ids = matching_terms.sorted_doc_ids(); + let mut docs_matching_current_term: Vec = vec![]; + + let mut term_buffer: Vec = vec![]; + + while term_stream.advance() { + docs_matching_current_term.clear(); + let term_info = term_stream.value(); + let mut segment_postings = 
inverted_index.read_postings_from_terminfo(term_info, IndexRecordOption::Basic); + for &doc_id in &doc_ids { + match segment_postings.skip_next(doc_id) { + SkipResult::Reached => { + docs_matching_current_term.push(doc_id); + } + SkipResult::OverStep => {} + SkipResult::End => {} + } + } + if !docs_matching_current_term.is_empty() { + term_buffer.clear(); + let term_ord = term_stream.term_ord(); + inverted_index.terms().ord_to_term(term_ord, &mut term_buffer); + let term = Term::from_field_bytes(self.field, &term_buffer[..]); + for &doc_id in &docs_matching_current_term { + matching_terms.add_term(doc_id, term.clone()); + } + } + } + Ok(()) + } + fn scorer(&self, reader: &SegmentReader) -> Result> { let max_doc = reader.max_doc(); let mut doc_bitset = BitSet::with_max_value(max_doc); diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index ba45a8042..1a9075b5a 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -8,6 +8,8 @@ use query::Weight; use schema::IndexRecordOption; use Result; use Term; +use SkipResult; +use query::weight::MatchingTerms; pub struct TermWeight { term: Term, @@ -38,6 +40,26 @@ impl Weight for TermWeight { } } + + fn matching_terms(&self, + reader: &SegmentReader, + matching_terms: &mut MatchingTerms) -> Result<()> { + let doc_ids = matching_terms.sorted_doc_ids(); + let mut scorer = self.scorer(reader)?; + for doc_id in doc_ids { + match scorer.skip_next(doc_id) { + SkipResult::Reached => { + matching_terms.add_term(doc_id, self.term.clone()); + } + SkipResult::OverStep => {} + SkipResult::End => { + break; + } + } + } + Ok(()) + } + fn count(&self, reader: &SegmentReader) -> Result { if reader.num_deleted_docs() == 0 { let field = self.term.field(); diff --git a/src/query/weight.rs b/src/query/weight.rs index d3d8b3520..51289c573 100644 --- a/src/query/weight.rs +++ b/src/query/weight.rs @@ -1,6 +1,37 @@ use super::Scorer; use core::SegmentReader; use Result; +use DocId; +use std::collections::HashSet; +use Term; +use std::collections::BTreeMap; + + +pub struct MatchingTerms { + doc_to_terms: BTreeMap> +} + +impl MatchingTerms { + pub fn from_doc_ids(doc_ids: &[DocId]) -> MatchingTerms { + MatchingTerms { + doc_to_terms: doc_ids + .iter() + .cloned() + .map(|doc_id| (doc_id, HashSet::default())) + .collect() + } + } + + pub fn sorted_doc_ids(&self) -> Vec { + self.doc_to_terms.keys().cloned().collect() + } + + pub fn add_term(&mut self, doc_id: DocId, term: Term) { + if let Some(terms) = self.doc_to_terms.get_mut(&doc_id) { + terms.insert(term); + } + } +} /// A Weight is the specialization of a Query /// for a given set of segments. @@ -11,6 +42,10 @@ pub trait Weight { /// See [`Query`](./trait.Query.html). fn scorer(&self, reader: &SegmentReader) -> Result>; + fn matching_terms(&self, reader: &SegmentReader, matching_terms: &mut MatchingTerms) -> Result<()> { + Ok(()) + } + /// Returns the number documents within the given `SegmentReader`. 
fn count(&self, reader: &SegmentReader) -> Result { Ok(self.scorer(reader)?.count()) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 8142c54a0..97c557e98 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -1,10 +1,8 @@ -use htmlescape::encode_minimal; + use htmlescape::encode_minimal; use schema::FieldValue; -use schema::Value; use std::collections::BTreeMap; use tokenizer::BoxedTokenizer; -use tokenizer::{Token, TokenStream, Tokenizer}; -use Document; +use tokenizer::{Token, TokenStream}; use Index; use Term; From 6704ab69877154a12f0b1f74a27a5cd3cdc894eb Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 30 Aug 2018 09:47:19 +0900 Subject: [PATCH 13/20] Added methods to extract the matching terms. First stab --- src/query/mod.rs | 1 + src/query/query.rs | 2 ++ src/query/weight.rs | 1 - src/snippet/mod.rs | 32 ++++++++++++++++++++++++++++---- 4 files changed, 31 insertions(+), 5 deletions(-) diff --git a/src/query/mod.rs b/src/query/mod.rs index 7546465fb..0b6ee2adb 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -27,6 +27,7 @@ mod weight; mod vec_docset; pub(crate) mod score_combiner; +pub use self::weight::MatchingTerms; pub use self::intersection::Intersection; pub use self::union::Union; diff --git a/src/query/query.rs b/src/query/query.rs index 51e068b92..7004768e4 100644 --- a/src/query/query.rs +++ b/src/query/query.rs @@ -5,6 +5,8 @@ use downcast; use std::fmt; use Result; use SegmentLocalId; +use DocAddress; +use query::weight::MatchingTerms; /// The `Query` trait defines a set of documents and a scoring method /// for those documents. diff --git a/src/query/weight.rs b/src/query/weight.rs index 51289c573..5b603ab1c 100644 --- a/src/query/weight.rs +++ b/src/query/weight.rs @@ -6,7 +6,6 @@ use std::collections::HashSet; use Term; use std::collections::BTreeMap; - pub struct MatchingTerms { doc_to_terms: BTreeMap> } diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 97c557e98..984c1a589 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -1,10 +1,17 @@ - use htmlescape::encode_minimal; +use htmlescape::encode_minimal; use schema::FieldValue; use std::collections::BTreeMap; +use itertools::Itertools; use tokenizer::BoxedTokenizer; use tokenizer::{Token, TokenStream}; use Index; +use Result; use Term; +use query::Query; +use DocAddress; +use DocId; +use Searcher; +use query::MatchingTerms; #[derive(Debug)] pub struct HighlightSection { @@ -179,12 +186,29 @@ fn select_best_fragment_combination<'a>( } } + + + +fn matching_terms(query: &Query, searcher: &Searcher, doc_addresses: &[DocAddress]) -> Result<()> { + let weight = query.weight(searcher, false)?; + let mut doc_groups = doc_addresses + .iter() + .group_by(|doc_address| doc_address.0); + for (segment_ord, doc_addrs) in doc_groups.into_iter() { + let doc_addrs_vec: Vec = doc_addrs.map(|doc_addr| doc_addr.1).collect(); + let mut matching_terms = MatchingTerms::from_doc_ids(&doc_addrs_vec[..]); + let segment_reader = searcher.segment_reader(segment_ord); + weight.matching_terms(segment_reader, &mut matching_terms)?; + } + Ok(()) +} + pub fn generate_snippet<'a>( - doc: &'a [FieldValue], + doc: &'a [DocAddress], index: &Index, + query: &Query, terms: Vec, - max_num_chars: usize, -) -> Snippet { + max_num_chars: usize) -> Snippet { unimplemented!(); } From f570fe37d491a9c5f669f45316dc8cceeb05bfe4 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Fri, 31 Aug 2018 09:03:44 +0900 Subject: [PATCH 14/20] small changes --- examples/snippet.rs | 3 +-- src/query/automaton_weight.rs | 1 - 
src/query/boolean_query/boolean_weight.rs | 9 +++++++++ src/query/phrase_query/phrase_weight.rs | 6 ++++++ src/query/range_query.rs | 6 ++++++ src/snippet/mod.rs | 3 ++- 6 files changed, 24 insertions(+), 4 deletions(-) diff --git a/examples/snippet.rs b/examples/snippet.rs index 35e9e76bd..4efea1e5a 100644 --- a/examples/snippet.rs +++ b/examples/snippet.rs @@ -64,10 +64,9 @@ fn main() -> tantivy::Result<()> { for doc_address in doc_addresses { let retrieved_doc = searcher.doc(&doc_address)?; - generate_snippet(&retrieved_doc, query + // generate_snippet(&retrieved_doc, query } - Ok(()) } diff --git a/src/query/automaton_weight.rs b/src/query/automaton_weight.rs index d1040eb85..54f8c5f8b 100644 --- a/src/query/automaton_weight.rs +++ b/src/query/automaton_weight.rs @@ -41,7 +41,6 @@ impl Weight for AutomatonWeight where A: Automaton, { - fn matching_terms(&self, reader: &SegmentReader, matching_terms: &mut MatchingTerms) -> Result<()> { diff --git a/src/query/boolean_query/boolean_weight.rs b/src/query/boolean_query/boolean_weight.rs index 575bc2991..2b3348a21 100644 --- a/src/query/boolean_query/boolean_weight.rs +++ b/src/query/boolean_query/boolean_weight.rs @@ -13,6 +13,7 @@ use query::Weight; use std::borrow::Borrow; use std::collections::HashMap; use Result; +use query::MatchingTerms; fn scorer_union(scorers: Vec>) -> Box where @@ -107,6 +108,14 @@ impl BooleanWeight { } impl Weight for BooleanWeight { + + fn matching_terms(&self, reader: &SegmentReader, matching_terms: &mut MatchingTerms) -> Result<()> { + for (_, weight) in &self.weights { + weight.matching_terms(reader, matching_terms)?; + } + Ok(()) + } + fn scorer(&self, reader: &SegmentReader) -> Result> { if self.weights.is_empty() { Ok(Box::new(EmptyScorer)) diff --git a/src/query/phrase_query/phrase_weight.rs b/src/query/phrase_query/phrase_weight.rs index de8eeb0d2..fbf43db20 100644 --- a/src/query/phrase_query/phrase_weight.rs +++ b/src/query/phrase_query/phrase_weight.rs @@ -7,6 +7,7 @@ use query::Weight; use schema::IndexRecordOption; use schema::Term; use Result; +use query::MatchingTerms; pub struct PhraseWeight { phrase_terms: Vec<(usize, Term)>, @@ -30,6 +31,11 @@ impl PhraseWeight { } impl Weight for PhraseWeight { + + fn matching_terms(&self, reader: &SegmentReader, matching_terms: &mut MatchingTerms) -> Result<()> { + unimplemented!(); + } + fn scorer(&self, reader: &SegmentReader) -> Result> { let similarity_weight = self.similarity_weight.clone(); let field = self.phrase_terms[0].1.field(); diff --git a/src/query/range_query.rs b/src/query/range_query.rs index 23efe1995..2b22e7cf8 100644 --- a/src/query/range_query.rs +++ b/src/query/range_query.rs @@ -11,6 +11,7 @@ use std::collections::Bound; use std::ops::Range; use termdict::{TermDictionary, TermStreamer}; use Result; +use query::MatchingTerms; fn map_bound TTo>( bound: &Bound, @@ -274,6 +275,11 @@ impl RangeWeight { } impl Weight for RangeWeight { + + fn matching_terms(&self, reader: &SegmentReader, matching_terms: &mut MatchingTerms) -> Result<()> { + unimplemented!(); + } + fn scorer(&self, reader: &SegmentReader) -> Result> { let max_doc = reader.max_doc(); let mut doc_bitset = BitSet::with_max_value(max_doc); diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 984c1a589..cd194e0d8 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -200,6 +200,7 @@ fn matching_terms(query: &Query, searcher: &Searcher, doc_addresses: &[DocAddres let segment_reader = searcher.segment_reader(segment_ord); weight.matching_terms(segment_reader, &mut 
matching_terms)?; } + let terms = HashSet<(DocId, Vec)>; Ok(()) } @@ -209,7 +210,7 @@ pub fn generate_snippet<'a>( query: &Query, terms: Vec, max_num_chars: usize) -> Snippet { - unimplemented!(); + search_fragments(boxed_tokenizer, &text, terms, 3); } #[cfg(test)] From 9101bf575343926256830ddfd9aa1b80004ab637 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Fri, 7 Sep 2018 09:57:12 +0900 Subject: [PATCH 15/20] Fragments --- src/core/index.rs | 21 +++++++++++- src/core/searcher.rs | 10 +++++- src/query/automaton_weight.rs | 2 +- src/query/term_query/term_weight.rs | 2 +- src/query/weight.rs | 13 +++++--- src/snippet/mod.rs | 50 +++++++++++++++++++++++------ 6 files changed, 80 insertions(+), 18 deletions(-) diff --git a/src/core/index.rs b/src/core/index.rs index efdfedc5f..c6f465eef 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -29,6 +29,9 @@ use num_cpus; use std::path::Path; use tokenizer::TokenizerManager; use IndexWriter; +use schema::FieldType; +use schema::Field; +use tokenizer::BoxedTokenizer; fn load_metas(directory: &Directory) -> Result { let meta_data = directory.atomic_read(&META_FILEPATH)?; @@ -113,6 +116,22 @@ impl Index { &self.tokenizers } + pub fn tokenizer_for_field(&self, field: Field) -> Option> { + let field_type = self.schema.get_field_entry(field).field_type(); + let tokenizer: &TokenizerManager = self.tokenizers(); + match field_type { + FieldType::Str(text_options) => { + text_options.get_indexing_options() + .map(|text_indexing_options| text_indexing_options.tokenizer()) + .and_then(|tokenizer_name| tokenizer.get(tokenizer_name)) + + }, + _ => { + None + } + } + } + /// Opens a new directory from an index path. #[cfg(feature = "mmap")] pub fn open_in_dir>(directory_path: P) -> Result { @@ -257,7 +276,7 @@ impl Index { let schema = self.schema(); let num_searchers: usize = self.num_searchers.load(Ordering::Acquire); let searchers = (0..num_searchers) - .map(|_| Searcher::new(schema.clone(), segment_readers.clone())) + .map(|_| Searcher::new(schema.clone(), self.clone(), segment_readers.clone())) .collect(); self.searcher_pool.publish_new_generation(searchers); Ok(()) diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 8f36b58ea..9de6c857c 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -10,6 +10,7 @@ use std::sync::Arc; use termdict::TermMerger; use DocAddress; use Result; +use Index; /// Holds a list of `SegmentReader`s ready for search. /// @@ -18,17 +19,24 @@ use Result; /// pub struct Searcher { schema: Schema, + index: Index, segment_readers: Vec, } impl Searcher { /// Creates a new `Searcher` - pub(crate) fn new(schema: Schema, segment_readers: Vec) -> Searcher { + pub(crate) fn new(schema: Schema, index: Index, segment_readers: Vec) -> Searcher { Searcher { schema, + index, segment_readers, } } + + pub fn index(&self) -> &Index { + &self.index + } + /// Fetches a document from tantivy's store given a `DocAddress`. 
/// /// The searcher uses the segment ordinal to route the diff --git a/src/query/automaton_weight.rs b/src/query/automaton_weight.rs index 54f8c5f8b..854ecb66e 100644 --- a/src/query/automaton_weight.rs +++ b/src/query/automaton_weight.rs @@ -75,7 +75,7 @@ where inverted_index.terms().ord_to_term(term_ord, &mut term_buffer); let term = Term::from_field_bytes(self.field, &term_buffer[..]); for &doc_id in &docs_matching_current_term { - matching_terms.add_term(doc_id, term.clone()); + matching_terms.add_term(doc_id, term.clone(), 1f32); } } } diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index 1a9075b5a..aa1b5e456 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -49,7 +49,7 @@ impl Weight for TermWeight { for doc_id in doc_ids { match scorer.skip_next(doc_id) { SkipResult::Reached => { - matching_terms.add_term(doc_id, self.term.clone()); + matching_terms.add_term(doc_id, self.term.clone(), 1f32); } SkipResult::OverStep => {} SkipResult::End => { diff --git a/src/query/weight.rs b/src/query/weight.rs index 5b603ab1c..8a12c01da 100644 --- a/src/query/weight.rs +++ b/src/query/weight.rs @@ -5,9 +5,10 @@ use DocId; use std::collections::HashSet; use Term; use std::collections::BTreeMap; +use std::collections::HashMap; pub struct MatchingTerms { - doc_to_terms: BTreeMap> + doc_to_terms: BTreeMap> } impl MatchingTerms { @@ -16,18 +17,22 @@ impl MatchingTerms { doc_to_terms: doc_ids .iter() .cloned() - .map(|doc_id| (doc_id, HashSet::default())) + .map(|doc_id| (doc_id, HashMap::default())) .collect() } } + pub fn terms_for_doc(&self, doc_id: DocId) -> Option<&HashMap> { + self.doc_to_terms.get(&doc_id) + } + pub fn sorted_doc_ids(&self) -> Vec { self.doc_to_terms.keys().cloned().collect() } - pub fn add_term(&mut self, doc_id: DocId, term: Term) { + pub fn add_term(&mut self, doc_id: DocId, term: Term, score: f32) { if let Some(terms) = self.doc_to_terms.get_mut(&doc_id) { - terms.insert(term); + terms.insert(term, score); } } } diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index cd194e0d8..c82777782 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -12,6 +12,9 @@ use DocAddress; use DocId; use Searcher; use query::MatchingTerms; +use schema::Field; +use std::collections::HashMap; +use SegmentLocalId; #[derive(Debug)] pub struct HighlightSection { @@ -189,28 +192,55 @@ fn select_best_fragment_combination<'a>( -fn matching_terms(query: &Query, searcher: &Searcher, doc_addresses: &[DocAddress]) -> Result<()> { +fn compute_matching_terms(query: &Query, searcher: &Searcher, doc_addresses: &[DocAddress]) -> Result> { let weight = query.weight(searcher, false)?; let mut doc_groups = doc_addresses .iter() .group_by(|doc_address| doc_address.0); + let mut matching_terms_per_segment: HashMap = HashMap::new(); for (segment_ord, doc_addrs) in doc_groups.into_iter() { let doc_addrs_vec: Vec = doc_addrs.map(|doc_addr| doc_addr.1).collect(); let mut matching_terms = MatchingTerms::from_doc_ids(&doc_addrs_vec[..]); let segment_reader = searcher.segment_reader(segment_ord); weight.matching_terms(segment_reader, &mut matching_terms)?; + matching_terms_per_segment.insert(segment_ord, matching_terms); } - let terms = HashSet<(DocId, Vec)>; - Ok(()) + Ok(matching_terms_per_segment) } -pub fn generate_snippet<'a>( - doc: &'a [DocAddress], - index: &Index, +pub fn generate_snippet( + doc_addresses: &[DocAddress], + fields: &[Field], + searcher: &Searcher, query: &Query, - terms: Vec, - max_num_chars: usize) -> 
Snippet { - search_fragments(boxed_tokenizer, &text, terms, 3); + max_num_chars: usize) -> Result> { + // TODO sort doc_addresses + let matching_terms_per_segment_local_id = compute_matching_terms(query, searcher, doc_addresses)?; + for doc_address in doc_addresses { + let doc = searcher.doc(doc_address)?; + for &field in fields { + let mut text = String::new(); + for value in doc.get_all(field) { + text.push_str(value.text()); + } + if let Some(tokenizer) = searcher.index().tokenizer_for_field(field) { + if let Some(matching_terms) = matching_terms_per_segment_local_id.get(&doc_address.segment_ord()) { + if let Some(terms) = matching_terms.terms_for_doc(doc_address.doc()) { + let terms: BTreeMap = terms + .iter() + .map(|(term, score)| (term.text().to_string(), *score)) + .collect(); + search_fragments(tokenizer, + &text, + terms, + max_num_chars); + } + } + } + } + } + // search_fragments(boxed_tokenizer, &text, terms, 3); + panic!("e"); } #[cfg(test)] @@ -346,7 +376,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl let text = "a b c d"; - let mut terms = BTreeMap::new(); + let terms = BTreeMap::new(); let fragments = search_fragments(boxed_tokenizer, &text, terms, 3); assert_eq!(fragments.len(), 0); From 2e44f0f09901664b91129189d4da02aa16537b78 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 9 Sep 2018 14:23:24 +0900 Subject: [PATCH 16/20] blop --- src/snippet/mod.rs | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index c82777782..65d50575c 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -209,32 +209,37 @@ fn compute_matching_terms(query: &Query, searcher: &Searcher, doc_addresses: &[D } pub fn generate_snippet( - doc_addresses: &[DocAddress], - fields: &[Field], searcher: &Searcher, + field: Field, query: &Query, + doc_addresses: &[DocAddress], max_num_chars: usize) -> Result> { + + let mut doc_address_ords: Vec = (0..doc_addresses.len()).collect(); + doc_address_ords.sort_by_key(|k| doc_addresses[*k]); + // TODO sort doc_addresses let matching_terms_per_segment_local_id = compute_matching_terms(query, searcher, doc_addresses)?; - for doc_address in doc_addresses { + for doc_address in doc_addresses { + let segment_ord: u32 = doc_address.segment_ord(); let doc = searcher.doc(doc_address)?; - for &field in fields { - let mut text = String::new(); - for value in doc.get_all(field) { - text.push_str(value.text()); - } + + let mut text = String::new(); + for value in doc.get_all(field) { + text.push_str(value.text()); + } + + if let Some(matching_terms) = matching_terms_per_segment_local_id.get(&segment_ord) { if let Some(tokenizer) = searcher.index().tokenizer_for_field(field) { - if let Some(matching_terms) = matching_terms_per_segment_local_id.get(&doc_address.segment_ord()) { - if let Some(terms) = matching_terms.terms_for_doc(doc_address.doc()) { - let terms: BTreeMap = terms - .iter() - .map(|(term, score)| (term.text().to_string(), *score)) - .collect(); - search_fragments(tokenizer, - &text, - terms, - max_num_chars); - } + if let Some(terms) = matching_terms.terms_for_doc(doc_address.doc()) { + let terms: BTreeMap = terms + .iter() + .map(|(term, score)| (term.text().to_string(), *score)) + .collect(); + let fragment_candidates = search_fragments(tokenizer, + &text, + terms, + max_num_chars); } } } From e32dba1a9747ee2ed68988d910c518d8e4318229 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 10 Sep 2018 09:26:33 +0900 
Subject: [PATCH 17/20] Phrase weight --- src/query/automaton_weight.rs | 42 --------- src/query/boolean_query/boolean_query.rs | 8 ++ src/query/boolean_query/boolean_weight.rs | 9 -- src/query/mod.rs | 2 - src/query/phrase_query/phrase_query.rs | 7 ++ src/query/phrase_query/phrase_weight.rs | 5 - src/query/query.rs | 5 +- src/query/range_query.rs | 5 - src/query/term_query/term_query.rs | 4 + src/query/term_query/term_weight.rs | 21 ----- src/query/weight.rs | 34 ------- src/snippet/mod.rs | 110 ++++++++-------------- 12 files changed, 61 insertions(+), 191 deletions(-) diff --git a/src/query/automaton_weight.rs b/src/query/automaton_weight.rs index 854ecb66e..9ff7b8594 100644 --- a/src/query/automaton_weight.rs +++ b/src/query/automaton_weight.rs @@ -7,7 +7,6 @@ use query::{Scorer, Weight}; use schema::{Field, IndexRecordOption}; use termdict::{TermDictionary, TermStreamer}; use Result; -use query::weight::MatchingTerms; use SkipResult; use Term; use DocId; @@ -41,47 +40,6 @@ impl Weight for AutomatonWeight where A: Automaton, { - fn matching_terms(&self, - reader: &SegmentReader, - matching_terms: &mut MatchingTerms) -> Result<()> { - let max_doc = reader.max_doc(); - let mut doc_bitset = BitSet::with_max_value(max_doc); - - let inverted_index = reader.inverted_index(self.field); - let term_dict = inverted_index.terms(); - let mut term_stream = self.automaton_stream(term_dict); - - let doc_ids = matching_terms.sorted_doc_ids(); - let mut docs_matching_current_term: Vec = vec![]; - - let mut term_buffer: Vec = vec![]; - - while term_stream.advance() { - docs_matching_current_term.clear(); - let term_info = term_stream.value(); - let mut segment_postings = inverted_index.read_postings_from_terminfo(term_info, IndexRecordOption::Basic); - for &doc_id in &doc_ids { - match segment_postings.skip_next(doc_id) { - SkipResult::Reached => { - docs_matching_current_term.push(doc_id); - } - SkipResult::OverStep => {} - SkipResult::End => {} - } - } - if !docs_matching_current_term.is_empty() { - term_buffer.clear(); - let term_ord = term_stream.term_ord(); - inverted_index.terms().ord_to_term(term_ord, &mut term_buffer); - let term = Term::from_field_bytes(self.field, &term_buffer[..]); - for &doc_id in &docs_matching_current_term { - matching_terms.add_term(doc_id, term.clone(), 1f32); - } - } - } - Ok(()) - } - fn scorer(&self, reader: &SegmentReader) -> Result> { let max_doc = reader.max_doc(); let mut doc_bitset = BitSet::with_max_value(max_doc); diff --git a/src/query/boolean_query/boolean_query.rs b/src/query/boolean_query/boolean_query.rs index 286d9f449..b92a203eb 100644 --- a/src/query/boolean_query/boolean_query.rs +++ b/src/query/boolean_query/boolean_query.rs @@ -6,6 +6,7 @@ use query::Weight; use schema::IndexRecordOption; use schema::Term; use Result; +use std::collections::BTreeSet; use Searcher; /// The boolean query combines a set of queries @@ -40,6 +41,7 @@ impl From)>> for BooleanQuery { } impl Query for BooleanQuery { + fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> Result> { let sub_weights = self.subqueries .iter() @@ -49,6 +51,12 @@ impl Query for BooleanQuery { .collect::>()?; Ok(Box::new(BooleanWeight::new(sub_weights, scoring_enabled))) } + + fn query_terms(&self, term_set: &mut BTreeSet) { + for (_occur, subquery) in &self.subqueries { + subquery.query_terms(term_set); + } + } } impl BooleanQuery { diff --git a/src/query/boolean_query/boolean_weight.rs b/src/query/boolean_query/boolean_weight.rs index 2b3348a21..575bc2991 100644 --- 
a/src/query/boolean_query/boolean_weight.rs +++ b/src/query/boolean_query/boolean_weight.rs @@ -13,7 +13,6 @@ use query::Weight; use std::borrow::Borrow; use std::collections::HashMap; use Result; -use query::MatchingTerms; fn scorer_union(scorers: Vec>) -> Box where @@ -108,14 +107,6 @@ impl BooleanWeight { } impl Weight for BooleanWeight { - - fn matching_terms(&self, reader: &SegmentReader, matching_terms: &mut MatchingTerms) -> Result<()> { - for (_, weight) in &self.weights { - weight.matching_terms(reader, matching_terms)?; - } - Ok(()) - } - fn scorer(&self, reader: &SegmentReader) -> Result> { if self.weights.is_empty() { Ok(Box::new(EmptyScorer)) diff --git a/src/query/mod.rs b/src/query/mod.rs index 0b6ee2adb..73a77174b 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -27,8 +27,6 @@ mod weight; mod vec_docset; pub(crate) mod score_combiner; -pub use self::weight::MatchingTerms; - pub use self::intersection::Intersection; pub use self::union::Union; diff --git a/src/query/phrase_query/phrase_query.rs b/src/query/phrase_query/phrase_query.rs index e501711ed..d103461c1 100644 --- a/src/query/phrase_query/phrase_query.rs +++ b/src/query/phrase_query/phrase_query.rs @@ -6,6 +6,7 @@ use query::Query; use query::Weight; use schema::{Field, Term}; use Result; +use std::collections::BTreeSet; /// `PhraseQuery` matches a specific sequence of words. /// @@ -107,4 +108,10 @@ impl Query for PhraseQuery { ))) } } + + fn query_terms(&self, term_set: &mut BTreeSet) { + for (_, query_term) in &self.phrase_terms { + term_set.insert(query_term.clone()); + } + } } diff --git a/src/query/phrase_query/phrase_weight.rs b/src/query/phrase_query/phrase_weight.rs index fbf43db20..69ab4e184 100644 --- a/src/query/phrase_query/phrase_weight.rs +++ b/src/query/phrase_query/phrase_weight.rs @@ -7,7 +7,6 @@ use query::Weight; use schema::IndexRecordOption; use schema::Term; use Result; -use query::MatchingTerms; pub struct PhraseWeight { phrase_terms: Vec<(usize, Term)>, @@ -32,10 +31,6 @@ impl PhraseWeight { impl Weight for PhraseWeight { - fn matching_terms(&self, reader: &SegmentReader, matching_terms: &mut MatchingTerms) -> Result<()> { - unimplemented!(); - } - fn scorer(&self, reader: &SegmentReader) -> Result> { let similarity_weight = self.similarity_weight.clone(); let field = self.phrase_terms[0].1.field(); diff --git a/src/query/query.rs b/src/query/query.rs index 7004768e4..a72c33d00 100644 --- a/src/query/query.rs +++ b/src/query/query.rs @@ -6,7 +6,8 @@ use std::fmt; use Result; use SegmentLocalId; use DocAddress; -use query::weight::MatchingTerms; +use std::collections::BTreeSet; +use Term; /// The `Query` trait defines a set of documents and a scoring method /// for those documents. @@ -60,6 +61,8 @@ pub trait Query: QueryClone + downcast::Any + fmt::Debug { Ok(result) } + fn query_terms(&self, term_set: &mut BTreeSet) {} + /// Search works as follows : /// /// First the weight object associated to the query is created. 
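
The `query_terms` methods introduced in this patch replace the earlier `Weight::matching_terms` plumbing with a much simpler contract: each `Query` implementation reports the terms it was built from by inserting them into a caller-supplied `BTreeSet<Term>`, and composite queries such as `BooleanQuery` simply forward the call to their subqueries, while the trait's default implementation is a no-op. A minimal sketch of how a caller might collect the term set, assuming the 0.7-era API added in this patch; the schema and query string are illustrative only:

```rust
extern crate tantivy;

use std::collections::BTreeSet;

use tantivy::query::{Query, QueryParser};
use tantivy::schema::{SchemaBuilder, TEXT};
use tantivy::{Index, Term};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = SchemaBuilder::default();
    let body = schema_builder.add_text_field("body", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    // "rust language" parses into a boolean query over two term queries;
    // `BooleanQuery::query_terms` recurses into each subquery, so the set
    // ends up holding the terms of the whole query tree.
    let query_parser = QueryParser::for_index(&index, vec![body]);
    let query = query_parser.parse_query("rust language")?;

    let mut terms: BTreeSet<Term> = BTreeSet::new();
    query.query_terms(&mut terms);
    assert_eq!(terms.len(), 2);
    Ok(())
}
```

Because the default implementation does nothing, queries for which enumerating terms is impractical (range or regex queries, for instance) can opt out without breaking callers; they simply contribute no terms.
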
diff --git a/src/query/range_query.rs b/src/query/range_query.rs index 2b22e7cf8..06d98db66 100644 --- a/src/query/range_query.rs +++ b/src/query/range_query.rs @@ -11,7 +11,6 @@ use std::collections::Bound; use std::ops::Range; use termdict::{TermDictionary, TermStreamer}; use Result; -use query::MatchingTerms; fn map_bound TTo>( bound: &Bound, @@ -276,10 +275,6 @@ impl RangeWeight { impl Weight for RangeWeight { - fn matching_terms(&self, reader: &SegmentReader, matching_terms: &mut MatchingTerms) -> Result<()> { - unimplemented!(); - } - fn scorer(&self, reader: &SegmentReader) -> Result> { let max_doc = reader.max_doc(); let mut doc_bitset = BitSet::with_max_value(max_doc); diff --git a/src/query/term_query/term_query.rs b/src/query/term_query/term_query.rs index 9ba10b307..d6cd72288 100644 --- a/src/query/term_query/term_query.rs +++ b/src/query/term_query/term_query.rs @@ -6,6 +6,7 @@ use schema::IndexRecordOption; use Result; use Searcher; use Term; +use std::collections::BTreeSet; /// A Term query matches all of the documents /// containing a specific term. @@ -110,4 +111,7 @@ impl Query for TermQuery { fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> Result> { Ok(Box::new(self.specialized_weight(searcher, scoring_enabled))) } + fn query_terms(&self, term_set: &mut BTreeSet) { + term_set.insert(self.term.clone()); + } } diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index aa1b5e456..162abe519 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -9,7 +9,6 @@ use schema::IndexRecordOption; use Result; use Term; use SkipResult; -use query::weight::MatchingTerms; pub struct TermWeight { term: Term, @@ -40,26 +39,6 @@ impl Weight for TermWeight { } } - - fn matching_terms(&self, - reader: &SegmentReader, - matching_terms: &mut MatchingTerms) -> Result<()> { - let doc_ids = matching_terms.sorted_doc_ids(); - let mut scorer = self.scorer(reader)?; - for doc_id in doc_ids { - match scorer.skip_next(doc_id) { - SkipResult::Reached => { - matching_terms.add_term(doc_id, self.term.clone(), 1f32); - } - SkipResult::OverStep => {} - SkipResult::End => { - break; - } - } - } - Ok(()) - } - fn count(&self, reader: &SegmentReader) -> Result { if reader.num_deleted_docs() == 0 { let field = self.term.field(); diff --git a/src/query/weight.rs b/src/query/weight.rs index 8a12c01da..8bca9ad16 100644 --- a/src/query/weight.rs +++ b/src/query/weight.rs @@ -7,36 +7,6 @@ use Term; use std::collections::BTreeMap; use std::collections::HashMap; -pub struct MatchingTerms { - doc_to_terms: BTreeMap> -} - -impl MatchingTerms { - pub fn from_doc_ids(doc_ids: &[DocId]) -> MatchingTerms { - MatchingTerms { - doc_to_terms: doc_ids - .iter() - .cloned() - .map(|doc_id| (doc_id, HashMap::default())) - .collect() - } - } - - pub fn terms_for_doc(&self, doc_id: DocId) -> Option<&HashMap> { - self.doc_to_terms.get(&doc_id) - } - - pub fn sorted_doc_ids(&self) -> Vec { - self.doc_to_terms.keys().cloned().collect() - } - - pub fn add_term(&mut self, doc_id: DocId, term: Term, score: f32) { - if let Some(terms) = self.doc_to_terms.get_mut(&doc_id) { - terms.insert(term, score); - } - } -} - /// A Weight is the specialization of a Query /// for a given set of segments. /// @@ -46,10 +16,6 @@ pub trait Weight { /// See [`Query`](./trait.Query.html). 
fn scorer(&self, reader: &SegmentReader) -> Result>; - fn matching_terms(&self, reader: &SegmentReader, matching_terms: &mut MatchingTerms) -> Result<()> { - Ok(()) - } - /// Returns the number documents within the given `SegmentReader`. fn count(&self, reader: &SegmentReader) -> Result { Ok(self.scorer(reader)?.count()) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index ffd6613e3..39d1ff89c 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -11,11 +11,11 @@ use query::Query; use DocAddress; use DocId; use Searcher; -use query::MatchingTerms; use schema::Field; use std::collections::HashMap; use SegmentLocalId; use error::TantivyError; +use std::collections::BTreeSet; #[derive(Debug)] pub struct HighlightSection { @@ -129,9 +129,9 @@ impl Snippet { /// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`\ /// has to be a valid string. fn search_fragments<'a>( - tokenizer: Box, + tokenizer: &BoxedTokenizer, text: &'a str, - terms: BTreeMap, + terms: &BTreeMap, max_num_chars: usize, ) -> Vec { let mut token_stream = tokenizer.token_stream(text); @@ -199,75 +199,41 @@ fn select_best_fragment_combination<'a>( } +const DEFAULT_MAX_NUM_CHARS: usize = 150; - -fn compute_matching_terms(query: &Query, searcher: &Searcher, doc_addresses: &[DocAddress]) -> Result> { - let weight = query.weight(searcher, false)?; - let mut doc_groups = doc_addresses - .iter() - .group_by(|doc_address| doc_address.0); - let mut matching_terms_per_segment: HashMap = HashMap::new(); - for (segment_ord, doc_addrs) in doc_groups.into_iter() { - let doc_addrs_vec: Vec = doc_addrs.map(|doc_addr| doc_addr.1).collect(); - let mut matching_terms = MatchingTerms::from_doc_ids(&doc_addrs_vec[..]); - let segment_reader = searcher.segment_reader(segment_ord); - weight.matching_terms(segment_reader, &mut matching_terms)?; - matching_terms_per_segment.insert(segment_ord, matching_terms); - } - Ok(matching_terms_per_segment) +pub struct SnippetGenerator { + terms_text: BTreeMap, + tokenizer: Box, + max_num_chars: usize } -pub fn generate_snippet( - searcher: &Searcher, - query: &Query, - field: Field, - doc_addresses: &[DocAddress], - max_num_chars: usize) -> Result> { - - let mut doc_address_ords: Vec = (0..doc_addresses.len()).collect(); - doc_address_ords.sort_by_key(|k| doc_addresses[*k]); - - let mut snippets = vec![]; - let matching_terms_per_segment_local_id = compute_matching_terms(query, searcher, doc_addresses)?; - - for &doc_address_ord in &doc_address_ords { - let doc_address = doc_addresses[doc_address_ord]; - let segment_ord: u32 = doc_address.segment_ord(); - let doc = searcher.doc(&doc_address)?; - - let mut text = String::new(); - for value in doc.get_all(field) { - text.push_str(value.text()); - } - - - if let Some(matching_terms) = matching_terms_per_segment_local_id.get(&segment_ord) { - let tokenizer = searcher.index().tokenizer_for_field(field)?; - if let Some(terms) = matching_terms.terms_for_doc(doc_address.doc()) { - let terms: BTreeMap = terms - .iter() - .map(|(term, score)| (term.text().to_string(), *score)) - .collect(); - let fragment_candidates = search_fragments(tokenizer, - &text, - terms, - max_num_chars); - let snippet = select_best_fragment_combination(fragment_candidates, &text); - snippets.push(snippet); - } else { - snippets.push(Snippet::empty()); - } - } else { - - } +impl SnippetGenerator { + pub fn new(searcher: &Searcher, + query: &Query, + field: Field) -> Result { + let mut terms = BTreeSet::new(); + query.query_terms(&mut terms); + let 
terms_text: BTreeMap = terms.into_iter() + .filter(|term| term.field() == field) + .map(|term| (term.text().to_string(), 1f32)) + .collect(); + let tokenizer = searcher.index().tokenizer_for_field(field)?; + Ok(SnippetGenerator { + terms_text, + tokenizer, + max_num_chars: DEFAULT_MAX_NUM_CHARS + }) } - // reorder the snippets - for i in 0..doc_addresses.len() { - snippets.swap(i, doc_address_ords[i]); - } + pub fn snippet(&self, text: &str) -> Snippet { + let fragment_candidates = search_fragments(&*self.tokenizer, + &text, + &self.terms_text, + self.max_num_chars); + let snippet = select_best_fragment_combination(fragment_candidates, &text); + snippet - Ok(snippets) + } } #[cfg(test)] @@ -294,7 +260,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl terms.insert(String::from("rust"), 1.0); terms.insert(String::from("language"), 0.9); - let fragments = search_fragments(boxed_tokenizer, &text, terms, 100); + let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 100); assert_eq!(fragments.len(), 7); { let first = fragments.iter().nth(0).unwrap(); @@ -315,7 +281,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl let mut terms = BTreeMap::new(); terms.insert(String::from("c"), 1.0); - let fragments = search_fragments(boxed_tokenizer, &text, terms, 3); + let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3); assert_eq!(fragments.len(), 1); { @@ -339,7 +305,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl let mut terms = BTreeMap::new(); terms.insert(String::from("f"), 1.0); - let fragments = search_fragments(boxed_tokenizer, &text, terms, 3); + let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3); assert_eq!(fragments.len(), 2); { @@ -364,7 +330,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl terms.insert(String::from("f"), 1.0); terms.insert(String::from("a"), 0.9); - let fragments = search_fragments(boxed_tokenizer, &text, terms, 7); + let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 7); assert_eq!(fragments.len(), 2); { @@ -388,7 +354,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl let mut terms = BTreeMap::new(); terms.insert(String::from("z"), 1.0); - let fragments = search_fragments(boxed_tokenizer, &text, terms, 3); + let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3); assert_eq!(fragments.len(), 0); @@ -404,7 +370,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl let text = "a b c d"; let terms = BTreeMap::new(); - let fragments = search_fragments(boxed_tokenizer, &text, terms, 3); + let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3); assert_eq!(fragments.len(), 0); let snippet = select_best_fragment_combination(fragments, &text); From 644d8a3a10bd6ad292360562e6c6a302bb709f78 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 10 Sep 2018 16:39:45 +0900 Subject: [PATCH 18/20] Added snippet generator --- examples/snippet.rs | 10 ++- src/core/index.rs | 2 +- src/lib.rs | 1 + src/query/automaton_weight.rs | 4 -- src/query/query.rs | 3 +- src/query/term_query/term_weight.rs | 1 - src/query/weight.rs | 5 -- src/snippet/mod.rs | 104 +++++++++++++++++++--------- 8 files changed, 78 insertions(+), 52 deletions(-) diff --git a/examples/snippet.rs b/examples/snippet.rs index 4efea1e5a..3cded2bd1 100644 --- a/examples/snippet.rs +++ b/examples/snippet.rs @@ -14,6 +14,8 @@ use 
tantivy::collector::TopCollector; use tantivy::query::QueryParser; use tantivy::schema::*; use tantivy::Index; +use tantivy::SnippetGenerator; +use tempdir::TempDir; fn main() -> tantivy::Result<()> { // Let's create a temporary directory for the @@ -53,15 +55,14 @@ fn main() -> tantivy::Result<()> { let searcher = index.searcher(); let query_parser = QueryParser::for_index(&index, vec![title, body]); - let query = query_parser.parse_query("sycamore spring")?; let mut top_collector = TopCollector::with_limit(10); - searcher.search(&*query, &mut top_collector)?; - let doc_addresses = top_collector.docs(); + let snippet_generator = + let doc_addresses = top_collector.docs(); for doc_address in doc_addresses { let retrieved_doc = searcher.doc(&doc_address)?; // generate_snippet(&retrieved_doc, query @@ -69,6 +70,3 @@ fn main() -> tantivy::Result<()> { Ok(()) } - - -use tempdir::TempDir; diff --git a/src/core/index.rs b/src/core/index.rs index 6c236ff5f..f0df65b75 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -136,7 +136,7 @@ impl Index { Ok(tokenizer) } None => { - Err(TantivyError::SchemaError(format!("{:?} is not a text field.", field_entry.name()))) + Err(TantivyError:: SchemaError(format!("{:?} is not a text field.", field_entry.name()))) } } } diff --git a/src/lib.rs b/src/lib.rs index 5806d5f69..ef00ec4ee 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -214,6 +214,7 @@ pub mod store; pub mod termdict; mod snippet; +pub use self::snippet::SnippetGenerator; mod docset; pub use self::docset::{DocSet, SkipResult}; diff --git a/src/query/automaton_weight.rs b/src/query/automaton_weight.rs index 9ff7b8594..b38e6592d 100644 --- a/src/query/automaton_weight.rs +++ b/src/query/automaton_weight.rs @@ -7,10 +7,6 @@ use query::{Scorer, Weight}; use schema::{Field, IndexRecordOption}; use termdict::{TermDictionary, TermStreamer}; use Result; -use SkipResult; -use Term; -use DocId; -use DocSet; /// A weight struct for Fuzzy Term and Regex Queries pub struct AutomatonWeight diff --git a/src/query/query.rs b/src/query/query.rs index a72c33d00..9bf139b96 100644 --- a/src/query/query.rs +++ b/src/query/query.rs @@ -5,7 +5,6 @@ use downcast; use std::fmt; use Result; use SegmentLocalId; -use DocAddress; use std::collections::BTreeSet; use Term; @@ -61,7 +60,7 @@ pub trait Query: QueryClone + downcast::Any + fmt::Debug { Ok(result) } - fn query_terms(&self, term_set: &mut BTreeSet) {} + fn query_terms(&self, _term_set: &mut BTreeSet) {} /// Search works as follows : /// diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index 162abe519..ba45a8042 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -8,7 +8,6 @@ use query::Weight; use schema::IndexRecordOption; use Result; use Term; -use SkipResult; pub struct TermWeight { term: Term, diff --git a/src/query/weight.rs b/src/query/weight.rs index 8bca9ad16..d3d8b3520 100644 --- a/src/query/weight.rs +++ b/src/query/weight.rs @@ -1,11 +1,6 @@ use super::Scorer; use core::SegmentReader; use Result; -use DocId; -use std::collections::HashSet; -use Term; -use std::collections::BTreeMap; -use std::collections::HashMap; /// A Weight is the specialization of a Query /// for a given set of segments. 
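
At this stage `examples/snippet.rs` still leaves the `let snippet_generator =` line dangling; patch 19 below completes it. For orientation, here is a condensed sketch of where the example ends up, using only APIs introduced in this series (`SnippetGenerator::new`, `snippet_from_doc`, `Snippet::to_html`); the indexed text is shortened and purely illustrative:

```rust
#[macro_use]
extern crate tantivy;

use tantivy::collector::TopCollector;
use tantivy::query::QueryParser;
use tantivy::schema::{SchemaBuilder, TEXT};
use tantivy::{Index, SnippetGenerator};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = SchemaBuilder::default();
    let title = schema_builder.add_text_field("title", TEXT);
    let body = schema_builder.add_text_field("body", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer = index.writer_with_num_threads(1, 40_000_000)?;
    index_writer.add_document(doc!(
        title => "Of Mice and Men",
        body => "A few miles south of Soledad, the Salinas River drops in \
                 close to the hillside bank and runs deep and green."
    ));
    index_writer.commit()?;
    index.load_searchers()?;

    let searcher = index.searcher();
    let query_parser = QueryParser::for_index(&index, vec![title, body]);
    let query = query_parser.parse_query("river")?;

    let mut top_collector = TopCollector::with_limit(10);
    searcher.search(&*query, &mut top_collector)?;

    // One generator per (query, field); it is then reused for every hit.
    let snippet_generator = SnippetGenerator::new(&*searcher, &*query, body)?;
    for doc_address in top_collector.docs() {
        let doc = searcher.doc(&doc_address)?;
        let snippet = snippet_generator.snippet_from_doc(&doc);
        println!("title: {}", doc.get_first(title).unwrap().text().unwrap());
        println!("snippet: {}", snippet.to_html());
    }
    Ok(())
}
```
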
diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 39d1ff89c..9842cdd00 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -1,21 +1,12 @@ use htmlescape::encode_minimal; -use schema::FieldValue; use std::collections::BTreeMap; -use itertools::Itertools; -use tokenizer::BoxedTokenizer; use tokenizer::{Token, TokenStream}; -use Index; use Result; -use Term; use query::Query; -use DocAddress; -use DocId; use Searcher; use schema::Field; -use std::collections::HashMap; -use SegmentLocalId; -use error::TantivyError; use std::collections::BTreeSet; +use tokenizer::BoxedTokenizer; #[derive(Debug)] pub struct HighlightSection { @@ -225,14 +216,16 @@ impl SnippetGenerator { }) } + pub fn set_max_num_chars(&mut self, max_num_chars: usize) { + self.max_num_chars = max_num_chars; + } + pub fn snippet(&self, text: &str) -> Snippet { let fragment_candidates = search_fragments(&*self.tokenizer, &text, &self.terms_text, self.max_num_chars); - let snippet = select_best_fragment_combination(fragment_candidates, &text); - snippet - + select_best_fragment_combination(fragment_candidates, &text) } } @@ -242,39 +235,47 @@ mod tests { use std::collections::BTreeMap; use std::iter::Iterator; use tokenizer::{box_tokenizer, SimpleTokenizer}; + use Index; + use schema::{SchemaBuilder, IndexRecordOption, TextOptions, TextFieldIndexing}; + use SnippetGenerator; + use query::QueryParser; - const TOKENIZER: SimpleTokenizer = SimpleTokenizer; + + const TEST_TEXT: &'static str = r#"Rust is a systems programming language sponsored by Mozilla which +describes it as a "safe, concurrent, practical language", supporting functional and +imperative-procedural paradigms. Rust is syntactically similar to C++[according to whom?], +but its designers intend it to provide better memory safety while still maintaining +performance. + +Rust is free and open-source software, released under an MIT License, or Apache License +2.0. Its designers have refined the language through the experiences of writing the Servo +web browser layout engine[14] and the Rust compiler. A large proportion of current commits +to the project are from community members.[15] + +Rust won first place for "most loved programming language" in the Stack Overflow Developer +Survey in 2016, 2017, and 2018."#; #[test] fn test_snippet() { - let boxed_tokenizer = box_tokenizer(TOKENIZER); - - let text = "Rust is a systems programming language sponsored by Mozilla which describes it as a \"safe, concurrent, practical language\", supporting functional and imperative-procedural paradigms. Rust is syntactically similar to C++[according to whom?], but its designers intend it to provide better memory safety while still maintaining performance. - -Rust is free and open-source software, released under an MIT License, or Apache License 2.0. Its designers have refined the language through the experiences of writing the Servo web browser layout engine[14] and the Rust compiler. A large proportion of current commits to the project are from community members.[15] - -Rust won first place for \"most loved programming language\" in the Stack Overflow Developer Survey in 2016, 2017, and 2018. 
-"; - + let boxed_tokenizer = box_tokenizer(SimpleTokenizer); let mut terms = BTreeMap::new(); terms.insert(String::from("rust"), 1.0); terms.insert(String::from("language"), 0.9); - - let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 100); + let fragments = search_fragments(&*boxed_tokenizer, TEST_TEXT, &terms, 100); assert_eq!(fragments.len(), 7); { let first = fragments.iter().nth(0).unwrap(); assert_eq!(first.score, 1.9); assert_eq!(first.stop_offset, 89); } - let snippet = select_best_fragment_combination(fragments, &text); - assert_eq!(snippet.fragments, "Rust is a systems programming language sponsored by Mozilla which describes it as a \"safe".to_owned()); - assert_eq!(snippet.to_html(), "Rust is a systems programming language sponsored by Mozilla which describes it as a "safe".to_owned()) + let snippet = select_best_fragment_combination(fragments, &TEST_TEXT); + assert_eq!(snippet.fragments, "Rust is a systems programming language sponsored by Mozilla which\ndescribes it as a \"safe".to_owned()); + assert_eq!(snippet.to_html(), "Rust is a systems programming language sponsored by Mozilla which\ndescribes it as a "safe".to_owned()) } #[test] fn test_snippet_in_second_fragment() { - let boxed_tokenizer = box_tokenizer(TOKENIZER); + let boxed_tokenizer = box_tokenizer(SimpleTokenizer); let text = "a b c d e f g"; @@ -298,7 +299,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl #[test] fn test_snippet_with_term_at_the_end_of_fragment() { - let boxed_tokenizer = box_tokenizer(TOKENIZER); + let boxed_tokenizer = box_tokenizer(SimpleTokenizer); let text = "a b c d e f f g"; @@ -322,7 +323,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl #[test] fn test_snippet_with_second_fragment_has_the_highest_score() { - let boxed_tokenizer = box_tokenizer(TOKENIZER); + let boxed_tokenizer = box_tokenizer(SimpleTokenizer); let text = "a b c d e f g"; @@ -347,7 +348,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl #[test] fn test_snippet_with_term_not_in_text() { - let boxed_tokenizer = box_tokenizer(TOKENIZER); + let boxed_tokenizer = box_tokenizer(SimpleTokenizer); let text = "a b c d"; @@ -365,7 +366,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl #[test] fn test_snippet_with_no_terms() { - let boxed_tokenizer = box_tokenizer(TOKENIZER); + let boxed_tokenizer = box_tokenizer(SimpleTokenizer); let text = "a b c d"; @@ -377,4 +378,41 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl assert_eq!(snippet.fragments, ""); assert_eq!(snippet.to_html(), ""); } + + #[test] + fn test_snippet_generator() { + let mut schema_builder = SchemaBuilder::default (); + let text_options = TextOptions::default() + .set_indexing_options(TextFieldIndexing::default() + .set_tokenizer("en_stem") + .set_index_option(IndexRecordOption::Basic) + ); + let text_field = schema_builder.add_text_field("text", text_options); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + { + // writing the segment + let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); + { + let doc = doc ! 
(text_field => TEST_TEXT); + index_writer.add_document(doc); + } + index_writer.commit().unwrap(); + } + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let query_parser = QueryParser::for_index(&index, vec![text_field]); + let query = query_parser.parse_query("rust design").unwrap(); + let mut snippet_generator = SnippetGenerator::new(&*searcher, &*query, text_field).unwrap(); + { + let snippet = snippet_generator.snippet(TEST_TEXT); + assert_eq!(snippet.to_html(), "imperative-procedural paradigms. Rust is syntactically similar to C++[according to whom?],\nbut its designers intend it to provide better memory safety"); + } + { + snippet_generator.set_max_num_chars(90); + let snippet = snippet_generator.snippet(TEST_TEXT); + assert_eq!(snippet.to_html(), "Rust is syntactically similar to C++[according to whom?],\nbut its designers intend it to"); + } + + } } From 63868733a38fa57fccf3d2e6e52ae1c5462a01ba Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 11 Sep 2018 09:45:27 +0900 Subject: [PATCH 19/20] Added SnippetGenerator --- Cargo.toml | 1 - examples/snippet.rs | 9 ++-- src/common/mod.rs | 21 +--------- src/core/index.rs | 25 +++++++++++ src/core/searcher.rs | 1 + src/indexer/merger.rs | 10 ++--- src/lib.rs | 6 +-- src/query/query.rs | 2 + src/schema/schema.rs | 4 +- src/schema/value.rs | 6 +-- src/snippet/mod.rs | 67 ++++++++++++++++++++++++++++-- src/store/mod.rs | 2 +- src/tokenizer/mod.rs | 4 +- src/tokenizer/tokenizer.rs | 2 +- src/tokenizer/tokenizer_manager.rs | 2 +- 15 files changed, 117 insertions(+), 45 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 887f7a9b9..6a9b313f7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -62,7 +62,6 @@ opt-level = 3 debug = false lto = true debug-assertions = false -overflow-checks = false [profile.test] debug-assertions = true diff --git a/examples/snippet.rs b/examples/snippet.rs index 3cded2bd1..bc31a3e38 100644 --- a/examples/snippet.rs +++ b/examples/snippet.rs @@ -35,7 +35,6 @@ fn main() -> tantivy::Result<()> { let title = schema.get_field("title").unwrap(); let body = schema.get_field("body").unwrap(); - let mut old_man_doc = Document::default(); // we'll only need one doc for this example. 
index_writer.add_document(doc!( title => "Of Mice and Men", @@ -60,12 +59,14 @@ fn main() -> tantivy::Result<()> { let mut top_collector = TopCollector::with_limit(10); searcher.search(&*query, &mut top_collector)?; - let snippet_generator = + let snippet_generator = SnippetGenerator::new(&*searcher, &*query, body)?; let doc_addresses = top_collector.docs(); for doc_address in doc_addresses { - let retrieved_doc = searcher.doc(&doc_address)?; - // generate_snippet(&retrieved_doc, query + let doc = searcher.doc(&doc_address)?; + let snippet = snippet_generator.snippet_from_doc(&doc); + println!("title: {}", doc.get_first(title).unwrap().text().unwrap()); + println!("snippet: {}", snippet.to_html()); } Ok(()) diff --git a/src/common/mod.rs b/src/common/mod.rs index 778f0476a..2942438b4 100644 --- a/src/common/mod.rs +++ b/src/common/mod.rs @@ -68,17 +68,6 @@ pub trait HasLen { } } - -pub fn is_stricly_sorted(els: &[T]) -> bool { - if els.is_empty() { - true - } else { - els.iter() - .zip(els[1..].iter()) - .all(|(left, right)| left < right) - } -} - const HIGHEST_BIT: u64 = 1 << 63; /// Maps a `i64` to `u64` @@ -116,20 +105,12 @@ pub fn u64_to_i64(val: u64) -> i64 { pub(crate) mod test { pub use super::serialize::test::fixed_size_test; - use super::{compute_num_bits, i64_to_u64, u64_to_i64, is_stricly_sorted}; + use super::{compute_num_bits, i64_to_u64, u64_to_i64}; fn test_i64_converter_helper(val: i64) { assert_eq!(u64_to_i64(i64_to_u64(val)), val); } - - #[test] - fn test_is_strictly_sorted() { - assert!(is_stricly_sorted::(&[])); - assert!(is_stricly_sorted(&[1])); - assert!(is_stricly_sorted(&[1, 2, 3])); - assert!(!is_stricly_sorted(&[1, 3, 2])); - } #[test] fn test_i64_converter() { assert_eq!(i64_to_u64(i64::min_value()), u64::min_value()); diff --git a/src/core/index.rs b/src/core/index.rs index f0df65b75..da1744961 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -115,6 +115,8 @@ impl Index { &self.tokenizers } + + /// Helper to access the tokenizer associated to a specific field. 
pub fn tokenizer_for_field(&self, field: Field) -> Result> { let field_entry = self.schema.get_field_entry(field); let field_type = field_entry.field_type(); @@ -325,3 +327,26 @@ impl Clone for Index { } } } + + +#[cfg(test)] +mod tests { + use Index; + use schema::{SchemaBuilder, TEXT, INT_INDEXED}; + + #[test] + fn test_indexer_for_field() { + let mut schema_builder = SchemaBuilder::default(); + let num_likes_field = schema_builder.add_u64_field("num_likes", INT_INDEXED); + let body_field = schema_builder.add_text_field("body", TEXT); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + assert!(index.tokenizer_for_field(body_field).is_ok()); + assert_eq!( + format!("{:?}", index.tokenizer_for_field(num_likes_field).err()), + "Some(SchemaError(\"\\\"num_likes\\\" is not a text field.\"))" + ); + } + + +} \ No newline at end of file diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 9de6c857c..f17df042f 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -33,6 +33,7 @@ impl Searcher { } } + /// Returns the `Index` associated to the `Searcher` pub fn index(&self) -> &Index { &self.index } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index e79551a4c..5d2e17c51 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -770,23 +770,23 @@ mod tests { } { let doc = searcher.doc(&DocAddress(0, 0)).unwrap(); - assert_eq!(doc.get_first(text_field).unwrap().text(), "af b"); + assert_eq!(doc.get_first(text_field).unwrap().text(), Some("af b")); } { let doc = searcher.doc(&DocAddress(0, 1)).unwrap(); - assert_eq!(doc.get_first(text_field).unwrap().text(), "a b c"); + assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c")); } { let doc = searcher.doc(&DocAddress(0, 2)).unwrap(); - assert_eq!(doc.get_first(text_field).unwrap().text(), "a b c d"); + assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c d")); } { let doc = searcher.doc(&DocAddress(0, 3)).unwrap(); - assert_eq!(doc.get_first(text_field).unwrap().text(), "af b"); + assert_eq!(doc.get_first(text_field).unwrap().text(), Some("af b")); } { let doc = searcher.doc(&DocAddress(0, 4)).unwrap(); - assert_eq!(doc.get_first(text_field).unwrap().text(), "a b c g"); + assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c g")); } { let get_fast_vals = |terms: Vec| { diff --git a/src/lib.rs b/src/lib.rs index ef00ec4ee..33fb62eb8 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -899,11 +899,11 @@ mod tests { assert_eq!(document.len(), 3); let values = document.get_all(text_field); assert_eq!(values.len(), 2); - assert_eq!(values[0].text(), "tantivy"); - assert_eq!(values[1].text(), "some other value"); + assert_eq!(values[0].text(), Some("tantivy")); + assert_eq!(values[1].text(), Some("some other value")); let values = document.get_all(other_text_field); assert_eq!(values.len(), 1); - assert_eq!(values[0].text(), "short"); + assert_eq!(values[0].text(), Some("short")); } #[test] diff --git a/src/query/query.rs b/src/query/query.rs index 9bf139b96..6abbf35e0 100644 --- a/src/query/query.rs +++ b/src/query/query.rs @@ -60,6 +60,8 @@ pub trait Query: QueryClone + downcast::Any + fmt::Debug { Ok(result) } + /// Extract all of the terms associated to the query and insert them in the + /// term set given in arguments. 
fn query_terms(&self, _term_set: &mut BTreeSet) {} /// Search works as follows : diff --git a/src/schema/schema.rs b/src/schema/schema.rs index 6d4f6c949..d000ab9e2 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -443,8 +443,8 @@ mod tests { }"#, ) .unwrap(); - assert_eq!(doc.get_first(title_field).unwrap().text(), "my title"); - assert_eq!(doc.get_first(author_field).unwrap().text(), "fulmicoton"); + assert_eq!(doc.get_first(title_field).unwrap().text(), Some("my title")); + assert_eq!(doc.get_first(author_field).unwrap().text(), Some("fulmicoton")); assert_eq!(doc.get_first(count_field).unwrap().u64_value(), 4); assert_eq!(doc.get_first(popularity_field).unwrap().i64_value(), 10); } diff --git a/src/schema/value.rs b/src/schema/value.rs index f5ce151f1..64b0dc795 100644 --- a/src/schema/value.rs +++ b/src/schema/value.rs @@ -74,10 +74,10 @@ impl Value { /// /// # Panics /// If the value is not of type `Str` - pub fn text(&self) -> &str { + pub fn text(&self) -> Option<&str> { match *self { - Value::Str(ref text) => text, - _ => panic!("This is not a text field."), + Value::Str(ref text) => Some(text), + _ => None, } } diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 9842cdd00..6703d6411 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -7,6 +7,9 @@ use Searcher; use schema::Field; use std::collections::BTreeSet; use tokenizer::BoxedTokenizer; +use Document; + +const DEFAULT_MAX_NUM_CHARS: usize = 150; #[derive(Debug)] pub struct HighlightSection { @@ -189,16 +192,58 @@ fn select_best_fragment_combination<'a>( } } - -const DEFAULT_MAX_NUM_CHARS: usize = 150; - +/// `SnippetGenerator` +/// +/// # Example +/// +/// ```rust +/// # #[macro_use] +/// # extern crate tantivy; +/// # use tantivy::Index; +/// # use tantivy::schema::{SchemaBuilder, TEXT}; +/// # use tantivy::query::QueryParser; +/// use tantivy::SnippetGenerator; +/// +/// # fn main() -> tantivy::Result<()> { +/// # let mut schema_builder = SchemaBuilder::default(); +/// # let text_field = schema_builder.add_text_field("text", TEXT); +/// # let schema = schema_builder.build(); +/// # let index = Index::create_in_ram(schema); +/// # let mut index_writer = index.writer_with_num_threads(1, 30_000_000)?; +/// # let doc = doc!(text_field => r#"Comme je descendais des Fleuves impassibles, +/// # Je ne me sentis plus guidé par les haleurs : +/// # Des Peaux-Rouges criards les avaient pris pour cibles, +/// # Les ayant cloués nus aux poteaux de couleurs. +/// # +/// # J'étais insoucieux de tous les équipages, +/// # Porteur de blés flamands ou de cotons anglais. +/// # Quand avec mes haleurs ont fini ces tapages, +/// # Les Fleuves m'ont laissé descendre où je voulais. +/// # "#); +/// # index_writer.add_document(doc.clone()); +/// # index_writer.commit()?; +/// # let query_parser = QueryParser::for_index(&index, vec![text_field]); +/// // ... 
+/// let query = query_parser.parse_query("haleurs flamands").unwrap(); +/// # index.load_searchers()?; +/// # let searcher = index.searcher(); +/// let mut snippet_generator = SnippetGenerator::new(&*searcher, &*query, text_field)?; +/// snippet_generator.set_max_num_chars(100); +/// let snippet = snippet_generator.snippet_from_doc(&doc); +/// let snippet_html: String = snippet.to_html(); +/// assert_eq!(snippet_html, "Comme je descendais des Fleuves impassibles,\n Je ne me sentis plus guidé par les haleurs :\n Des"); +/// # Ok(()) +/// # } +/// ``` pub struct SnippetGenerator { terms_text: BTreeMap, tokenizer: Box, + field: Field, max_num_chars: usize } impl SnippetGenerator { + /// Creates a new snippet generator pub fn new(searcher: &Searcher, query: &Query, field: Field) -> Result { @@ -212,14 +257,30 @@ impl SnippetGenerator { Ok(SnippetGenerator { terms_text, tokenizer, + field, max_num_chars: DEFAULT_MAX_NUM_CHARS }) } + /// Sets a maximum number of chars. pub fn set_max_num_chars(&mut self, max_num_chars: usize) { self.max_num_chars = max_num_chars; } + /// Generates a snippet for the given `Document`. + /// + /// This method extract the text associated to the `SnippetGenerator`'s field + /// and computes a snippet. + pub fn snippet_from_doc(&self, doc: &Document) -> Snippet { + let text: String = doc.get_all(self.field) + .into_iter() + .flat_map(|val| val.text()) + .collect::>() + .join(" "); + self.snippet(&text) + } + + /// Generates a snippet for the given text. pub fn snippet(&self, text: &str) -> Snippet { let fragment_candidates = search_fragments(&*self.tokenizer, &text, diff --git a/src/store/mod.rs b/src/store/mod.rs index 5d71563e1..7bce9085d 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -109,7 +109,7 @@ pub mod tests { let store = StoreReader::from_source(store_source); for i in 0..1_000 { assert_eq!( - *store.get(i).unwrap().get_first(field_title).unwrap().text(), + *store.get(i).unwrap().get_first(field_title).unwrap().text().unwrap(), format!("Doc {}", i) ); } diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index d4a735bd2..e8bb3527f 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -153,7 +153,9 @@ pub use self::simple_tokenizer::SimpleTokenizer; pub use self::stemmer::Stemmer; pub use self::stop_word_filter::StopWordFilter; pub(crate) use self::token_stream_chain::TokenStreamChain; -pub use self::tokenizer::{BoxedTokenizer, box_tokenizer}; +pub use self::tokenizer::BoxedTokenizer; +pub(crate) use self::tokenizer::box_tokenizer; + pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer}; pub use self::tokenizer_manager::TokenizerManager; diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs index e806b70d8..fcdf8f21b 100644 --- a/src/tokenizer/tokenizer.rs +++ b/src/tokenizer/tokenizer.rs @@ -130,7 +130,7 @@ where } } -pub fn box_tokenizer(a: A) -> Box +pub(crate) fn box_tokenizer(a: A) -> Box where A: 'static + Send + Sync + for<'a> Tokenizer<'a>, { diff --git a/src/tokenizer/tokenizer_manager.rs b/src/tokenizer/tokenizer_manager.rs index cbb46af3b..447dea303 100644 --- a/src/tokenizer/tokenizer_manager.rs +++ b/src/tokenizer/tokenizer_manager.rs @@ -1,6 +1,6 @@ use std::collections::HashMap; use std::sync::{Arc, RwLock}; -use tokenizer::tokenizer::box_tokenizer; +use tokenizer::box_tokenizer; use tokenizer::BoxedTokenizer; use tokenizer::JapaneseTokenizer; use tokenizer::LowerCaser; From cc23194c581a1b8b56b6e62fd932929701dd83b2 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 11 Sep 2018 10:05:15 
+0900
Subject: [PATCH 20/20] Editing document

---
 CHANGELOG.md       |  2 +-
 src/snippet/mod.rs | 26 +++++++++++++-------------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d2256923a..718840223 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,7 @@ Tantivy 0.7
 greatly improving performance
 - Tantivy error now relies on the failure crate (@drusellers)
 - Added support for `AND`, `OR`, `NOT` syntax in addition to the `+`,`-` syntax
-
+- Added a snippet generator with highlighting (@vigneshsarma, @fulmicoton)
 
 Tantivy 0.6.1
 =========================

diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs
index 6703d6411..a3d2c48e3 100644
--- a/src/snippet/mod.rs
+++ b/src/snippet/mod.rs
@@ -8,6 +8,7 @@ use schema::Field;
 use std::collections::BTreeSet;
 use tokenizer::BoxedTokenizer;
 use Document;
+use std::cmp::Ordering;
 
 const DEFAULT_MAX_NUM_CHARS: usize = 150;
 
@@ -156,18 +157,17 @@ fn select_best_fragment_combination<'a>(
     fragments: Vec<FragmentCandidate>,
     text: &'a str,
 ) -> Snippet {
-    if let Some(init) = fragments.iter().nth(0) {
-        let fragment =
-            fragments.iter().skip(1).fold(
-                init,
-                |acc, item| {
-                    if item.score > acc.score {
-                        item
-                    } else {
-                        acc
-                    }
-                },
-            );
+    let best_fragment_opt = fragments
+        .iter()
+        .max_by(|left, right| {
+            let cmp_score = left.score.partial_cmp(&right.score).unwrap_or(Ordering::Equal);
+            if cmp_score == Ordering::Equal {
+                (right.start_offset, right.stop_offset).cmp(&(left.start_offset, left.stop_offset))
+            } else {
+                cmp_score
+            }
+        });
+    if let Some(fragment) = best_fragment_opt {
         let fragment_text = &text[fragment.start_offset..fragment.stop_offset];
         let highlighted = fragment
             .highlighted
             .iter()
             .map(|item| {
                 HighlightSection::new(
                     item.start - fragment.start_offset,
                     item.stop - fragment.start_offset,
                 )
             }).collect();
         Snippet {
-            fragments: fragment_text.to_owned(),
+            fragments: fragment_text.to_string(),
             highlighted: highlighted,
         }
     } else {
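
A closing note on the `max_by` rewrite in this last patch: `f32` is only `PartialOrd`, so scores are compared through `partial_cmp`, with incomparable values (NaN) treated as equal, and a score tie falls through to an offset comparison whose operands are deliberately swapped, so that of two equally scored fragments the one starting earlier wins. The following self-contained sketch mirrors that selection rule; `Fragment` here is only a stand-in for the crate-private `FragmentCandidate`:

```rust
use std::cmp::Ordering;

// Stand-in for the snippet module's `FragmentCandidate`.
struct Fragment {
    score: f32,
    start_offset: usize,
    stop_offset: usize,
}

fn best_fragment(fragments: &[Fragment]) -> Option<&Fragment> {
    fragments.iter().max_by(|left, right| {
        // `f32` has no total order, so fall back to Equal for NaN.
        let cmp_score = left
            .score
            .partial_cmp(&right.score)
            .unwrap_or(Ordering::Equal);
        if cmp_score == Ordering::Equal {
            // Swapped operands: for equal scores, the fragment with the
            // *smaller* (earlier) offsets compares as the maximum.
            (right.start_offset, right.stop_offset)
                .cmp(&(left.start_offset, left.stop_offset))
        } else {
            cmp_score
        }
    })
}

fn main() {
    let fragments = vec![
        Fragment { score: 0.9, start_offset: 10, stop_offset: 40 },
        Fragment { score: 1.9, start_offset: 50, stop_offset: 80 },
        Fragment { score: 1.9, start_offset: 0, stop_offset: 30 },
    ];
    // The two 1.9 fragments tie on score; the tie-break picks the one
    // starting at offset 0.
    let best = best_fragment(&fragments).unwrap();
    assert_eq!(best.start_offset, 0);
}
```
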