diff --git a/examples/snippet.rs b/examples/snippet.rs index 4efea1e5a..3cded2bd1 100644 --- a/examples/snippet.rs +++ b/examples/snippet.rs @@ -14,6 +14,8 @@ use tantivy::collector::TopCollector; use tantivy::query::QueryParser; use tantivy::schema::*; use tantivy::Index; +use tantivy::SnippetGenerator; +use tempdir::TempDir; fn main() -> tantivy::Result<()> { // Let's create a temporary directory for the @@ -53,15 +55,14 @@ fn main() -> tantivy::Result<()> { let searcher = index.searcher(); let query_parser = QueryParser::for_index(&index, vec![title, body]); - let query = query_parser.parse_query("sycamore spring")?; let mut top_collector = TopCollector::with_limit(10); - searcher.search(&*query, &mut top_collector)?; - let doc_addresses = top_collector.docs(); + let snippet_generator = + let doc_addresses = top_collector.docs(); for doc_address in doc_addresses { let retrieved_doc = searcher.doc(&doc_address)?; // generate_snippet(&retrieved_doc, query @@ -69,6 +70,3 @@ fn main() -> tantivy::Result<()> { Ok(()) } - - -use tempdir::TempDir; diff --git a/src/core/index.rs b/src/core/index.rs index 6c236ff5f..f0df65b75 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -136,7 +136,7 @@ impl Index { Ok(tokenizer) } None => { - Err(TantivyError::SchemaError(format!("{:?} is not a text field.", field_entry.name()))) + Err(TantivyError:: SchemaError(format!("{:?} is not a text field.", field_entry.name()))) } } } diff --git a/src/lib.rs b/src/lib.rs index 5806d5f69..ef00ec4ee 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -214,6 +214,7 @@ pub mod store; pub mod termdict; mod snippet; +pub use self::snippet::SnippetGenerator; mod docset; pub use self::docset::{DocSet, SkipResult}; diff --git a/src/query/automaton_weight.rs b/src/query/automaton_weight.rs index 9ff7b8594..b38e6592d 100644 --- a/src/query/automaton_weight.rs +++ b/src/query/automaton_weight.rs @@ -7,10 +7,6 @@ use query::{Scorer, Weight}; use schema::{Field, IndexRecordOption}; use termdict::{TermDictionary, TermStreamer}; use Result; -use SkipResult; -use Term; -use DocId; -use DocSet; /// A weight struct for Fuzzy Term and Regex Queries pub struct AutomatonWeight diff --git a/src/query/query.rs b/src/query/query.rs index a72c33d00..9bf139b96 100644 --- a/src/query/query.rs +++ b/src/query/query.rs @@ -5,7 +5,6 @@ use downcast; use std::fmt; use Result; use SegmentLocalId; -use DocAddress; use std::collections::BTreeSet; use Term; @@ -61,7 +60,7 @@ pub trait Query: QueryClone + downcast::Any + fmt::Debug { Ok(result) } - fn query_terms(&self, term_set: &mut BTreeSet) {} + fn query_terms(&self, _term_set: &mut BTreeSet) {} /// Search works as follows : /// diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index 162abe519..ba45a8042 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -8,7 +8,6 @@ use query::Weight; use schema::IndexRecordOption; use Result; use Term; -use SkipResult; pub struct TermWeight { term: Term, diff --git a/src/query/weight.rs b/src/query/weight.rs index 8bca9ad16..d3d8b3520 100644 --- a/src/query/weight.rs +++ b/src/query/weight.rs @@ -1,11 +1,6 @@ use super::Scorer; use core::SegmentReader; use Result; -use DocId; -use std::collections::HashSet; -use Term; -use std::collections::BTreeMap; -use std::collections::HashMap; /// A Weight is the specialization of a Query /// for a given set of segments. diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 39d1ff89c..9842cdd00 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -1,21 +1,12 @@ use htmlescape::encode_minimal; -use schema::FieldValue; use std::collections::BTreeMap; -use itertools::Itertools; -use tokenizer::BoxedTokenizer; use tokenizer::{Token, TokenStream}; -use Index; use Result; -use Term; use query::Query; -use DocAddress; -use DocId; use Searcher; use schema::Field; -use std::collections::HashMap; -use SegmentLocalId; -use error::TantivyError; use std::collections::BTreeSet; +use tokenizer::BoxedTokenizer; #[derive(Debug)] pub struct HighlightSection { @@ -225,14 +216,16 @@ impl SnippetGenerator { }) } + pub fn set_max_num_chars(&mut self, max_num_chars: usize) { + self.max_num_chars = max_num_chars; + } + pub fn snippet(&self, text: &str) -> Snippet { let fragment_candidates = search_fragments(&*self.tokenizer, &text, &self.terms_text, self.max_num_chars); - let snippet = select_best_fragment_combination(fragment_candidates, &text); - snippet - + select_best_fragment_combination(fragment_candidates, &text) } } @@ -242,39 +235,47 @@ mod tests { use std::collections::BTreeMap; use std::iter::Iterator; use tokenizer::{box_tokenizer, SimpleTokenizer}; + use Index; + use schema::{SchemaBuilder, IndexRecordOption, TextOptions, TextFieldIndexing}; + use SnippetGenerator; + use query::QueryParser; - const TOKENIZER: SimpleTokenizer = SimpleTokenizer; + + const TEST_TEXT: &'static str = r#"Rust is a systems programming language sponsored by Mozilla which +describes it as a "safe, concurrent, practical language", supporting functional and +imperative-procedural paradigms. Rust is syntactically similar to C++[according to whom?], +but its designers intend it to provide better memory safety while still maintaining +performance. + +Rust is free and open-source software, released under an MIT License, or Apache License +2.0. Its designers have refined the language through the experiences of writing the Servo +web browser layout engine[14] and the Rust compiler. A large proportion of current commits +to the project are from community members.[15] + +Rust won first place for "most loved programming language" in the Stack Overflow Developer +Survey in 2016, 2017, and 2018."#; #[test] fn test_snippet() { - let boxed_tokenizer = box_tokenizer(TOKENIZER); - - let text = "Rust is a systems programming language sponsored by Mozilla which describes it as a \"safe, concurrent, practical language\", supporting functional and imperative-procedural paradigms. Rust is syntactically similar to C++[according to whom?], but its designers intend it to provide better memory safety while still maintaining performance. - -Rust is free and open-source software, released under an MIT License, or Apache License 2.0. Its designers have refined the language through the experiences of writing the Servo web browser layout engine[14] and the Rust compiler. A large proportion of current commits to the project are from community members.[15] - -Rust won first place for \"most loved programming language\" in the Stack Overflow Developer Survey in 2016, 2017, and 2018. -"; - + let boxed_tokenizer = box_tokenizer(SimpleTokenizer); let mut terms = BTreeMap::new(); terms.insert(String::from("rust"), 1.0); terms.insert(String::from("language"), 0.9); - - let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 100); + let fragments = search_fragments(&*boxed_tokenizer, TEST_TEXT, &terms, 100); assert_eq!(fragments.len(), 7); { let first = fragments.iter().nth(0).unwrap(); assert_eq!(first.score, 1.9); assert_eq!(first.stop_offset, 89); } - let snippet = select_best_fragment_combination(fragments, &text); - assert_eq!(snippet.fragments, "Rust is a systems programming language sponsored by Mozilla which describes it as a \"safe".to_owned()); - assert_eq!(snippet.to_html(), "Rust is a systems programming language sponsored by Mozilla which describes it as a "safe".to_owned()) + let snippet = select_best_fragment_combination(fragments, &TEST_TEXT); + assert_eq!(snippet.fragments, "Rust is a systems programming language sponsored by Mozilla which\ndescribes it as a \"safe".to_owned()); + assert_eq!(snippet.to_html(), "Rust is a systems programming language sponsored by Mozilla which\ndescribes it as a "safe".to_owned()) } #[test] fn test_snippet_in_second_fragment() { - let boxed_tokenizer = box_tokenizer(TOKENIZER); + let boxed_tokenizer = box_tokenizer(SimpleTokenizer); let text = "a b c d e f g"; @@ -298,7 +299,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl #[test] fn test_snippet_with_term_at_the_end_of_fragment() { - let boxed_tokenizer = box_tokenizer(TOKENIZER); + let boxed_tokenizer = box_tokenizer(SimpleTokenizer); let text = "a b c d e f f g"; @@ -322,7 +323,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl #[test] fn test_snippet_with_second_fragment_has_the_highest_score() { - let boxed_tokenizer = box_tokenizer(TOKENIZER); + let boxed_tokenizer = box_tokenizer(SimpleTokenizer); let text = "a b c d e f g"; @@ -347,7 +348,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl #[test] fn test_snippet_with_term_not_in_text() { - let boxed_tokenizer = box_tokenizer(TOKENIZER); + let boxed_tokenizer = box_tokenizer(SimpleTokenizer); let text = "a b c d"; @@ -365,7 +366,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl #[test] fn test_snippet_with_no_terms() { - let boxed_tokenizer = box_tokenizer(TOKENIZER); + let boxed_tokenizer = box_tokenizer(SimpleTokenizer); let text = "a b c d"; @@ -377,4 +378,41 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl assert_eq!(snippet.fragments, ""); assert_eq!(snippet.to_html(), ""); } + + #[test] + fn test_snippet_generator() { + let mut schema_builder = SchemaBuilder::default (); + let text_options = TextOptions::default() + .set_indexing_options(TextFieldIndexing::default() + .set_tokenizer("en_stem") + .set_index_option(IndexRecordOption::Basic) + ); + let text_field = schema_builder.add_text_field("text", text_options); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + { + // writing the segment + let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); + { + let doc = doc ! (text_field => TEST_TEXT); + index_writer.add_document(doc); + } + index_writer.commit().unwrap(); + } + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let query_parser = QueryParser::for_index(&index, vec![text_field]); + let query = query_parser.parse_query("rust design").unwrap(); + let mut snippet_generator = SnippetGenerator::new(&*searcher, &*query, text_field).unwrap(); + { + let snippet = snippet_generator.snippet(TEST_TEXT); + assert_eq!(snippet.to_html(), "imperative-procedural paradigms. Rust is syntactically similar to C++[according to whom?],\nbut its designers intend it to provide better memory safety"); + } + { + snippet_generator.set_max_num_chars(90); + let snippet = snippet_generator.snippet(TEST_TEXT); + assert_eq!(snippet.to_html(), "Rust is syntactically similar to C++[according to whom?],\nbut its designers intend it to"); + } + + } }