diff --git a/CHANGELOG.md b/CHANGELOG.md
index d2256923a..718840223 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,7 @@ Tantivy 0.7
   greatly improving performance
 - Tantivy error now rely on the failure crate (@drusellers)
 - Added support for `AND`, `OR`, `NOT` syntax in addition to the `+`,`-` syntax
-
+- Added a snippet generator with highlight (@vigneshsarma, @fulmicoton)
 
 Tantivy 0.6.1
 =========================
diff --git a/Cargo.toml b/Cargo.toml
index 53a318fc2..67bf67824 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -47,6 +47,7 @@ census = "0.1"
 fnv = "1.0.6"
 owned-read = "0.4"
 failure = "0.1"
+htmlescape = "0.3.1"
 fail = "0.2"
 
 [target.'cfg(windows)'.dependencies]
@@ -60,7 +61,6 @@ opt-level = 3
 debug = false
 lto = true
 debug-assertions = false
-overflow-checks = false
 
 [profile.test]
 debug-assertions = true
diff --git a/examples/snippet.rs b/examples/snippet.rs
new file mode 100644
index 000000000..bc31a3e38
--- /dev/null
+++ b/examples/snippet.rs
@@ -0,0 +1,73 @@
+// # Snippet example
+//
+// This example shows how to return a representative snippet of
+// your hit result.
+// Snippets are extracts of a target document, returned in HTML format.
+// The keywords searched by the user are highlighted with a `<b>` tag.
+extern crate tempdir;
+
+// ---
+// Importing tantivy...
+#[macro_use]
+extern crate tantivy;
+use tantivy::collector::TopCollector;
+use tantivy::query::QueryParser;
+use tantivy::schema::*;
+use tantivy::Index;
+use tantivy::SnippetGenerator;
+use tempdir::TempDir;
+
+fn main() -> tantivy::Result<()> {
+    // Let's create a temporary directory for the
+    // sake of this example
+    let index_path = TempDir::new("tantivy_example_dir")?;
+
+    // # Defining the schema
+    //
+    // Both fields are stored so that they can be retrieved
+    // from the docstore at search time.
+    let mut schema_builder = SchemaBuilder::default();
+    schema_builder.add_text_field("title", TEXT | STORED);
+    schema_builder.add_text_field("body", TEXT | STORED);
+    let schema = schema_builder.build();
+
+    // # Indexing documents
+    let index = Index::create_in_dir(&index_path, schema.clone())?;
+
+    let mut index_writer = index.writer(50_000_000)?;
+
+    let title = schema.get_field("title").unwrap();
+    let body = schema.get_field("body").unwrap();
+
+    // we'll only need one doc for this example.
+    index_writer.add_document(doc!(
+        title => "Of Mice and Men",
+        body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
+                 bank and runs deep and green. The water is warm too, for it has slipped twinkling \
+                 over the yellow sands in the sunlight before reaching the narrow pool. On one \
+                 side of the river the golden foothill slopes curve up to the strong and rocky \
+                 Gabilan Mountains, but on the valley side the water is lined with trees—willows \
+                 fresh and green with every spring, carrying in their lower leaf junctures the \
+                 debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
+                 limbs and branches that arch over the pool"
+    ));
+    // ...
+    index_writer.commit()?;
+
+    index.load_searchers()?;
+
+    let searcher = index.searcher();
+    let query_parser = QueryParser::for_index(&index, vec![title, body]);
+    let query = query_parser.parse_query("sycamore spring")?;
+
+    let mut top_collector = TopCollector::with_limit(10);
+    searcher.search(&*query, &mut top_collector)?;
+
+    let snippet_generator = SnippetGenerator::new(&*searcher, &*query, body)?;
+
+    let doc_addresses = top_collector.docs();
+    for doc_address in doc_addresses {
+        let doc = searcher.doc(&doc_address)?;
+        let snippet = snippet_generator.snippet_from_doc(&doc);
+        println!("title: {}", doc.get_first(title).unwrap().text().unwrap());
+        println!("snippet: {}", snippet.to_html());
+    }
+
+    Ok(())
+}
diff --git a/src/core/index.rs b/src/core/index.rs
index 2a818f222..da1744961 100644
--- a/src/core/index.rs
+++ b/src/core/index.rs
@@ -28,6 +28,9 @@ use num_cpus;
 use std::path::Path;
 use tokenizer::TokenizerManager;
 use IndexWriter;
+use schema::FieldType;
+use schema::Field;
+use tokenizer::BoxedTokenizer;
 
 fn load_metas(directory: &Directory) -> Result<IndexMeta> {
     let meta_data = directory.atomic_read(&META_FILEPATH)?;
@@ -112,6 +115,34 @@ impl Index {
         &self.tokenizers
     }
 
+    /// Helper to access the tokenizer associated to a specific field.
+    pub fn tokenizer_for_field(&self, field: Field) -> Result<Box<BoxedTokenizer>> {
+        let field_entry = self.schema.get_field_entry(field);
+        let field_type = field_entry.field_type();
+        let tokenizer_manager: &TokenizerManager = self.tokenizers();
+        let tokenizer_opt: Option<Box<BoxedTokenizer>> = match *field_type {
+            FieldType::Str(ref text_options) => text_options
+                .get_indexing_options()
+                .map(|text_indexing_options| text_indexing_options.tokenizer().to_string())
+                .and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name)),
+            _ => None,
+        };
+        match tokenizer_opt {
+            Some(tokenizer) => Ok(tokenizer),
+            None => Err(TantivyError::SchemaError(format!(
+                "{:?} is not a text field.",
+                field_entry.name()
+            ))),
+        }
+    }
+
     /// Opens a new directory from an index path.
     #[cfg(feature = "mmap")]
     pub fn open_in_dir<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
@@ -258,7 +289,7 @@ impl Index {
         let schema = self.schema();
         let num_searchers: usize = self.num_searchers.load(Ordering::Acquire);
         let searchers = (0..num_searchers)
-            .map(|_| Searcher::new(schema.clone(), segment_readers.clone()))
+            .map(|_| Searcher::new(schema.clone(), self.clone(), segment_readers.clone()))
             .collect();
         self.searcher_pool.publish_new_generation(searchers);
         Ok(())
@@ -296,3 +327,26 @@ impl Clone for Index {
         }
     }
 }
+
+
+#[cfg(test)]
+mod tests {
+    use Index;
+    use schema::{SchemaBuilder, TEXT, INT_INDEXED};
+
+    #[test]
+    fn test_indexer_for_field() {
+        let mut schema_builder = SchemaBuilder::default();
+        let num_likes_field = schema_builder.add_u64_field("num_likes", INT_INDEXED);
+        let body_field = schema_builder.add_text_field("body", TEXT);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        assert!(index.tokenizer_for_field(body_field).is_ok());
+        assert_eq!(
+            format!("{:?}", index.tokenizer_for_field(num_likes_field).err()),
+            "Some(SchemaError(\"\\\"num_likes\\\" is not a text field.\"))"
+        );
+    }
+
+
+}
\ No newline at end of file
diff --git a/src/core/searcher.rs b/src/core/searcher.rs
index 8f36b58ea..f17df042f 100644
--- a/src/core/searcher.rs
+++ b/src/core/searcher.rs
@@ -10,6 +10,7 @@ use std::sync::Arc;
 use termdict::TermMerger;
 use DocAddress;
 use Result;
+use Index;
 
 /// Holds a list of `SegmentReader`s ready for search.
 ///
@@ -18,17 +19,25 @@ use Result;
 ///
 pub struct Searcher {
     schema: Schema,
+    index: Index,
     segment_readers: Vec<SegmentReader>,
 }
 
 impl Searcher {
     /// Creates a new `Searcher`
-    pub(crate) fn new(schema: Schema, segment_readers: Vec<SegmentReader>) -> Searcher {
+    pub(crate) fn new(schema: Schema, index: Index, segment_readers: Vec<SegmentReader>) -> Searcher {
         Searcher {
             schema,
+            index,
             segment_readers,
         }
     }
+
+    /// Returns the `Index` associated with the `Searcher`
+    pub fn index(&self) -> &Index {
+        &self.index
+    }
+
     /// Fetches a document from tantivy's store given a `DocAddress`.
     ///
     /// The searcher uses the segment ordinal to route the
diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs
index e79551a4c..5d2e17c51 100644
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -770,23 +770,23 @@ mod tests {
         }
         {
             let doc = searcher.doc(&DocAddress(0, 0)).unwrap();
-            assert_eq!(doc.get_first(text_field).unwrap().text(), "af b");
+            assert_eq!(doc.get_first(text_field).unwrap().text(), Some("af b"));
         }
         {
             let doc = searcher.doc(&DocAddress(0, 1)).unwrap();
-            assert_eq!(doc.get_first(text_field).unwrap().text(), "a b c");
+            assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c"));
         }
         {
             let doc = searcher.doc(&DocAddress(0, 2)).unwrap();
-            assert_eq!(doc.get_first(text_field).unwrap().text(), "a b c d");
+            assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c d"));
         }
         {
             let doc = searcher.doc(&DocAddress(0, 3)).unwrap();
-            assert_eq!(doc.get_first(text_field).unwrap().text(), "af b");
+            assert_eq!(doc.get_first(text_field).unwrap().text(), Some("af b"));
         }
         {
             let doc = searcher.doc(&DocAddress(0, 4)).unwrap();
-            assert_eq!(doc.get_first(text_field).unwrap().text(), "a b c g");
+            assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c g"));
         }
         {
             let get_fast_vals = |terms: Vec<Term>| {
diff --git a/src/lib.rs b/src/lib.rs
old mode 100644
new mode 100755
index d6073eee1..8f4bc726e
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -154,6 +154,7 @@ extern crate stable_deref_trait;
 extern crate tempdir;
 extern crate tempfile;
 extern crate uuid;
+extern crate htmlescape;
 
 #[cfg(test)]
 #[macro_use]
@@ -210,6 +211,9 @@ pub mod schema;
 pub mod store;
 pub mod termdict;
 
+mod snippet;
+pub use self::snippet::SnippetGenerator;
+
 mod docset;
 pub use self::docset::{DocSet, SkipResult};
 
@@ -893,11 +897,11 @@ mod tests {
         assert_eq!(document.len(), 3);
         let values = document.get_all(text_field);
         assert_eq!(values.len(), 2);
-        assert_eq!(values[0].text(), "tantivy");
-        assert_eq!(values[1].text(), "some other value");
+        assert_eq!(values[0].text(), Some("tantivy"));
+        assert_eq!(values[1].text(), Some("some other value"));
         let values = document.get_all(other_text_field);
         assert_eq!(values.len(), 1);
-        assert_eq!(values[0].text(), "short");
+        assert_eq!(values[0].text(), Some("short"));
     }
 
     #[test]
diff --git a/src/query/boolean_query/boolean_query.rs b/src/query/boolean_query/boolean_query.rs
index 286d9f449..b92a203eb 100644
--- a/src/query/boolean_query/boolean_query.rs
+++ b/src/query/boolean_query/boolean_query.rs
@@ -6,6 +6,7 @@ use query::Weight;
 use schema::IndexRecordOption;
 use schema::Term;
 use Result;
+use std::collections::BTreeSet;
 use Searcher;
 
 /// The boolean query combines a set of queries
@@ -40,6 +41,7 @@ impl From<Vec<(Occur, Box<Query>)>> for BooleanQuery {
 }
 
 impl Query for BooleanQuery {
+
     fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> Result<Box<Weight>> {
         let sub_weights = self.subqueries
             .iter()
             .map(|&(ref occur, ref subquery)| {
                 Ok((*occur, subquery.weight(searcher, scoring_enabled)?))
             })
             .collect::<Result<_>>()?;
         Ok(Box::new(BooleanWeight::new(sub_weights, scoring_enabled)))
     }
+
+    fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
+        for (_occur, subquery) in &self.subqueries {
+            subquery.query_terms(term_set);
+        }
+    }
 }
 
 impl BooleanQuery {
diff --git a/src/query/mod.rs b/src/query/mod.rs
index 7546465fb..73a77174b 100644
--- a/src/query/mod.rs
+++ b/src/query/mod.rs
@@ -27,7 +27,6 @@ mod weight;
 mod vec_docset;
 pub(crate) mod score_combiner;
 
-
 pub use self::intersection::Intersection;
 pub use self::union::Union;
 
diff --git a/src/query/phrase_query/phrase_query.rs b/src/query/phrase_query/phrase_query.rs
index e501711ed..d103461c1 100644
--- a/src/query/phrase_query/phrase_query.rs
+++ b/src/query/phrase_query/phrase_query.rs
@@ -6,6 +6,7 @@ use query::Query;
 use query::Weight;
 use schema::{Field, Term};
 use Result;
+use std::collections::BTreeSet;
 
 /// `PhraseQuery` matches a specific sequence of words.
 ///
@@ -107,4 +108,10 @@ impl Query for PhraseQuery {
             )))
         }
     }
+
+    fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
+        for (_, query_term) in &self.phrase_terms {
+            term_set.insert(query_term.clone());
+        }
+    }
 }
diff --git a/src/query/phrase_query/phrase_weight.rs b/src/query/phrase_query/phrase_weight.rs
index de8eeb0d2..69ab4e184 100644
--- a/src/query/phrase_query/phrase_weight.rs
+++ b/src/query/phrase_query/phrase_weight.rs
@@ -30,6 +30,7 @@ impl PhraseWeight {
 }
 
 impl Weight for PhraseWeight {
+
     fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
         let similarity_weight = self.similarity_weight.clone();
         let field = self.phrase_terms[0].1.field();
diff --git a/src/query/query.rs b/src/query/query.rs
index 51e068b92..6abbf35e0 100644
--- a/src/query/query.rs
+++ b/src/query/query.rs
@@ -5,6 +5,8 @@ use downcast;
 use std::fmt;
 use Result;
 use SegmentLocalId;
+use std::collections::BTreeSet;
+use Term;
 
 /// The `Query` trait defines a set of documents and a scoring method
 /// for those documents.
@@ -58,6 +60,10 @@ pub trait Query: QueryClone + downcast::Any + fmt::Debug {
         Ok(result)
     }
 
+    /// Extracts all of the terms associated with the query and inserts them
+    /// into the given term set.
+    fn query_terms(&self, _term_set: &mut BTreeSet<Term>) {}
+
     /// Search works as follows :
     ///
     /// First the weight object associated to the query is created.
diff --git a/src/query/range_query.rs b/src/query/range_query.rs
index 23efe1995..06d98db66 100644
--- a/src/query/range_query.rs
+++ b/src/query/range_query.rs
@@ -274,6 +274,7 @@ impl RangeWeight {
 }
 
 impl Weight for RangeWeight {
+
     fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
         let max_doc = reader.max_doc();
         let mut doc_bitset = BitSet::with_max_value(max_doc);
diff --git a/src/query/term_query/term_query.rs b/src/query/term_query/term_query.rs
index 9ba10b307..d6cd72288 100644
--- a/src/query/term_query/term_query.rs
+++ b/src/query/term_query/term_query.rs
@@ -6,6 +6,7 @@ use schema::IndexRecordOption;
 use Result;
 use Searcher;
 use Term;
+use std::collections::BTreeSet;
 
 /// A Term query matches all of the documents
 /// containing a specific term.
@@ -110,4 +111,7 @@ impl Query for TermQuery {
     fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> Result<Box<Weight>> {
         Ok(Box::new(self.specialized_weight(searcher, scoring_enabled)))
     }
+    fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
+        term_set.insert(self.term.clone());
+    }
 }
diff --git a/src/schema/schema.rs b/src/schema/schema.rs
index 6d4f6c949..d000ab9e2 100644
--- a/src/schema/schema.rs
+++ b/src/schema/schema.rs
@@ -443,8 +443,8 @@ mod tests {
         }"#,
         )
         .unwrap();
-        assert_eq!(doc.get_first(title_field).unwrap().text(), "my title");
-        assert_eq!(doc.get_first(author_field).unwrap().text(), "fulmicoton");
+        assert_eq!(doc.get_first(title_field).unwrap().text(), Some("my title"));
+        assert_eq!(doc.get_first(author_field).unwrap().text(), Some("fulmicoton"));
         assert_eq!(doc.get_first(count_field).unwrap().u64_value(), 4);
         assert_eq!(doc.get_first(popularity_field).unwrap().i64_value(), 10);
     }
diff --git a/src/schema/value.rs b/src/schema/value.rs
index f5ce151f1..64b0dc795 100644
--- a/src/schema/value.rs
+++ b/src/schema/value.rs
@@ -74,10 +74,9 @@ impl Value {
     ///
-    /// # Panics
-    /// If the value is not of type `Str`
-    pub fn text(&self) -> &str {
+    /// Returns `None` if the value is not of type `Str`.
+    pub fn text(&self) -> Option<&str> {
         match *self {
-            Value::Str(ref text) => text,
-            _ => panic!("This is not a text field."),
+            Value::Str(ref text) => Some(text),
+            _ => None,
         }
     }
diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs
new file mode 100644
index 000000000..a3d2c48e3
--- /dev/null
+++ b/src/snippet/mod.rs
@@ -0,0 +1,479 @@
+use htmlescape::encode_minimal;
+use std::collections::BTreeMap;
+use tokenizer::{Token, TokenStream};
+use Result;
+use query::Query;
+use Searcher;
+use schema::Field;
+use std::collections::BTreeSet;
+use tokenizer::BoxedTokenizer;
+use Document;
+use std::cmp::Ordering;
+
+const DEFAULT_MAX_NUM_CHARS: usize = 150;
+
+#[derive(Debug)]
+pub struct HighlightSection {
+    start: usize,
+    stop: usize,
+}
+
+impl HighlightSection {
+    fn new(start: usize, stop: usize) -> HighlightSection {
+        HighlightSection { start, stop }
+    }
+}
+
+#[derive(Debug)]
+pub struct FragmentCandidate {
+    score: f32,
+    start_offset: usize,
+    stop_offset: usize,
+    num_chars: usize,
+    highlighted: Vec<HighlightSection>,
+}
+
+impl FragmentCandidate {
+    /// Creates a basic `FragmentCandidate`.
+    ///
+    /// `score` and `num_chars` are set to 0,
+    /// `highlighted` is set to an empty vec, and
+    /// `stop_offset` is set to `start_offset`, which is taken as a param.
+    fn new(start_offset: usize) -> FragmentCandidate {
+        FragmentCandidate {
+            score: 0.0,
+            start_offset,
+            stop_offset: start_offset,
+            num_chars: 0,
+            highlighted: vec![],
+        }
+    }
+
+    /// Updates the `score` and `highlighted` fields of the object.
+    ///
+    /// Taking the token and terms, the token is added to the fragment.
+    /// If the token is one of the terms, the score
+    /// and highlighted fields of the fragment are updated.
+    fn try_add_token(&mut self, token: &Token, terms: &BTreeMap<String, f32>) {
+        self.stop_offset = token.offset_to;
+
+        if let Some(score) = terms.get(&token.text.to_lowercase()) {
+            self.score += score;
+            self.highlighted
+                .push(HighlightSection::new(token.offset_from, token.offset_to));
+        }
+    }
+}
+
+#[derive(Debug)]
+pub struct Snippet {
+    fragments: String,
+    highlighted: Vec<HighlightSection>,
+}
+
+const HIGHLIGHTEN_PREFIX: &str = "<b>";
+const HIGHLIGHTEN_POSTFIX: &str = "</b>";
+
+impl Snippet {
+
+    /// Creates an empty `Snippet`.
+    pub fn empty() -> Snippet {
+        Snippet {
+            fragments: String::new(),
+            highlighted: Vec::new()
+        }
+    }
+
+    /// Returns a highlighted HTML string from the `Snippet`.
+    pub fn to_html(&self) -> String {
+        let mut html = String::new();
+        let mut start_from: usize = 0;
+
+        for item in self.highlighted.iter() {
+            html.push_str(&encode_minimal(&self.fragments[start_from..item.start]));
+            html.push_str(HIGHLIGHTEN_PREFIX);
+            html.push_str(&encode_minimal(&self.fragments[item.start..item.stop]));
+            html.push_str(HIGHLIGHTEN_POSTFIX);
+            start_from = item.stop;
+        }
+        html.push_str(&encode_minimal(
+            &self.fragments[start_from..self.fragments.len()],
+        ));
+        html
+    }
+}
+
+/// Returns a non-empty list of "good" fragments.
+///
+/// If no target term is within the text, then the function
+/// should return an empty Vec.
+///
+/// If a target term is within the text, then the returned
+/// list is required to be non-empty.
+///
+/// The returned list is non-empty and contains fewer
+/// than 12 possibly overlapping fragments.
+///
+/// All fragments should contain at least one target term
+/// and have at most `max_num_chars` characters (not bytes).
+///
+/// It is ok to emit overlapping fragments, for instance,
+/// one short and one long containing the same keyword, in order
+/// to leave optimization opportunity to the fragment selector
+/// upstream.
+///
+/// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`
+/// has to be a valid string.
+fn search_fragments<'a>(
+    tokenizer: &BoxedTokenizer,
+    text: &'a str,
+    terms: &BTreeMap<String, f32>,
+    max_num_chars: usize,
+) -> Vec<FragmentCandidate> {
+    let mut token_stream = tokenizer.token_stream(text);
+    let mut fragment = FragmentCandidate::new(0);
+    let mut fragments: Vec<FragmentCandidate> = vec![];
+
+    while let Some(next) = token_stream.next() {
+        if (next.offset_to - fragment.start_offset) > max_num_chars {
+            if fragment.score > 0.0 {
+                fragments.push(fragment)
+            };
+            fragment = FragmentCandidate::new(next.offset_from);
+        }
+        fragment.try_add_token(next, &terms);
+    }
+    if fragment.score > 0.0 {
+        fragments.push(fragment)
+    }
+
+    fragments
+}
+
+/// Returns a Snippet.
+///
+/// Takes a vector of `FragmentCandidate`s and the text,
+/// figures out the best fragment, and creates a snippet from it.
+fn select_best_fragment_combination<'a>(
+    fragments: Vec<FragmentCandidate>,
+    text: &'a str,
+) -> Snippet {
+    let best_fragment_opt = fragments
+        .iter()
+        .max_by(|left, right| {
+            let cmp_score = left.score.partial_cmp(&right.score).unwrap_or(Ordering::Equal);
+            if cmp_score == Ordering::Equal {
+                (right.start_offset, right.stop_offset).cmp(&(left.start_offset, left.stop_offset))
+            } else {
+                cmp_score
+            }
+        });
+    if let Some(fragment) = best_fragment_opt {
+        let fragment_text = &text[fragment.start_offset..fragment.stop_offset];
+        let highlighted = fragment
+            .highlighted
+            .iter()
+            .map(|item| {
+                HighlightSection::new(
+                    item.start - fragment.start_offset,
+                    item.stop - fragment.start_offset,
+                )
+            }).collect();
+        Snippet {
+            fragments: fragment_text.to_string(),
+            highlighted,
+        }
+    } else {
+        // when there are no fragments to choose from,
+        // create an empty snippet for now
+        Snippet::empty()
+    }
+}
+
+/// `SnippetGenerator`
+///
+/// # Example
+///
+/// ```rust
+/// # #[macro_use]
+/// # extern crate tantivy;
+/// # use tantivy::Index;
+/// # use tantivy::schema::{SchemaBuilder, TEXT};
+/// # use tantivy::query::QueryParser;
+/// use tantivy::SnippetGenerator;
+///
+/// # fn main() -> tantivy::Result<()> {
+/// # let mut schema_builder = SchemaBuilder::default();
+/// # let text_field = schema_builder.add_text_field("text", TEXT);
+/// # let schema = schema_builder.build();
+/// # let index = Index::create_in_ram(schema);
+/// # let mut index_writer = index.writer_with_num_threads(1, 30_000_000)?;
+/// # let doc = doc!(text_field => r#"Comme je descendais des Fleuves impassibles,
+/// # Je ne me sentis plus guidé par les haleurs :
+/// # Des Peaux-Rouges criards les avaient pris pour cibles,
+/// # Les ayant cloués nus aux poteaux de couleurs.
+/// #
+/// # J'étais insoucieux de tous les équipages,
+/// # Porteur de blés flamands ou de cotons anglais.
+/// # Quand avec mes haleurs ont fini ces tapages,
+/// # Les Fleuves m'ont laissé descendre où je voulais.
+/// # "#);
+/// # index_writer.add_document(doc.clone());
+/// # index_writer.commit()?;
+/// # let query_parser = QueryParser::for_index(&index, vec![text_field]);
+/// // ...
+/// let query = query_parser.parse_query("haleurs flamands").unwrap();
+/// # index.load_searchers()?;
+/// # let searcher = index.searcher();
+/// let mut snippet_generator = SnippetGenerator::new(&*searcher, &*query, text_field)?;
+/// snippet_generator.set_max_num_chars(100);
+/// let snippet = snippet_generator.snippet_from_doc(&doc);
+/// let snippet_html: String = snippet.to_html();
+/// assert_eq!(snippet_html, "Comme je descendais des Fleuves impassibles,\n Je ne me sentis plus guidé par les <b>haleurs</b> :\n Des");
+/// # Ok(())
+/// # }
+/// ```
+pub struct SnippetGenerator {
+    terms_text: BTreeMap<String, f32>,
+    tokenizer: Box<BoxedTokenizer>,
+    field: Field,
+    max_num_chars: usize,
+}
+
+impl SnippetGenerator {
+    /// Creates a new snippet generator
+    pub fn new(searcher: &Searcher, query: &Query, field: Field) -> Result<SnippetGenerator> {
+        let mut terms = BTreeSet::new();
+        query.query_terms(&mut terms);
+        let terms_text: BTreeMap<String, f32> = terms
+            .into_iter()
+            .filter(|term| term.field() == field)
+            .map(|term| (term.text().to_string(), 1f32))
+            .collect();
+        let tokenizer = searcher.index().tokenizer_for_field(field)?;
+        Ok(SnippetGenerator {
+            terms_text,
+            tokenizer,
+            field,
+            max_num_chars: DEFAULT_MAX_NUM_CHARS,
+        })
+    }
+
+    /// Sets the maximum number of chars in the snippet.
+    pub fn set_max_num_chars(&mut self, max_num_chars: usize) {
+        self.max_num_chars = max_num_chars;
+    }
+
+    /// Generates a snippet for the given `Document`.
+    ///
+    /// This method extracts the text associated with the `SnippetGenerator`'s field
+    /// and computes a snippet.
+    pub fn snippet_from_doc(&self, doc: &Document) -> Snippet {
+        let text: String = doc.get_all(self.field)
+            .into_iter()
+            .flat_map(|val| val.text())
+            .collect::<Vec<&str>>()
+            .join(" ");
+        self.snippet(&text)
+    }
+
+    /// Generates a snippet for the given text.
+    pub fn snippet(&self, text: &str) -> Snippet {
+        let fragment_candidates =
+            search_fragments(&*self.tokenizer, &text, &self.terms_text, self.max_num_chars);
+        select_best_fragment_combination(fragment_candidates, &text)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{search_fragments, select_best_fragment_combination};
+    use std::collections::BTreeMap;
+    use std::iter::Iterator;
+    use tokenizer::{box_tokenizer, SimpleTokenizer};
+    use Index;
+    use schema::{SchemaBuilder, IndexRecordOption, TextOptions, TextFieldIndexing};
+    use SnippetGenerator;
+    use query::QueryParser;
+
+
+    const TEST_TEXT: &'static str = r#"Rust is a systems programming language sponsored by Mozilla which
+describes it as a "safe, concurrent, practical language", supporting functional and
+imperative-procedural paradigms. Rust is syntactically similar to C++[according to whom?],
+but its designers intend it to provide better memory safety while still maintaining
+performance.
+
+Rust is free and open-source software, released under an MIT License, or Apache License
+2.0. Its designers have refined the language through the experiences of writing the Servo
+web browser layout engine[14] and the Rust compiler. A large proportion of current commits
+to the project are from community members.[15]
+
+Rust won first place for "most loved programming language" in the Stack Overflow Developer
+Survey in 2016, 2017, and 2018."#;
+
+    #[test]
+    fn test_snippet() {
+        let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
+        let mut terms = BTreeMap::new();
+        terms.insert(String::from("rust"), 1.0);
+        terms.insert(String::from("language"), 0.9);
+        let fragments = search_fragments(&*boxed_tokenizer, TEST_TEXT, &terms, 100);
+        assert_eq!(fragments.len(), 7);
+        {
+            let first = fragments.iter().nth(0).unwrap();
+            assert_eq!(first.score, 1.9);
+            assert_eq!(first.stop_offset, 89);
+        }
+        let snippet = select_best_fragment_combination(fragments, &TEST_TEXT);
+        assert_eq!(snippet.fragments, "Rust is a systems programming language sponsored by Mozilla which\ndescribes it as a \"safe".to_owned());
+        assert_eq!(snippet.to_html(), "<b>Rust</b> is a systems programming <b>language</b> sponsored by Mozilla which\ndescribes it as a &quot;safe".to_owned())
+    }
+
+    #[test]
+    fn test_snippet_in_second_fragment() {
+        let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
+
+        let text = "a b c d e f g";
+
+        let mut terms = BTreeMap::new();
+        terms.insert(String::from("c"), 1.0);
+
+        let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);
+
+        assert_eq!(fragments.len(), 1);
+        {
+            let first = fragments.iter().nth(0).unwrap();
+            assert_eq!(first.score, 1.0);
+            assert_eq!(first.start_offset, 4);
+            assert_eq!(first.stop_offset, 7);
+        }
+
+        let snippet = select_best_fragment_combination(fragments, &text);
+        assert_eq!(snippet.fragments, "c d");
+        assert_eq!(snippet.to_html(), "<b>c</b> d");
+    }
+
+    #[test]
+    fn test_snippet_with_term_at_the_end_of_fragment() {
+        let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
+
+        let text = "a b c d e f f g";
+
+        let mut terms = BTreeMap::new();
+        terms.insert(String::from("f"), 1.0);
+
+        let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);
+
+        assert_eq!(fragments.len(), 2);
+        {
+            let first = fragments.iter().nth(0).unwrap();
+            assert_eq!(first.score, 1.0);
+            assert_eq!(first.stop_offset, 11);
+            assert_eq!(first.start_offset, 8);
+        }
+
+        let snippet = select_best_fragment_combination(fragments, &text);
+        assert_eq!(snippet.fragments, "e f");
+        assert_eq!(snippet.to_html(), "e <b>f</b>");
+    }
+
+    #[test]
+    fn test_snippet_with_second_fragment_has_the_highest_score() {
+        let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
+
+        let text = "a b c d e f g";
+
+        let mut terms = BTreeMap::new();
+        terms.insert(String::from("f"), 1.0);
+        terms.insert(String::from("a"), 0.9);
+
+        let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 7);
+
+        assert_eq!(fragments.len(), 2);
+        {
+            let first = fragments.iter().nth(0).unwrap();
+            assert_eq!(first.score, 0.9);
+            assert_eq!(first.stop_offset, 7);
+            assert_eq!(first.start_offset, 0);
+        }
+
+        let snippet = select_best_fragment_combination(fragments, &text);
+        assert_eq!(snippet.fragments, "e f g");
+        assert_eq!(snippet.to_html(), "e <b>f</b> g");
+    }
+
+    #[test]
+    fn test_snippet_with_term_not_in_text() {
+        let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
+
+        let text = "a b c d";
+
+        let mut terms = BTreeMap::new();
+        terms.insert(String::from("z"), 1.0);
+
+        let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);
+
+        assert_eq!(fragments.len(), 0);
+
+        let snippet = select_best_fragment_combination(fragments, &text);
+        assert_eq!(snippet.fragments, "");
+        assert_eq!(snippet.to_html(), "");
+    }
+
+    #[test]
+    fn test_snippet_with_no_terms() {
+        let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
+
+        let text = "a b c d";
+
+        let terms = BTreeMap::new();
+        let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);
+        assert_eq!(fragments.len(), 0);
+
+        let snippet = select_best_fragment_combination(fragments, &text);
+        assert_eq!(snippet.fragments, "");
+        assert_eq!(snippet.to_html(), "");
+    }
+
+    #[test]
+    fn test_snippet_generator() {
+        let mut schema_builder = SchemaBuilder::default();
+        let text_options = TextOptions::default()
+            .set_indexing_options(TextFieldIndexing::default()
+                .set_tokenizer("en_stem")
+                .set_index_option(IndexRecordOption::Basic)
+            );
+        let text_field = schema_builder.add_text_field("text", text_options);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        {
+            // writing the segment
+            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
+            {
+                let doc = doc!(text_field => TEST_TEXT);
+                index_writer.add_document(doc);
+            }
+            index_writer.commit().unwrap();
+        }
+        index.load_searchers().unwrap();
+        let searcher = index.searcher();
+        let query_parser = QueryParser::for_index(&index, vec![text_field]);
+        let query = query_parser.parse_query("rust design").unwrap();
+        let mut snippet_generator = SnippetGenerator::new(&*searcher, &*query, text_field).unwrap();
+        {
+            let snippet = snippet_generator.snippet(TEST_TEXT);
+            assert_eq!(snippet.to_html(), "imperative-procedural paradigms. <b>Rust</b> is syntactically similar to C++[according to whom?],\nbut its <b>designers</b> intend it to provide better memory safety");
+        }
+        {
+            snippet_generator.set_max_num_chars(90);
+            let snippet = snippet_generator.snippet(TEST_TEXT);
+            assert_eq!(snippet.to_html(), "<b>Rust</b> is syntactically similar to C++[according to whom?],\nbut its <b>designers</b> intend it to");
+        }
+    }
+}
diff --git a/src/store/mod.rs b/src/store/mod.rs
index 5d71563e1..7bce9085d 100644
--- a/src/store/mod.rs
+++ b/src/store/mod.rs
@@ -109,7 +109,7 @@ pub mod tests {
         let store = StoreReader::from_source(store_source);
         for i in 0..1_000 {
             assert_eq!(
-                *store.get(i).unwrap().get_first(field_title).unwrap().text(),
+                *store.get(i).unwrap().get_first(field_title).unwrap().text().unwrap(),
                 format!("Doc {}", i)
             );
         }
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index 0b1c68339..9d94441ab 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -152,6 +152,8 @@ pub use self::stemmer::Stemmer;
 pub use self::stop_word_filter::StopWordFilter;
 pub(crate) use self::token_stream_chain::TokenStreamChain;
 pub use self::tokenizer::BoxedTokenizer;
+pub(crate) use self::tokenizer::box_tokenizer;
+
 pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
 pub use self::tokenizer_manager::TokenizerManager;
 
diff --git a/src/tokenizer/tokenizer_manager.rs b/src/tokenizer/tokenizer_manager.rs
index 981962a66..001469f35 100644
--- a/src/tokenizer/tokenizer_manager.rs
+++ b/src/tokenizer/tokenizer_manager.rs
@@ -1,6 +1,6 @@
 use std::collections::HashMap;
 use std::sync::{Arc, RwLock};
-use tokenizer::tokenizer::box_tokenizer;
+use tokenizer::box_tokenizer;
 use tokenizer::BoxedTokenizer;
 use tokenizer::LowerCaser;
 use tokenizer::RawTokenizer;
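
The new `Query::query_terms` hook that `SnippetGenerator::new` relies on is also usable on its own. Below is a minimal sketch of calling it directly to inspect which terms a parsed query touches. The `inspect_query_terms` helper is hypothetical, for illustration only, and assumes an `Index` and default field built as in `examples/snippet.rs` above.

```rust
extern crate tantivy;

use std::collections::BTreeSet;

use tantivy::query::QueryParser;
use tantivy::schema::Field;
use tantivy::{Index, Term};

// Hypothetical helper: collect and print the terms a parsed query would
// match on. `SnippetGenerator::new` does essentially this, then keeps only
// the terms belonging to the snippet field.
fn inspect_query_terms(index: &Index, field: Field, query_str: &str) -> tantivy::Result<()> {
    let query_parser = QueryParser::for_index(index, vec![field]);
    let query = query_parser.parse_query(query_str)?;

    // `query_terms` has a default no-op implementation on the `Query` trait;
    // `TermQuery`, `PhraseQuery` and `BooleanQuery` override it in this patch.
    let mut terms: BTreeSet<Term> = BTreeSet::new();
    query.query_terms(&mut terms);

    for term in &terms {
        println!("{:?}", term);
    }
    Ok(())
}
```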