From a12d211330657931de2c972030504762cdbb8432 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 30 Aug 2018 09:23:34 +0900 Subject: [PATCH] Extracting terms matching query in the document --- examples/snippet.rs | 75 +++++++++++++++++++++++++++++ src/common/mod.rs | 21 +++++++- src/query/automaton_weight.rs | 47 ++++++++++++++++++ src/query/term_query/term_weight.rs | 22 +++++++++ src/query/weight.rs | 35 ++++++++++++++ src/snippet/mod.rs | 6 +-- 6 files changed, 201 insertions(+), 5 deletions(-) create mode 100644 examples/snippet.rs diff --git a/examples/snippet.rs b/examples/snippet.rs new file mode 100644 index 000000000..35e9e76bd --- /dev/null +++ b/examples/snippet.rs @@ -0,0 +1,75 @@ +// # Snippet example +// +// This example shows how to return a representative snippet of +// your hit result. +// Snippets are extracts of a target document, returned in HTML format. +// The keywords searched by the user are highlighted with a `<b>` tag. +extern crate tempdir; + +// --- +// Importing tantivy... +#[macro_use] +extern crate tantivy; +use tantivy::collector::TopCollector; +use tantivy::query::QueryParser; +use tantivy::schema::*; +use tantivy::Index; + +fn main() -> tantivy::Result<()> { + // Let's create a temporary directory for the + // sake of this example + let index_path = TempDir::new("tantivy_example_dir")?; + + // # Defining the schema + let mut schema_builder = SchemaBuilder::default(); + schema_builder.add_text_field("body", TEXT); + let schema = schema_builder.build(); + + // # Indexing documents + let index = Index::create_in_dir(&index_path, schema.clone())?; + + let mut index_writer = index.writer(50_000_000)?; + + let title = schema.get_field("title").unwrap(); + let body = schema.get_field("body").unwrap(); + + let mut old_man_doc = Document::default(); + // we'll only need one doc for this example. 
+ index_writer.add_document(doc!( + title => "Of Mice and Men", + body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \ + bank and runs deep and green. The water is warm too, for it has slipped twinkling \ + over the yellow sands in the sunlight before reaching the narrow pool. On one \ + side of the river the golden foothill slopes curve up to the strong and rocky \ + Gabilan Mountains, but on the valley side the water is lined with trees—willows \ + fresh and green with every spring, carrying in their lower leaf junctures the \ + debris of the winter’s flooding; and sycamores with mottled, white, recumbent \ + limbs and branches that arch over the pool" + )); + // ... + index_writer.commit()?; + + index.load_searchers()?; + + let searcher = index.searcher(); + let query_parser = QueryParser::for_index(&index, vec![title, body]); + + let query = query_parser.parse_query("sycamore spring")?; + + let mut top_collector = TopCollector::with_limit(10); + + searcher.search(&*query, &mut top_collector)?; + + let doc_addresses = top_collector.docs(); + + for doc_address in doc_addresses { + let retrieved_doc = searcher.doc(&doc_address)?; + generate_snippet(&retrieved_doc, query + } + + + Ok(()) +} + + +use tempdir::TempDir; diff --git a/src/common/mod.rs b/src/common/mod.rs index 2942438b4..778f0476a 100644 --- a/src/common/mod.rs +++ b/src/common/mod.rs @@ -68,6 +68,17 @@ pub trait HasLen { } } + +pub fn is_stricly_sorted(els: &[T]) -> bool { + if els.is_empty() { + true + } else { + els.iter() + .zip(els[1..].iter()) + .all(|(left, right)| left < right) + } +} + const HIGHEST_BIT: u64 = 1 << 63; /// Maps a `i64` to `u64` @@ -105,12 +116,20 @@ pub fn u64_to_i64(val: u64) -> i64 { pub(crate) mod test { pub use super::serialize::test::fixed_size_test; - use super::{compute_num_bits, i64_to_u64, u64_to_i64}; + use super::{compute_num_bits, i64_to_u64, u64_to_i64, is_stricly_sorted}; fn test_i64_converter_helper(val: i64) { 
assert_eq!(u64_to_i64(i64_to_u64(val)), val); } + + #[test] + fn test_is_strictly_sorted() { + assert!(is_stricly_sorted::(&[])); + assert!(is_stricly_sorted(&[1])); + assert!(is_stricly_sorted(&[1, 2, 3])); + assert!(!is_stricly_sorted(&[1, 3, 2])); + } #[test] fn test_i64_converter() { assert_eq!(i64_to_u64(i64::min_value()), u64::min_value()); diff --git a/src/query/automaton_weight.rs b/src/query/automaton_weight.rs index b38e6592d..d1040eb85 100644 --- a/src/query/automaton_weight.rs +++ b/src/query/automaton_weight.rs @@ -7,6 +7,11 @@ use query::{Scorer, Weight}; use schema::{Field, IndexRecordOption}; use termdict::{TermDictionary, TermStreamer}; use Result; +use query::weight::MatchingTerms; +use SkipResult; +use Term; +use DocId; +use DocSet; /// A weight struct for Fuzzy Term and Regex Queries pub struct AutomatonWeight @@ -36,6 +41,48 @@ impl Weight for AutomatonWeight where A: Automaton, { + + fn matching_terms(&self, + reader: &SegmentReader, + matching_terms: &mut MatchingTerms) -> Result<()> { + let max_doc = reader.max_doc(); + let mut doc_bitset = BitSet::with_max_value(max_doc); + + let inverted_index = reader.inverted_index(self.field); + let term_dict = inverted_index.terms(); + let mut term_stream = self.automaton_stream(term_dict); + + let doc_ids = matching_terms.sorted_doc_ids(); + let mut docs_matching_current_term: Vec = vec![]; + + let mut term_buffer: Vec = vec![]; + + while term_stream.advance() { + docs_matching_current_term.clear(); + let term_info = term_stream.value(); + let mut segment_postings = inverted_index.read_postings_from_terminfo(term_info, IndexRecordOption::Basic); + for &doc_id in &doc_ids { + match segment_postings.skip_next(doc_id) { + SkipResult::Reached => { + docs_matching_current_term.push(doc_id); + } + SkipResult::OverStep => {} + SkipResult::End => {} + } + } + if !docs_matching_current_term.is_empty() { + term_buffer.clear(); + let term_ord = term_stream.term_ord(); + 
inverted_index.terms().ord_to_term(term_ord, &mut term_buffer); + let term = Term::from_field_bytes(self.field, &term_buffer[..]); + for &doc_id in &docs_matching_current_term { + matching_terms.add_term(doc_id, term.clone()); + } + } + } + Ok(()) + } + fn scorer(&self, reader: &SegmentReader) -> Result> { let max_doc = reader.max_doc(); let mut doc_bitset = BitSet::with_max_value(max_doc); diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index ba45a8042..1a9075b5a 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -8,6 +8,8 @@ use query::Weight; use schema::IndexRecordOption; use Result; use Term; +use SkipResult; +use query::weight::MatchingTerms; pub struct TermWeight { term: Term, @@ -38,6 +40,26 @@ impl Weight for TermWeight { } } + + fn matching_terms(&self, + reader: &SegmentReader, + matching_terms: &mut MatchingTerms) -> Result<()> { + let doc_ids = matching_terms.sorted_doc_ids(); + let mut scorer = self.scorer(reader)?; + for doc_id in doc_ids { + match scorer.skip_next(doc_id) { + SkipResult::Reached => { + matching_terms.add_term(doc_id, self.term.clone()); + } + SkipResult::OverStep => {} + SkipResult::End => { + break; + } + } + } + Ok(()) + } + fn count(&self, reader: &SegmentReader) -> Result { if reader.num_deleted_docs() == 0 { let field = self.term.field(); diff --git a/src/query/weight.rs b/src/query/weight.rs index d3d8b3520..51289c573 100644 --- a/src/query/weight.rs +++ b/src/query/weight.rs @@ -1,6 +1,37 @@ use super::Scorer; use core::SegmentReader; use Result; +use DocId; +use std::collections::HashSet; +use Term; +use std::collections::BTreeMap; + + +pub struct MatchingTerms { + doc_to_terms: BTreeMap> +} + +impl MatchingTerms { + pub fn from_doc_ids(doc_ids: &[DocId]) -> MatchingTerms { + MatchingTerms { + doc_to_terms: doc_ids + .iter() + .cloned() + .map(|doc_id| (doc_id, HashSet::default())) + .collect() + } + } + + pub fn sorted_doc_ids(&self) -> Vec { 
+ self.doc_to_terms.keys().cloned().collect() + } + + pub fn add_term(&mut self, doc_id: DocId, term: Term) { + if let Some(terms) = self.doc_to_terms.get_mut(&doc_id) { + terms.insert(term); + } + } +} /// A Weight is the specialization of a Query /// for a given set of segments. @@ -11,6 +42,10 @@ pub trait Weight { /// See [`Query`](./trait.Query.html). fn scorer(&self, reader: &SegmentReader) -> Result>; + fn matching_terms(&self, reader: &SegmentReader, matching_terms: &mut MatchingTerms) -> Result<()> { + Ok(()) + } + /// Returns the number documents within the given `SegmentReader`. fn count(&self, reader: &SegmentReader) -> Result { Ok(self.scorer(reader)?.count()) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 8142c54a0..97c557e98 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -1,10 +1,8 @@ -use htmlescape::encode_minimal; + use htmlescape::encode_minimal; use schema::FieldValue; -use schema::Value; use std::collections::BTreeMap; use tokenizer::BoxedTokenizer; -use tokenizer::{Token, TokenStream, Tokenizer}; -use Document; +use tokenizer::{Token, TokenStream}; use Index; use Term;