From a12d211330657931de2c972030504762cdbb8432 Mon Sep 17 00:00:00 2001
From: Paul Masurel <paul.masurel@gmail.com>
Date: Thu, 30 Aug 2018 09:23:34 +0900
Subject: [PATCH] Extracting terms matching query in the document

---
 examples/snippet.rs                 | 75 +++++++++++++++++++++++++++++
 src/common/mod.rs                   | 21 +++++++-
 src/query/automaton_weight.rs       | 47 ++++++++++++++++++
 src/query/term_query/term_weight.rs | 22 +++++++++
 src/query/weight.rs                 | 35 ++++++++++++++
 src/snippet/mod.rs                  |  6 +--
 6 files changed, 201 insertions(+), 5 deletions(-)
 create mode 100644 examples/snippet.rs
diff --git a/examples/snippet.rs b/examples/snippet.rs
new file mode 100644
index 000000000..35e9e76bd
--- /dev/null
+++ b/examples/snippet.rs
@@ -0,0 +1,75 @@
+// # Snippet example
+//
+// This example shows how to return a representative snippet of
+// your hit result.
+// A snippet is an extract of a target document, returned in HTML format.
+// The keywords searched by the user are highlighted with a `<b>` tag.
+extern crate tempdir;
+
+// ---
+// Importing tantivy...
+#[macro_use]
+extern crate tantivy;
+use tantivy::collector::TopCollector;
+use tantivy::query::QueryParser;
+use tantivy::schema::*;
+use tantivy::Index;
+
+fn main() -> tantivy::Result<()> {
+    // Let's create a temporary directory for the
+    // sake of this example
+    let index_path = TempDir::new("tantivy_example_dir")?;
+
+    // # Defining the schema
+    let mut schema_builder = SchemaBuilder::default();
+    // Both fields are STORED so that the retrieved document carries their text.
+    schema_builder.add_text_field("title", TEXT | STORED);
+    schema_builder.add_text_field("body", TEXT | STORED);
+    let schema = schema_builder.build();
+
+    // # Indexing documents
+    let index = Index::create_in_dir(&index_path, schema.clone())?;
+    let mut index_writer = index.writer(50_000_000)?;
+
+    let title = schema.get_field("title").unwrap();
+    let body = schema.get_field("body").unwrap();
+
+    // we'll only need one doc for this example.
+    index_writer.add_document(doc!(
+        title => "Of Mice and Men",
+        body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
+                bank and runs deep and green. The water is warm too, for it has slipped twinkling \
+                over the yellow sands in the sunlight before reaching the narrow pool. On one \
+                side of the river the golden foothill slopes curve up to the strong and rocky \
+                Gabilan Mountains, but on the valley side the water is lined with trees—willows \
+                fresh and green with every spring, carrying in their lower leaf junctures the \
+                debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
+                limbs and branches that arch over the pool"
+    ));
+    // ...
+    index_writer.commit()?;
+
+    index.load_searchers()?;
+
+    let searcher = index.searcher();
+    let query_parser = QueryParser::for_index(&index, vec![title, body]);
+
+    let query = query_parser.parse_query("sycamore spring")?;
+
+    let mut top_collector = TopCollector::with_limit(10);
+
+    searcher.search(&*query, &mut top_collector)?;
+
+    let doc_addresses = top_collector.docs();
+
+    for doc_address in doc_addresses {
+        let retrieved_doc = searcher.doc(&doc_address)?;
+        // NOTE(review): call was left unterminated; confirm `generate_snippet`'s signature and import.
+        generate_snippet(&retrieved_doc, &*query);
+    }
+
+    Ok(())
+}
+
+
+use tempdir::TempDir;
diff --git a/src/common/mod.rs b/src/common/mod.rs
index 2942438b4..778f0476a 100644
--- a/src/common/mod.rs
+++ b/src/common/mod.rs
@@ -68,6 +68,17 @@ pub trait HasLen {
     }
 }
 
+/// Returns `true` if each element of `els` is strictly smaller than the one that follows it.
+pub fn is_stricly_sorted<T: Ord>(els: &[T]) -> bool {
+    if els.is_empty() {
+        true // an empty slice counts as sorted; this also keeps `els[1..]` from panicking
+    } else {
+        els.iter()
+            .zip(els[1..].iter()) // pairs every element with its successor
+            .all(|(left, right)| left < right)
+    }
+}
+
 const HIGHEST_BIT: u64 = 1 << 63;
 
 /// Maps a `i64` to `u64`
@@ -105,12 +116,20 @@ pub fn u64_to_i64(val: u64) -> i64 {
 pub(crate) mod test {
 
     pub use super::serialize::test::fixed_size_test;
-    use super::{compute_num_bits, i64_to_u64, u64_to_i64};
+    use super::{compute_num_bits, i64_to_u64, u64_to_i64, is_stricly_sorted};
 
     fn test_i64_converter_helper(val: i64) {
         assert_eq!(u64_to_i64(i64_to_u64(val)), val);
     }
 
+    #[test]
+    fn test_is_strictly_sorted() {
+        assert!(is_stricly_sorted::<u32>(&[]));
+        assert!(is_stricly_sorted(&[1]));
+        assert!(is_stricly_sorted(&[1, 2, 3]));
+        assert!(!is_stricly_sorted(&[1, 3, 2]));
+        assert!(!is_stricly_sorted(&[1, 1, 2])); // equal neighbours violate strictness
+    }
     #[test]
     fn test_i64_converter() {
         assert_eq!(i64_to_u64(i64::min_value()), u64::min_value());
diff --git a/src/query/automaton_weight.rs b/src/query/automaton_weight.rs
index b38e6592d..d1040eb85 100644
--- a/src/query/automaton_weight.rs
+++ b/src/query/automaton_weight.rs
@@ -7,6 +7,11 @@ use query::{Scorer, Weight};
 use schema::{Field, IndexRecordOption};
 use termdict::{TermDictionary, TermStreamer};
 use Result;
+use query::weight::MatchingTerms;
+use SkipResult;
+use Term;
+use DocId;
+use DocSet;
 
 /// A weight struct for Fuzzy Term and Regex Queries
 pub struct AutomatonWeight<A>
@@ -36,6 +41,48 @@ impl<A> Weight for AutomatonWeight<A>
 where
     A: Automaton,
 {
+    /// Adds to `matching_terms`, for each of its candidate documents, every term
+    /// streamed by the automaton whose postings contain that document.
+    fn matching_terms(&self,
+                      reader: &SegmentReader,
+                      matching_terms: &mut MatchingTerms) -> Result<()> {
+        let inverted_index = reader.inverted_index(self.field);
+        let term_dict = inverted_index.terms();
+        let mut term_stream = self.automaton_stream(term_dict);
+
+        let doc_ids = matching_terms.sorted_doc_ids();
+        // Both buffers are cleared and reused for every term of the stream.
+        let mut docs_matching_current_term: Vec<DocId> = vec![];
+        let mut term_buffer: Vec<u8> = vec![];
+
+        while term_stream.advance() {
+            docs_matching_current_term.clear();
+            let term_info = term_stream.value();
+            let mut segment_postings = inverted_index.read_postings_from_terminfo(term_info, IndexRecordOption::Basic);
+            for &doc_id in &doc_ids {
+                match segment_postings.skip_next(doc_id) {
+                    SkipResult::Reached => {
+                        docs_matching_current_term.push(doc_id);
+                    }
+                    SkipResult::OverStep => {}
+                    // Postings exhausted: `doc_ids` is sorted, so no later
+                    // candidate can match this term.
+                    SkipResult::End => break,
+                }
+            }
+            if !docs_matching_current_term.is_empty() {
+                term_buffer.clear();
+                let term_ord = term_stream.term_ord();
+                inverted_index.terms().ord_to_term(term_ord, &mut term_buffer);
+                let term = Term::from_field_bytes(self.field, &term_buffer[..]);
+                for &doc_id in &docs_matching_current_term {
+                    matching_terms.add_term(doc_id, term.clone());
+                }
+            }
+        }
+        Ok(())
+    }
+
     fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
         let max_doc = reader.max_doc();
         let mut doc_bitset = BitSet::with_max_value(max_doc);
diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs
index ba45a8042..1a9075b5a 100644
--- a/src/query/term_query/term_weight.rs
+++ b/src/query/term_query/term_weight.rs
@@ -8,6 +8,8 @@ use query::Weight;
 use schema::IndexRecordOption;
 use Result;
 use Term;
+use SkipResult;
+use query::weight::MatchingTerms;
 
 pub struct TermWeight {
     term: Term,
@@ -38,6 +40,26 @@ impl Weight for TermWeight {
         }
     }
 
+    /// Records `self.term` for every candidate document of `matching_terms`
+    /// that the scorer of this term reaches in the segment `reader`.
+    fn matching_terms(&self,
+                      reader: &SegmentReader,
+                      matching_terms: &mut MatchingTerms) -> Result<()> {
+        let candidates = matching_terms.sorted_doc_ids();
+        let mut term_scorer = self.scorer(reader)?;
+        for candidate in candidates {
+            match term_scorer.skip_next(candidate) {
+                // Scorer exhausted: stop probing the remaining candidates.
+                SkipResult::End => break,
+                SkipResult::OverStep => {}
+                SkipResult::Reached => {
+                    matching_terms.add_term(candidate, self.term.clone());
+                }
+            }
+        }
+        Ok(())
+    }
+
     fn count(&self, reader: &SegmentReader) -> Result<u32> {
         if reader.num_deleted_docs() == 0 {
             let field = self.term.field();
diff --git a/src/query/weight.rs b/src/query/weight.rs
index d3d8b3520..51289c573 100644
--- a/src/query/weight.rs
+++ b/src/query/weight.rs
@@ -1,6 +1,37 @@
 use super::Scorer;
 use core::SegmentReader;
 use Result;
+use DocId;
+use std::collections::HashSet;
+use Term;
+use std::collections::BTreeMap;
+
+/// Terms matched by a query, grouped by document.
+///
+/// Only the doc ids given at construction are tracked.
+pub struct MatchingTerms {
+    doc_to_terms: BTreeMap<DocId, HashSet<Term>>,
+}
+
+impl MatchingTerms {
+    /// Creates a collection tracking `doc_ids`, each starting with no term.
+    pub fn from_doc_ids(doc_ids: &[DocId]) -> MatchingTerms {
+        let empty_entries = doc_ids.iter().map(|&doc_id| (doc_id, HashSet::default()));
+        MatchingTerms { doc_to_terms: empty_entries.collect() }
+    }
+
+    /// Returns the tracked doc ids in increasing order.
+    pub fn sorted_doc_ids(&self) -> Vec<DocId> {
+        self.doc_to_terms.keys().cloned().collect()
+    }
+
+    /// Records `term` for `doc_id`; untracked doc ids are ignored.
+    pub fn add_term(&mut self, doc_id: DocId, term: Term) {
+        if let Some(terms) = self.doc_to_terms.get_mut(&doc_id) {
+            terms.insert(term);
+        }
+    }
+}
 
 /// A Weight is the specialization of a Query
 /// for a given set of segments.
@@ -11,6 +42,10 @@ pub trait Weight {
     /// See [`Query`](./trait.Query.html).
     fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>>;
 
+    /// Hook for implementors to add their matching terms to `matching_terms`; this default adds nothing.
+    fn matching_terms(&self, reader: &SegmentReader, matching_terms: &mut MatchingTerms) -> Result<()> {
+        Ok(())
+    }
     /// Returns the number documents within the given `SegmentReader`.
     fn count(&self, reader: &SegmentReader) -> Result<u32> {
         Ok(self.scorer(reader)?.count())
diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs
index 8142c54a0..97c557e98 100644
--- a/src/snippet/mod.rs
+++ b/src/snippet/mod.rs
@@ -1,10 +1,8 @@
-use htmlescape::encode_minimal;
+    use htmlescape::encode_minimal;
 use schema::FieldValue;
-use schema::Value;
 use std::collections::BTreeMap;
 use tokenizer::BoxedTokenizer;
-use tokenizer::{Token, TokenStream, Tokenizer};
-use Document;
+use tokenizer::{Token, TokenStream};
 use Index;
 use Term;