mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-07 17:42:55 +00:00
Merge branch 'issue/368b'
@@ -5,7 +5,7 @@ Tantivy 0.7
  greatly improving performance
- Tantivy errors now rely on the failure crate (@drusellers)
- Added support for `AND`, `OR`, `NOT` syntax in addition to the `+`,`-` syntax
- Added a snippet generator with highlight (@vigneshsarma, @fulmicoton)
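As a quick illustration of the new operator support (the schema, field name, and query strings below are illustrative, not part of this commit), both syntaxes parse with the 0.7 `QueryParser`:

    // Hedged sketch: a throwaway in-RAM index, used only to build a QueryParser.
    use tantivy::query::QueryParser;
    use tantivy::schema::{SchemaBuilder, TEXT};
    use tantivy::Index;

    fn parse_both_syntaxes() -> tantivy::Result<()> {
        let mut schema_builder = SchemaBuilder::default();
        let body = schema_builder.add_text_field("body", TEXT);
        let index = Index::create_in_ram(schema_builder.build());
        let query_parser = QueryParser::for_index(&index, vec![body]);
        // `+` / `-` syntax, unchanged from earlier releases.
        let _plus_minus = query_parser.parse_query("+sycamore +spring")?;
        // Textual operator syntax added in 0.7.
        let _textual = query_parser.parse_query("sycamore AND spring")?;
        Ok(())
    }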
Tantivy 0.6.1
=========================
@@ -47,6 +47,7 @@ census = "0.1"
fnv = "1.0.6"
owned-read = "0.4"
failure = "0.1"
htmlescape = "0.3.1"
fail = "0.2"

[target.'cfg(windows)'.dependencies]
@@ -60,7 +61,6 @@ opt-level = 3
debug = false
lto = true
debug-assertions = false
overflow-checks = false

[profile.test]
debug-assertions = true
examples/snippet.rs (new file, 73 lines)
@@ -0,0 +1,73 @@
// # Snippet example
//
// This example shows how to return a representative snippet of
// your hit result.
// Snippets are extracts of a target document, returned in HTML format.
// The keywords searched by the user are highlighted with a `<b>` tag.
extern crate tempdir;

// ---
// Importing tantivy...
#[macro_use]
extern crate tantivy;
use tantivy::collector::TopCollector;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::Index;
use tantivy::SnippetGenerator;
use tempdir::TempDir;

fn main() -> tantivy::Result<()> {
    // Let's create a temporary directory for the
    // sake of this example
    let index_path = TempDir::new("tantivy_example_dir")?;

    // # Defining the schema
    let mut schema_builder = SchemaBuilder::default();
    // Both fields are stored so they can be fetched back from the doc store below.
    schema_builder.add_text_field("title", TEXT | STORED);
    schema_builder.add_text_field("body", TEXT | STORED);
    let schema = schema_builder.build();

    // # Indexing documents
    let index = Index::create_in_dir(&index_path, schema.clone())?;

    let mut index_writer = index.writer(50_000_000)?;

    let title = schema.get_field("title").unwrap();
    let body = schema.get_field("body").unwrap();

    // we'll only need one doc for this example.
    index_writer.add_document(doc!(
        title => "Of Mice and Men",
        body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
                 bank and runs deep and green. The water is warm too, for it has slipped twinkling \
                 over the yellow sands in the sunlight before reaching the narrow pool. On one \
                 side of the river the golden foothill slopes curve up to the strong and rocky \
                 Gabilan Mountains, but on the valley side the water is lined with trees—willows \
                 fresh and green with every spring, carrying in their lower leaf junctures the \
                 debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
                 limbs and branches that arch over the pool"
    ));
    // ...
    index_writer.commit()?;

    index.load_searchers()?;

    let searcher = index.searcher();
    let query_parser = QueryParser::for_index(&index, vec![title, body]);
    let query = query_parser.parse_query("sycamore spring")?;

    let mut top_collector = TopCollector::with_limit(10);
    searcher.search(&*query, &mut top_collector)?;

    let snippet_generator = SnippetGenerator::new(&*searcher, &*query, body)?;

    let doc_addresses = top_collector.docs();
    for doc_address in doc_addresses {
        let doc = searcher.doc(&doc_address)?;
        let snippet = snippet_generator.snippet_from_doc(&doc);
        println!("title: {}", doc.get_first(title).unwrap().text().unwrap());
        println!("snippet: {}", snippet.to_html());
    }

    Ok(())
}
@@ -28,6 +28,9 @@ use num_cpus;
use std::path::Path;
use tokenizer::TokenizerManager;
use IndexWriter;
use schema::FieldType;
use schema::Field;
use tokenizer::BoxedTokenizer;

fn load_metas(directory: &Directory) -> Result<IndexMeta> {
    let meta_data = directory.atomic_read(&META_FILEPATH)?;
@@ -112,6 +115,34 @@ impl Index {
        &self.tokenizers
    }

    /// Helper to access the tokenizer associated to a specific field.
    pub fn tokenizer_for_field(&self, field: Field) -> Result<Box<BoxedTokenizer>> {
        let field_entry = self.schema.get_field_entry(field);
        let field_type = field_entry.field_type();
        let tokenizer_manager: &TokenizerManager = self.tokenizers();
        let tokenizer_name_opt: Option<Box<BoxedTokenizer>> =
            match field_type {
                FieldType::Str(text_options) => {
                    text_options
                        .get_indexing_options()
                        .map(|text_indexing_options| text_indexing_options.tokenizer().to_string())
                        .and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name))
                },
                _ => {
                    None
                }
            };
        match tokenizer_name_opt {
            Some(tokenizer) => {
                Ok(tokenizer)
            }
            None => {
                Err(TantivyError::SchemaError(format!("{:?} is not a text field.", field_entry.name())))
            }
        }
    }

    /// Opens a new directory from an index path.
    #[cfg(feature = "mmap")]
    pub fn open_in_dir<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
@@ -258,7 +289,7 @@ impl Index {
        let schema = self.schema();
        let num_searchers: usize = self.num_searchers.load(Ordering::Acquire);
        let searchers = (0..num_searchers)
-            .map(|_| Searcher::new(schema.clone(), segment_readers.clone()))
+            .map(|_| Searcher::new(schema.clone(), self.clone(), segment_readers.clone()))
            .collect();
        self.searcher_pool.publish_new_generation(searchers);
        Ok(())
@@ -296,3 +327,26 @@ impl Clone for Index {
        }
    }
}

#[cfg(test)]
mod tests {
    use Index;
    use schema::{SchemaBuilder, TEXT, INT_INDEXED};

    #[test]
    fn test_indexer_for_field() {
        let mut schema_builder = SchemaBuilder::default();
        let num_likes_field = schema_builder.add_u64_field("num_likes", INT_INDEXED);
        let body_field = schema_builder.add_text_field("body", TEXT);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        assert!(index.tokenizer_for_field(body_field).is_ok());
        assert_eq!(
            format!("{:?}", index.tokenizer_for_field(num_likes_field).err()),
            "Some(SchemaError(\"\\\"num_likes\\\" is not a text field.\"))"
        );
    }
}
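For reference, a minimal hedged sketch (the function name and sample text are illustrative) of what the new helper gives you: the tokenizer configured for a text field, ready to produce a token stream:

    use tantivy::schema::{SchemaBuilder, TEXT};
    use tantivy::Index;

    fn print_tokens() -> tantivy::Result<()> {
        let mut schema_builder = SchemaBuilder::default();
        let body_field = schema_builder.add_text_field("body", TEXT);
        let index = Index::create_in_ram(schema_builder.build());
        // Fails with a SchemaError for non-text fields, as exercised in the test above.
        let tokenizer = index.tokenizer_for_field(body_field)?;
        let mut token_stream = tokenizer.token_stream("Of Mice and Men");
        // Each token carries its text plus byte offsets into the original string.
        while let Some(token) = token_stream.next() {
            println!("{} [{}..{}]", token.text, token.offset_from, token.offset_to);
        }
        Ok(())
    }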
@@ -10,6 +10,7 @@ use std::sync::Arc;
use termdict::TermMerger;
use DocAddress;
use Result;
use Index;

/// Holds a list of `SegmentReader`s ready for search.
///
@@ -18,17 +19,25 @@ use Result;
///
pub struct Searcher {
    schema: Schema,
    index: Index,
    segment_readers: Vec<SegmentReader>,
}

impl Searcher {
    /// Creates a new `Searcher`
-    pub(crate) fn new(schema: Schema, segment_readers: Vec<SegmentReader>) -> Searcher {
+    pub(crate) fn new(schema: Schema, index: Index, segment_readers: Vec<SegmentReader>) -> Searcher {
        Searcher {
            schema,
            index,
            segment_readers,
        }
    }

    /// Returns the `Index` associated to the `Searcher`
    pub fn index(&self) -> &Index {
        &self.index
    }

    /// Fetches a document from tantivy's store given a `DocAddress`.
    ///
    /// The searcher uses the segment ordinal to route the
@@ -770,23 +770,23 @@ mod tests {
        }
        {
            let doc = searcher.doc(&DocAddress(0, 0)).unwrap();
-            assert_eq!(doc.get_first(text_field).unwrap().text(), "af b");
+            assert_eq!(doc.get_first(text_field).unwrap().text(), Some("af b"));
        }
        {
            let doc = searcher.doc(&DocAddress(0, 1)).unwrap();
-            assert_eq!(doc.get_first(text_field).unwrap().text(), "a b c");
+            assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c"));
        }
        {
            let doc = searcher.doc(&DocAddress(0, 2)).unwrap();
-            assert_eq!(doc.get_first(text_field).unwrap().text(), "a b c d");
+            assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c d"));
        }
        {
            let doc = searcher.doc(&DocAddress(0, 3)).unwrap();
-            assert_eq!(doc.get_first(text_field).unwrap().text(), "af b");
+            assert_eq!(doc.get_first(text_field).unwrap().text(), Some("af b"));
        }
        {
            let doc = searcher.doc(&DocAddress(0, 4)).unwrap();
-            assert_eq!(doc.get_first(text_field).unwrap().text(), "a b c g");
+            assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c g"));
        }
        {
            let get_fast_vals = |terms: Vec<Term>| {
src/lib.rs (10 changed lines, Normal file → Executable file)
@@ -154,6 +154,7 @@ extern crate stable_deref_trait;
extern crate tempdir;
extern crate tempfile;
extern crate uuid;
extern crate htmlescape;

#[cfg(test)]
#[macro_use]
@@ -210,6 +211,9 @@ pub mod schema;
pub mod store;
pub mod termdict;

mod snippet;
pub use self::snippet::SnippetGenerator;

mod docset;
pub use self::docset::{DocSet, SkipResult};

@@ -893,11 +897,11 @@ mod tests {
        assert_eq!(document.len(), 3);
        let values = document.get_all(text_field);
        assert_eq!(values.len(), 2);
-        assert_eq!(values[0].text(), "tantivy");
-        assert_eq!(values[1].text(), "some other value");
+        assert_eq!(values[0].text(), Some("tantivy"));
+        assert_eq!(values[1].text(), Some("some other value"));
        let values = document.get_all(other_text_field);
        assert_eq!(values.len(), 1);
-        assert_eq!(values[0].text(), "short");
+        assert_eq!(values[0].text(), Some("short"));
    }

    #[test]
@@ -6,6 +6,7 @@ use query::Weight;
use schema::IndexRecordOption;
use schema::Term;
use Result;
use std::collections::BTreeSet;
use Searcher;

/// The boolean query combines a set of queries
@@ -40,6 +41,7 @@ impl From<Vec<(Occur, Box<Query>)>> for BooleanQuery {
}

impl Query for BooleanQuery {

    fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> Result<Box<Weight>> {
        let sub_weights = self.subqueries
            .iter()
@@ -49,6 +51,12 @@ impl Query for BooleanQuery {
            .collect::<Result<_>>()?;
        Ok(Box::new(BooleanWeight::new(sub_weights, scoring_enabled)))
    }

    fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
        for (_occur, subquery) in &self.subqueries {
            subquery.query_terms(term_set);
        }
    }
}

impl BooleanQuery {
@@ -27,7 +27,6 @@ mod weight;
mod vec_docset;

pub(crate) mod score_combiner;

pub use self::intersection::Intersection;
pub use self::union::Union;
@@ -6,6 +6,7 @@ use query::Query;
use query::Weight;
use schema::{Field, Term};
use Result;
use std::collections::BTreeSet;

/// `PhraseQuery` matches a specific sequence of words.
///
@@ -107,4 +108,10 @@ impl Query for PhraseQuery {
        )))
    }
}

    fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
        for (_, query_term) in &self.phrase_terms {
            term_set.insert(query_term.clone());
        }
    }
}
@@ -30,6 +30,7 @@ impl PhraseWeight {
}

impl Weight for PhraseWeight {

    fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
        let similarity_weight = self.similarity_weight.clone();
        let field = self.phrase_terms[0].1.field();
@@ -5,6 +5,8 @@ use downcast;
use std::fmt;
use Result;
use SegmentLocalId;
use std::collections::BTreeSet;
use Term;

/// The `Query` trait defines a set of documents and a scoring method
/// for those documents.
@@ -58,6 +60,10 @@ pub trait Query: QueryClone + downcast::Any + fmt::Debug {
        Ok(result)
    }

    /// Extracts all of the terms associated with the query and inserts them
    /// into the term set given as argument.
    fn query_terms(&self, _term_set: &mut BTreeSet<Term>) {}

    /// Search works as follows :
    ///
    /// First the weight object associated to the query is created.
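A small hedged sketch (hypothetical schema and query string) of the new hook, mirroring how `SnippetGenerator::new` consumes it later in this commit:

    use std::collections::BTreeSet;
    use tantivy::query::QueryParser;
    use tantivy::schema::{SchemaBuilder, TEXT};
    use tantivy::{Index, Term};

    fn collect_query_terms() -> tantivy::Result<()> {
        let mut schema_builder = SchemaBuilder::default();
        let body = schema_builder.add_text_field("body", TEXT);
        let index = Index::create_in_ram(schema_builder.build());
        let query = QueryParser::for_index(&index, vec![body]).parse_query("sycamore spring")?;
        // Every term of every subquery lands in the set, e.g. for highlighting.
        let mut terms: BTreeSet<Term> = BTreeSet::new();
        query.query_terms(&mut terms);
        for term in &terms {
            println!("{:?}", term);
        }
        Ok(())
    }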
@@ -274,6 +274,7 @@ impl RangeWeight {
}

impl Weight for RangeWeight {

    fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
        let max_doc = reader.max_doc();
        let mut doc_bitset = BitSet::with_max_value(max_doc);
@@ -6,6 +6,7 @@ use schema::IndexRecordOption;
use Result;
use Searcher;
use Term;
use std::collections::BTreeSet;

/// A Term query matches all of the documents
/// containing a specific term.
@@ -110,4 +111,7 @@ impl Query for TermQuery {
    fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> Result<Box<Weight>> {
        Ok(Box::new(self.specialized_weight(searcher, scoring_enabled)))
    }
    fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
        term_set.insert(self.term.clone());
    }
}
@@ -443,8 +443,8 @@ mod tests {
            }"#,
        )
        .unwrap();
-        assert_eq!(doc.get_first(title_field).unwrap().text(), "my title");
-        assert_eq!(doc.get_first(author_field).unwrap().text(), "fulmicoton");
+        assert_eq!(doc.get_first(title_field).unwrap().text(), Some("my title"));
+        assert_eq!(doc.get_first(author_field).unwrap().text(), Some("fulmicoton"));
        assert_eq!(doc.get_first(count_field).unwrap().u64_value(), 4);
        assert_eq!(doc.get_first(popularity_field).unwrap().i64_value(), 10);
    }
@@ -74,10 +74,10 @@ impl Value {
    ///
    /// # Panics
    /// If the value is not of type `Str`
-    pub fn text(&self) -> &str {
+    pub fn text(&self) -> Option<&str> {
        match *self {
-            Value::Str(ref text) => text,
-            _ => panic!("This is not a text field."),
+            Value::Str(ref text) => Some(text),
+            _ => None,
        }
    }
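Since `text()` no longer panics, call sites now match on the returned `Option`; a minimal hedged sketch (the helper name is illustrative) of the new contract:

    use tantivy::schema::Value;

    /// Returns the first `Str` value, skipping non-text values instead of panicking.
    fn first_text(values: &[Value]) -> Option<&str> {
        values.iter().filter_map(Value::text).next()
    }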
src/snippet/mod.rs (new file, 479 lines)
@@ -0,0 +1,479 @@
use htmlescape::encode_minimal;
use std::collections::BTreeMap;
use tokenizer::{Token, TokenStream};
use Result;
use query::Query;
use Searcher;
use schema::Field;
use std::collections::BTreeSet;
use tokenizer::BoxedTokenizer;
use Document;
use std::cmp::Ordering;

const DEFAULT_MAX_NUM_CHARS: usize = 150;

#[derive(Debug)]
pub struct HighlightSection {
    start: usize,
    stop: usize,
}

impl HighlightSection {
    fn new(start: usize, stop: usize) -> HighlightSection {
        HighlightSection { start, stop }
    }
}

#[derive(Debug)]
pub struct FragmentCandidate {
    score: f32,
    start_offset: usize,
    stop_offset: usize,
    num_chars: usize,
    highlighted: Vec<HighlightSection>,
}
impl FragmentCandidate {
    /// Create a basic `FragmentCandidate`
    ///
    /// `score`, `num_chars` are set to 0
    /// and `highlighted` is set to empty vec
    /// stop_offset is set to start_offset, which is taken as a param.
    fn new(start_offset: usize) -> FragmentCandidate {
        FragmentCandidate {
            score: 0.0,
            start_offset: start_offset,
            stop_offset: start_offset,
            num_chars: 0,
            highlighted: vec![],
        }
    }

    /// Updates the `score` and `highlighted` fields of the object.
    ///
    /// Taking the token and terms, the token is added to the fragment.
    /// If the token is one of the terms, the score
    /// and highlighted fields are updated in the fragment.
    fn try_add_token(&mut self, token: &Token, terms: &BTreeMap<String, f32>) {
        self.stop_offset = token.offset_to;

        if let Some(score) = terms.get(&token.text.to_lowercase()) {
            self.score += score;
            self.highlighted
                .push(HighlightSection::new(token.offset_from, token.offset_to));
        }
    }
}
#[derive(Debug)]
pub struct Snippet {
    fragments: String,
    highlighted: Vec<HighlightSection>,
}

const HIGHLIGHTEN_PREFIX: &str = "<b>";
const HIGHLIGHTEN_POSTFIX: &str = "</b>";

impl Snippet {

    pub fn empty() -> Snippet {
        Snippet {
            fragments: String::new(),
            highlighted: Vec::new()
        }
    }

    /// Returns a highlighted HTML rendering of the `Snippet`.
    pub fn to_html(&self) -> String {
        let mut html = String::new();
        let mut start_from: usize = 0;

        for item in self.highlighted.iter() {
            html.push_str(&encode_minimal(&self.fragments[start_from..item.start]));
            html.push_str(HIGHLIGHTEN_PREFIX);
            html.push_str(&encode_minimal(&self.fragments[item.start..item.stop]));
            html.push_str(HIGHLIGHTEN_POSTFIX);
            start_from = item.stop;
        }
        html.push_str(&encode_minimal(
            &self.fragments[start_from..self.fragments.len()],
        ));
        html
    }
}
/// Returns a non-empty list of "good" fragments.
///
/// If no target term is within the text, then the function
/// should return an empty Vec.
///
/// If a target term is within the text, then the returned
/// list is required to be non-empty.
///
/// The returned list is non-empty and contains fewer
/// than 12 possibly overlapping fragments.
///
/// All fragments should contain at least one target term
/// and have at most `max_num_chars` characters (not bytes).
///
/// It is ok to emit overlapping fragments, for instance,
/// one short and one long containing the same keyword, in order
/// to leave optimization opportunity to the fragment selector
/// upstream.
///
/// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`
/// has to be a valid string.
fn search_fragments<'a>(
    tokenizer: &BoxedTokenizer,
    text: &'a str,
    terms: &BTreeMap<String, f32>,
    max_num_chars: usize,
) -> Vec<FragmentCandidate> {
    let mut token_stream = tokenizer.token_stream(text);
    let mut fragment = FragmentCandidate::new(0);
    let mut fragments: Vec<FragmentCandidate> = vec![];

    while let Some(next) = token_stream.next() {
        if (next.offset_to - fragment.start_offset) > max_num_chars {
            if fragment.score > 0.0 {
                fragments.push(fragment)
            };
            fragment = FragmentCandidate::new(next.offset_from);
        }
        fragment.try_add_token(next, &terms);
    }
    if fragment.score > 0.0 {
        fragments.push(fragment)
    }

    fragments
}
/// Returns a Snippet
///
/// Takes a vector of `FragmentCandidate`s and the text.
/// Figures out the best fragment from it and creates a snippet.
fn select_best_fragment_combination<'a>(
    fragments: Vec<FragmentCandidate>,
    text: &'a str,
) -> Snippet {
    let best_fragment_opt = fragments
        .iter()
        .max_by(|left, right| {
            let cmp_score = left.score.partial_cmp(&right.score).unwrap_or(Ordering::Equal);
            if cmp_score == Ordering::Equal {
                (right.start_offset, right.stop_offset).cmp(&(left.start_offset, left.stop_offset))
            } else {
                cmp_score
            }
        });
    if let Some(fragment) = best_fragment_opt {
        let fragment_text = &text[fragment.start_offset..fragment.stop_offset];
        let highlighted = fragment
            .highlighted
            .iter()
            .map(|item| {
                HighlightSection::new(
                    item.start - fragment.start_offset,
                    item.stop - fragment.start_offset,
                )
            }).collect();
        Snippet {
            fragments: fragment_text.to_string(),
            highlighted: highlighted,
        }
    } else {
        // when there are no fragments to choose from,
        // create an empty snippet for now
        Snippet {
            fragments: String::new(),
            highlighted: vec![],
        }
    }
}
/// `SnippetGenerator`
///
/// # Example
///
/// ```rust
/// # #[macro_use]
/// # extern crate tantivy;
/// # use tantivy::Index;
/// # use tantivy::schema::{SchemaBuilder, TEXT};
/// # use tantivy::query::QueryParser;
/// use tantivy::SnippetGenerator;
///
/// # fn main() -> tantivy::Result<()> {
/// # let mut schema_builder = SchemaBuilder::default();
/// # let text_field = schema_builder.add_text_field("text", TEXT);
/// # let schema = schema_builder.build();
/// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 30_000_000)?;
/// # let doc = doc!(text_field => r#"Comme je descendais des Fleuves impassibles,
/// # Je ne me sentis plus guidé par les haleurs :
/// # Des Peaux-Rouges criards les avaient pris pour cibles,
/// # Les ayant cloués nus aux poteaux de couleurs.
/// #
/// # J'étais insoucieux de tous les équipages,
/// # Porteur de blés flamands ou de cotons anglais.
/// # Quand avec mes haleurs ont fini ces tapages,
/// # Les Fleuves m'ont laissé descendre où je voulais.
/// # "#);
/// # index_writer.add_document(doc.clone());
/// # index_writer.commit()?;
/// # let query_parser = QueryParser::for_index(&index, vec![text_field]);
/// // ...
/// let query = query_parser.parse_query("haleurs flamands").unwrap();
/// # index.load_searchers()?;
/// # let searcher = index.searcher();
/// let mut snippet_generator = SnippetGenerator::new(&*searcher, &*query, text_field)?;
/// snippet_generator.set_max_num_chars(100);
/// let snippet = snippet_generator.snippet_from_doc(&doc);
/// let snippet_html: String = snippet.to_html();
/// assert_eq!(snippet_html, "Comme je descendais des Fleuves impassibles,\n Je ne me sentis plus guidé par les <b>haleurs</b> :\n Des");
/// # Ok(())
/// # }
/// ```
pub struct SnippetGenerator {
    terms_text: BTreeMap<String, f32>,
    tokenizer: Box<BoxedTokenizer>,
    field: Field,
    max_num_chars: usize
}
impl SnippetGenerator {
    /// Creates a new snippet generator
    pub fn new(searcher: &Searcher,
               query: &Query,
               field: Field) -> Result<SnippetGenerator> {
        let mut terms = BTreeSet::new();
        query.query_terms(&mut terms);
        let terms_text: BTreeMap<String, f32> = terms.into_iter()
            .filter(|term| term.field() == field)
            .map(|term| (term.text().to_string(), 1f32))
            .collect();
        let tokenizer = searcher.index().tokenizer_for_field(field)?;
        Ok(SnippetGenerator {
            terms_text,
            tokenizer,
            field,
            max_num_chars: DEFAULT_MAX_NUM_CHARS
        })
    }

    /// Sets a maximum number of chars.
    pub fn set_max_num_chars(&mut self, max_num_chars: usize) {
        self.max_num_chars = max_num_chars;
    }

    /// Generates a snippet for the given `Document`.
    ///
    /// This method extracts the text associated to the `SnippetGenerator`'s field
    /// and computes a snippet.
    pub fn snippet_from_doc(&self, doc: &Document) -> Snippet {
        let text: String = doc.get_all(self.field)
            .into_iter()
            .flat_map(|val| val.text())
            .collect::<Vec<&str>>()
            .join(" ");
        self.snippet(&text)
    }

    /// Generates a snippet for the given text.
    pub fn snippet(&self, text: &str) -> Snippet {
        let fragment_candidates = search_fragments(&*self.tokenizer,
                                                   &text,
                                                   &self.terms_text,
                                                   self.max_num_chars);
        select_best_fragment_combination(fragment_candidates, &text)
    }
}
#[cfg(test)]
mod tests {
    use super::{search_fragments, select_best_fragment_combination};
    use std::collections::BTreeMap;
    use std::iter::Iterator;
    use tokenizer::{box_tokenizer, SimpleTokenizer};
    use Index;
    use schema::{SchemaBuilder, IndexRecordOption, TextOptions, TextFieldIndexing};
    use SnippetGenerator;
    use query::QueryParser;

    const TEST_TEXT: &'static str = r#"Rust is a systems programming language sponsored by Mozilla which
describes it as a "safe, concurrent, practical language", supporting functional and
imperative-procedural paradigms. Rust is syntactically similar to C++[according to whom?],
but its designers intend it to provide better memory safety while still maintaining
performance.

Rust is free and open-source software, released under an MIT License, or Apache License
2.0. Its designers have refined the language through the experiences of writing the Servo
web browser layout engine[14] and the Rust compiler. A large proportion of current commits
to the project are from community members.[15]

Rust won first place for "most loved programming language" in the Stack Overflow Developer
Survey in 2016, 2017, and 2018."#;
    #[test]
    fn test_snippet() {
        let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
        let mut terms = BTreeMap::new();
        terms.insert(String::from("rust"), 1.0);
        terms.insert(String::from("language"), 0.9);
        let fragments = search_fragments(&*boxed_tokenizer, TEST_TEXT, &terms, 100);
        assert_eq!(fragments.len(), 7);
        {
            let first = fragments.iter().nth(0).unwrap();
            assert_eq!(first.score, 1.9);
            assert_eq!(first.stop_offset, 89);
        }
        let snippet = select_best_fragment_combination(fragments, &TEST_TEXT);
        assert_eq!(snippet.fragments, "Rust is a systems programming language sponsored by Mozilla which\ndescribes it as a \"safe".to_owned());
        assert_eq!(snippet.to_html(), "<b>Rust</b> is a systems programming <b>language</b> sponsored by Mozilla which\ndescribes it as a &quot;safe".to_owned())
    }
    #[test]
    fn test_snippet_in_second_fragment() {
        let boxed_tokenizer = box_tokenizer(SimpleTokenizer);

        let text = "a b c d e f g";

        let mut terms = BTreeMap::new();
        terms.insert(String::from("c"), 1.0);

        let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);

        assert_eq!(fragments.len(), 1);
        {
            let first = fragments.iter().nth(0).unwrap();
            assert_eq!(first.score, 1.0);
            assert_eq!(first.start_offset, 4);
            assert_eq!(first.stop_offset, 7);
        }

        let snippet = select_best_fragment_combination(fragments, &text);
        assert_eq!(snippet.fragments, "c d");
        assert_eq!(snippet.to_html(), "<b>c</b> d");
    }
    #[test]
    fn test_snippet_with_term_at_the_end_of_fragment() {
        let boxed_tokenizer = box_tokenizer(SimpleTokenizer);

        let text = "a b c d e f f g";

        let mut terms = BTreeMap::new();
        terms.insert(String::from("f"), 1.0);

        let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);

        assert_eq!(fragments.len(), 2);
        {
            let first = fragments.iter().nth(0).unwrap();
            assert_eq!(first.score, 1.0);
            assert_eq!(first.stop_offset, 11);
            assert_eq!(first.start_offset, 8);
        }

        let snippet = select_best_fragment_combination(fragments, &text);
        assert_eq!(snippet.fragments, "e f");
        assert_eq!(snippet.to_html(), "e <b>f</b>");
    }
    #[test]
    fn test_snippet_with_second_fragment_has_the_highest_score() {
        let boxed_tokenizer = box_tokenizer(SimpleTokenizer);

        let text = "a b c d e f g";

        let mut terms = BTreeMap::new();
        terms.insert(String::from("f"), 1.0);
        terms.insert(String::from("a"), 0.9);

        let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 7);

        assert_eq!(fragments.len(), 2);
        {
            let first = fragments.iter().nth(0).unwrap();
            assert_eq!(first.score, 0.9);
            assert_eq!(first.stop_offset, 7);
            assert_eq!(first.start_offset, 0);
        }

        let snippet = select_best_fragment_combination(fragments, &text);
        assert_eq!(snippet.fragments, "e f g");
        assert_eq!(snippet.to_html(), "e <b>f</b> g");
    }
    #[test]
    fn test_snippet_with_term_not_in_text() {
        let boxed_tokenizer = box_tokenizer(SimpleTokenizer);

        let text = "a b c d";

        let mut terms = BTreeMap::new();
        terms.insert(String::from("z"), 1.0);

        let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);

        assert_eq!(fragments.len(), 0);

        let snippet = select_best_fragment_combination(fragments, &text);
        assert_eq!(snippet.fragments, "");
        assert_eq!(snippet.to_html(), "");
    }
    #[test]
    fn test_snippet_with_no_terms() {
        let boxed_tokenizer = box_tokenizer(SimpleTokenizer);

        let text = "a b c d";

        let terms = BTreeMap::new();
        let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);
        assert_eq!(fragments.len(), 0);

        let snippet = select_best_fragment_combination(fragments, &text);
        assert_eq!(snippet.fragments, "");
        assert_eq!(snippet.to_html(), "");
    }
    #[test]
    fn test_snippet_generator() {
        let mut schema_builder = SchemaBuilder::default();
        let text_options = TextOptions::default()
            .set_indexing_options(TextFieldIndexing::default()
                .set_tokenizer("en_stem")
                .set_index_option(IndexRecordOption::Basic)
            );
        let text_field = schema_builder.add_text_field("text", text_options);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        {
            // writing the segment
            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
            {
                let doc = doc!(text_field => TEST_TEXT);
                index_writer.add_document(doc);
            }
            index_writer.commit().unwrap();
        }
        index.load_searchers().unwrap();
        let searcher = index.searcher();
        let query_parser = QueryParser::for_index(&index, vec![text_field]);
        let query = query_parser.parse_query("rust design").unwrap();
        let mut snippet_generator = SnippetGenerator::new(&*searcher, &*query, text_field).unwrap();
        {
            let snippet = snippet_generator.snippet(TEST_TEXT);
            assert_eq!(snippet.to_html(), "imperative-procedural paradigms. <b>Rust</b> is syntactically similar to C++[according to whom?],\nbut its <b>designers</b> intend it to provide better memory safety");
        }
        {
            snippet_generator.set_max_num_chars(90);
            let snippet = snippet_generator.snippet(TEST_TEXT);
            assert_eq!(snippet.to_html(), "<b>Rust</b> is syntactically similar to C++[according to whom?],\nbut its <b>designers</b> intend it to");
        }
    }
}
@@ -109,7 +109,7 @@ pub mod tests {
        let store = StoreReader::from_source(store_source);
        for i in 0..1_000 {
            assert_eq!(
-                *store.get(i).unwrap().get_first(field_title).unwrap().text(),
+                *store.get(i).unwrap().get_first(field_title).unwrap().text().unwrap(),
                format!("Doc {}", i)
            );
        }
@@ -152,6 +152,8 @@ pub use self::stemmer::Stemmer;
pub use self::stop_word_filter::StopWordFilter;
pub(crate) use self::token_stream_chain::TokenStreamChain;
pub use self::tokenizer::BoxedTokenizer;
pub(crate) use self::tokenizer::box_tokenizer;

pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
pub use self::tokenizer_manager::TokenizerManager;
@@ -1,6 +1,6 @@
use std::collections::HashMap;
use std::sync::{Arc, RwLock};
-use tokenizer::tokenizer::box_tokenizer;
+use tokenizer::box_tokenizer;
use tokenizer::BoxedTokenizer;
use tokenizer::LowerCaser;
use tokenizer::RawTokenizer;