Simple Implementation of NGram Tokenizer (#278)
* Simple implementation of an NGram tokenizer. It does not yet support edges. It could probably be better in many "rusty" ways, but the test is passing, so I'll call this a good stopping point for the day.
* Remove Ngram from the manager: too many variations.
* Basic configuration model. Should the extensive tests exist here?
* Add a sample to provide end-to-end testing.
* Basic edge-gram support.
* Cleanup.
* Code feedback.
* More code review feedback processed.
Committed by: Paul Masurel
Parent: 68ee18e4e8
Commit: ca74c14647
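For orientation before the diff: the commit adds an NgramTokenizer, registers it under a name in the index's TokenizerManager, and points a text field's indexing options at that name. A minimal sketch condensed from the examples/custom_tokenizer.rs file added below (the build_ngram_index helper name is mine; the calls mirror that example and the tantivy APIs of this era):

extern crate tantivy;

use std::path::Path;
use tantivy::schema::*;
use tantivy::tokenizer::NgramTokenizer;
use tantivy::Index;

fn build_ngram_index(index_path: &Path) -> tantivy::Result<Index> {
    let mut schema_builder = SchemaBuilder::default();
    // Index the title with 3-character grams so any 3-character window is findable.
    let text_field_indexing = TextFieldIndexing::default()
        .set_tokenizer("ngram3")
        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
    schema_builder.add_text_field(
        "title",
        TextOptions::default()
            .set_indexing_options(text_field_indexing)
            .set_stored(),
    );
    let schema = schema_builder.build();

    let index = Index::create(index_path, schema)?;
    // Register the tokenizer under the name the schema refers to:
    // min_gram = 3, max_gram = 3, prefix_only = false.
    index
        .tokenizers()
        .register("ngram3", NgramTokenizer::new(3, 3, false));
    Ok(index)
}

With a (3, 3, false) configuration every 3-character window of the title becomes an indexed term, which is what lets the example later match the query "ken" against "Frankenstein".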
Cargo.toml
@@ -67,3 +67,6 @@ travis-ci = { repository = "tantivy-search/tantivy" }
 [[example]]
 name = "simple_search"
 required-features = ["mmap"]
+
+[[example]]
+name = "custom_tokenizer"
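Presumably the new target can then be run with "cargo run --example custom_tokenizer"; whether it also needs "--features mmap", as simple_search does through required-features, depends on Cargo.toml lines outside this hunk.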
examples/custom_tokenizer.rs (new file, 226 lines)
@@ -0,0 +1,226 @@
extern crate tantivy;
extern crate tempdir;

#[macro_use]
extern crate serde_json;

use std::path::Path;
use tantivy::collector::TopCollector;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::tokenizer::NgramTokenizer;
use tantivy::Index;
use tempdir::TempDir;

fn main() {
    // Let's create a temporary directory for the
    // sake of this example.
    if let Ok(dir) = TempDir::new("tantivy_token_example_dir") {
        run_example(dir.path()).unwrap();
        dir.close().unwrap();
    }
}

fn run_example(index_path: &Path) -> tantivy::Result<()> {
    // # Defining the schema
    //
    // The Tantivy index requires a very strict schema.
    // The schema declares which fields are in the index,
    // and for each field, its type and "the way it should
    // be indexed".

    // First we need to define a schema...
    let mut schema_builder = SchemaBuilder::default();

    // Our first field is title.
    // In this example we want to use NGram searching.
    // We will set the gram size to 3 characters, so any three
    // consecutive characters in the title should be findable.
    let text_field_indexing = TextFieldIndexing::default()
        .set_tokenizer("ngram3")
        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
    let text_options = TextOptions::default()
        .set_indexing_options(text_field_indexing)
        .set_stored();
    schema_builder.add_text_field("title", text_options);

    // Our second field is body.
    // We want full-text search for it, but we do not
    // need to be able to retrieve it
    // for our application.
    //
    // We can make our index lighter
    // by omitting the `STORED` flag.
    schema_builder.add_text_field("body", TEXT);

    let schema = schema_builder.build();

    // # Indexing documents
    //
    // Let's create a brand new index.
    //
    // This will actually just save a meta.json
    // with our schema in the directory.
    let index = Index::create(index_path, schema.clone())?;

    // Here we are registering our custom tokenizer.
    // It will emit tokens of 3 characters each.
    index
        .tokenizers()
        .register("ngram3", NgramTokenizer::new(3, 3, false));

    // To insert documents we need an index writer.
    // There must be only one writer at a time.
    // This single `IndexWriter` is already
    // multithreaded.
    //
    // Here we use a buffer of 50MB per thread. Using a bigger
    // heap for the indexer can increase its throughput.
    let mut index_writer = index.writer(50_000_000)?;

    // Let's index our documents!
    // We first need a handle on the title and the body field.

    // ### Create a document "manually".
    //
    // We can create a document manually, by setting the fields
    // one by one in a Document object.
    let title = schema.get_field("title").unwrap();
    let body = schema.get_field("body").unwrap();

    let mut old_man_doc = Document::default();
    old_man_doc.add_text(title, "The Old Man and the Sea");
    old_man_doc.add_text(
        body,
        "He was an old man who fished alone in a skiff in the Gulf Stream and \
         he had gone eighty-four days now without taking a fish.",
    );

    // ... and add it to the `IndexWriter`.
    index_writer.add_document(old_man_doc);

    // ### Create a document directly from JSON.
    //
    // Alternatively, we can use our schema to parse a
    // document object directly from JSON.
    // The document is a string, but we use the `json` macro
    // from `serde_json` for the convenience of multi-line support.
    let json = json!({
        "title": "Of Mice and Men",
        "body": "A few miles south of Soledad, the Salinas River drops in close to the hillside \
                 bank and runs deep and green. The water is warm too, for it has slipped twinkling \
                 over the yellow sands in the sunlight before reaching the narrow pool. On one \
                 side of the river the golden foothill slopes curve up to the strong and rocky \
                 Gabilan Mountains, but on the valley side the water is lined with trees—willows \
                 fresh and green with every spring, carrying in their lower leaf junctures the \
                 debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
                 limbs and branches that arch over the pool"
    });
    let mice_and_men_doc = schema.parse_document(&json.to_string())?;

    index_writer.add_document(mice_and_men_doc);

    // Multi-valued fields are allowed; they are
    // expressed in JSON by an array.
    // The following document has two titles.
    let json = json!({
        "title": ["Frankenstein", "The Modern Prometheus"],
        "body": "You will rejoice to hear that no disaster has accompanied the commencement of an \
                 enterprise which you have regarded with such evil forebodings. I arrived here \
                 yesterday, and my first task is to assure my dear sister of my welfare and \
                 increasing confidence in the success of my undertaking."
    });
    let frankenstein_doc = schema.parse_document(&json.to_string())?;

    index_writer.add_document(frankenstein_doc);

    // This is an example, so we will only index 3 documents
    // here. You can check out tantivy's tutorial to index
    // the English wikipedia. Tantivy's indexing is rather fast.
    // Indexing 5 million articles of the English wikipedia takes
    // around 4 minutes on my computer!

    // ### Committing
    //
    // At this point our documents are not searchable.
    //
    // We need to call .commit() explicitly to force the
    // index_writer to finish processing the documents in the queue,
    // flush the current index to the disk, and advertise
    // the existence of new documents.
    //
    // This call is blocking.
    index_writer.commit()?;

    // If `.commit()` returns correctly, then all of the
    // documents that have been added are guaranteed to be
    // persistently indexed.
    //
    // In the scenario of a crash or a power failure,
    // tantivy behaves as if it has rolled back to its last
    // commit.

    // # Searching
    //
    // Let's search our index. Start by reloading
    // searchers in the index. This should be done
    // after every commit().
    index.load_searchers()?;

    // Afterwards create one (or more) searchers.
    //
    // You should create a searcher
    // every time you start a "search query".
    let searcher = index.searcher();

    // The query parser can interpret human queries.
    // Here, if the user does not specify which
    // field they want to search, tantivy will search
    // in both title and body.
    let query_parser = QueryParser::for_index(&index, vec![title, body]);

    // Here we want to get a hit on the 'ken' in Frankenstein.
    let query = query_parser.parse_query("ken")?;

    // A query defines a set of documents, as
    // well as the way they should be scored.
    //
    // A query created by the query parser is scored according
    // to a metric called Tf-Idf, and will consider
    // any document matching at least one of our terms.

    // ### Collectors
    //
    // We are not interested in all of the documents but
    // only in the top 10. Keeping track of our top 10 best documents
    // is the role of the TopCollector.
    let mut top_collector = TopCollector::with_limit(10);

    // We can now perform our query.
    searcher.search(&*query, &mut top_collector)?;

    // Our top collector now contains the 10
    // most relevant doc ids...
    let doc_addresses = top_collector.docs();

    // The actual documents still need to be
    // retrieved from Tantivy's store.
    //
    // Since the body field was not configured as stored,
    // the documents returned will only contain
    // a title.

    for doc_address in doc_addresses {
        let retrieved_doc = searcher.doc(&doc_address)?;
        println!("{}", schema.to_json(&retrieved_doc));
    }

    // Wait for indexing and merging threads to shut down.
    // Usually this isn't needed, but in `main` we try to
    // delete the temporary directory and that fails on
    // Windows if the files are still open.
    index_writer.wait_merging_threads()?;

    Ok(())
}
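Worth spelling out why parse_query("ken") above can hit "Frankenstein": with NgramTokenizer::new(3, 3, false) every 3-character window of the title is indexed as its own term, and "ken" is one of those windows. A standalone illustration (the trigrams helper is mine, not a tantivy API; it uses byte offsets, which is fine for ASCII input):

// Illustration only: the 3-grams a (3, 3, false) ngram tokenization of a title produces.
fn trigrams(text: &str) -> Vec<&str> {
    (0..text.len().saturating_sub(2))
        .map(|i| &text[i..i + 3])
        .collect()
}

fn main() {
    let grams = trigrams("Frankenstein");
    // ["Fra", "ran", "ank", "nke", "ken", "ens", "nst", "ste", "tei", "ein"]
    println!("{:?}", grams);
    // ...which is why the query "ken" in the example above finds this title.
    assert!(grams.contains(&"ken"));
}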
src/tokenizer/mod.rs
@@ -132,6 +132,7 @@ mod alphanum_only;
 mod facet_tokenizer;
 mod japanese_tokenizer;
 mod lower_caser;
+mod ngram_tokenizer;
 mod raw_tokenizer;
 mod remove_long;
 mod simple_tokenizer;
@@ -144,6 +145,7 @@ pub use self::alphanum_only::AlphaNumOnlyFilter;
 pub use self::facet_tokenizer::FacetTokenizer;
 pub use self::japanese_tokenizer::JapaneseTokenizer;
 pub use self::lower_caser::LowerCaser;
+pub use self::ngram_tokenizer::NgramTokenizer;
 pub use self::raw_tokenizer::RawTokenizer;
 pub use self::remove_long::RemoveLongFilter;
 pub use self::simple_tokenizer::SimpleTokenizer;
@@ -153,8 +155,32 @@ pub use self::tokenizer::BoxedTokenizer;
 pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
 pub use self::tokenizer_manager::TokenizerManager;
+
+/// This is a function that can be used in tests and doc tests
+/// to assert a token's correctness.
+/// TODO: can this be wrapped in #[cfg(test)] so as not to be in the
+/// public api?
+pub fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
+    assert_eq!(
+        token.position, position,
+        "expected position {} but {:?}",
+        position, token
+    );
+    assert_eq!(token.text, text, "expected text {} but {:?}", text, token);
+    assert_eq!(
+        token.offset_from, from,
+        "expected offset_from {} but {:?}",
+        from, token
+    );
+    assert_eq!(
+        token.offset_to, to,
+        "expected offset_to {} but {:?}",
+        to, token
+    );
+}
 
 #[cfg(test)]
-mod test {
+pub mod test {
+    use super::assert_token;
     use super::Token;
     use super::TokenizerManager;
 
@@ -162,17 +188,17 @@ mod test {
     fn test_raw_tokenizer() {
         let tokenizer_manager = TokenizerManager::default();
         let en_tokenizer = tokenizer_manager.get("raw").unwrap();
-        let mut tokens: Vec<String> = vec![];
+        let mut tokens: Vec<Token> = vec![];
         {
             let mut add_token = |token: &Token| {
-                tokens.push(token.text.clone());
+                tokens.push(token.clone());
             };
             en_tokenizer
                 .token_stream("Hello, happy tax payer!")
                 .process(&mut add_token);
         }
         assert_eq!(tokens.len(), 1);
-        assert_eq!(&tokens[0], "Hello, happy tax payer!");
+        assert_token(&tokens[0], 0, "Hello, happy tax payer!", 0, 23);
     }
 
     #[test]
@@ -180,20 +206,20 @@ mod test {
         let tokenizer_manager = TokenizerManager::default();
         assert!(tokenizer_manager.get("en_doesnotexist").is_none());
         let en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
-        let mut tokens: Vec<String> = vec![];
+        let mut tokens: Vec<Token> = vec![];
         {
             let mut add_token = |token: &Token| {
-                tokens.push(token.text.clone());
+                tokens.push(token.clone());
             };
             en_tokenizer
                 .token_stream("Hello, happy tax payer!")
                 .process(&mut add_token);
         }
         assert_eq!(tokens.len(), 4);
-        assert_eq!(&tokens[0], "hello");
-        assert_eq!(&tokens[1], "happi");
-        assert_eq!(&tokens[2], "tax");
-        assert_eq!(&tokens[3], "payer");
+        assert_token(&tokens[0], 0, "hello", 0, 5);
+        assert_token(&tokens[1], 1, "happi", 7, 12);
+        assert_token(&tokens[2], 2, "tax", 13, 16);
+        assert_token(&tokens[3], 3, "payer", 17, 22);
     }
 
     #[test]
@@ -201,21 +227,87 @@ mod test {
         let tokenizer_manager = TokenizerManager::default();
         let en_tokenizer = tokenizer_manager.get("ja").unwrap();
 
-        let mut tokens: Vec<String> = vec![];
+        let mut tokens: Vec<Token> = vec![];
         {
             let mut add_token = |token: &Token| {
-                tokens.push(token.text.clone());
+                tokens.push(token.clone());
             };
             en_tokenizer
                 .token_stream("野菜食べないとやばい!")
                 .process(&mut add_token);
         }
         assert_eq!(tokens.len(), 5);
-        assert_eq!(&tokens[0], "野菜");
-        assert_eq!(&tokens[1], "食べ");
-        assert_eq!(&tokens[2], "ない");
-        assert_eq!(&tokens[3], "と");
-        assert_eq!(&tokens[4], "やばい");
+        assert_token(&tokens[0], 0, "野菜", 0, 6);
+        assert_token(&tokens[1], 1, "食べ", 6, 12);
+        assert_token(&tokens[2], 2, "ない", 12, 18);
+        assert_token(&tokens[3], 3, "と", 18, 21);
+        assert_token(&tokens[4], 4, "やばい", 21, 30);
+    }
+
+    #[test]
+    fn test_ngram_tokenizer() {
+        use super::{LowerCaser, NgramTokenizer};
+        use tokenizer::tokenizer::TokenStream;
+        use tokenizer::tokenizer::Tokenizer;
+
+        let tokenizer_manager = TokenizerManager::default();
+        tokenizer_manager.register("ngram12", NgramTokenizer::new(1, 2, false));
+        tokenizer_manager.register(
+            "ngram3",
+            NgramTokenizer::new(3, 3, false).filter(LowerCaser),
+        );
+        tokenizer_manager.register(
+            "edgegram5",
+            NgramTokenizer::new(2, 5, true).filter(LowerCaser),
+        );
+
+        let tokenizer = NgramTokenizer::new(1, 2, false);
+        let mut tokens: Vec<Token> = vec![];
+        {
+            let mut add_token = |token: &Token| {
+                tokens.push(token.clone());
+            };
+            tokenizer.token_stream("hello").process(&mut add_token);
+        }
+        assert_eq!(tokens.len(), 9);
+        assert_token(&tokens[0], 0, "h", 0, 1);
+        assert_token(&tokens[1], 0, "he", 0, 2);
+        assert_token(&tokens[2], 1, "e", 1, 2);
+        assert_token(&tokens[3], 1, "el", 1, 3);
+        assert_token(&tokens[4], 2, "l", 2, 3);
+        assert_token(&tokens[5], 2, "ll", 2, 4);
+        assert_token(&tokens[6], 3, "l", 3, 4);
+        assert_token(&tokens[7], 3, "lo", 3, 5);
+        assert_token(&tokens[8], 4, "o", 4, 5);
+
+        let tokenizer = tokenizer_manager.get("ngram3").unwrap();
+        let mut tokens: Vec<Token> = vec![];
+        {
+            let mut add_token = |token: &Token| {
+                tokens.push(token.clone());
+            };
+            tokenizer.token_stream("Hello").process(&mut add_token);
+        }
+        assert_eq!(tokens.len(), 3);
+        assert_token(&tokens[0], 0, "hel", 0, 3);
+        assert_token(&tokens[1], 1, "ell", 1, 4);
+        assert_token(&tokens[2], 2, "llo", 2, 5);
+
+        let tokenizer = tokenizer_manager.get("edgegram5").unwrap();
+        let mut tokens: Vec<Token> = vec![];
+        {
+            let mut add_token = |token: &Token| {
+                tokens.push(token.clone());
+            };
+            tokenizer
+                .token_stream("Frankenstein")
+                .process(&mut add_token);
+        }
+        assert_eq!(tokens.len(), 4);
+        assert_token(&tokens[0], 0, "fr", 0, 2);
+        assert_token(&tokens[1], 0, "fra", 0, 3);
+        assert_token(&tokens[2], 0, "fran", 0, 4);
+        assert_token(&tokens[3], 0, "frank", 0, 5);
     }
 
     #[test]
@@ -223,20 +315,20 @@ mod test {
         let tokenizer_manager = TokenizerManager::default();
         let en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
         {
-            let mut tokens: Vec<String> = vec![];
+            let mut tokens: Vec<Token> = vec![];
             {
                 let mut add_token = |token: &Token| {
-                    tokens.push(token.text.clone());
+                    tokens.push(token.clone());
                 };
                 en_tokenizer.token_stream(" ").process(&mut add_token);
             }
             assert!(tokens.is_empty());
         }
         {
-            let mut tokens: Vec<String> = vec![];
+            let mut tokens: Vec<Token> = vec![];
             {
                 let mut add_token = |token: &Token| {
-                    tokens.push(token.text.clone());
+                    tokens.push(token.clone());
                 };
                 en_tokenizer.token_stream(" ").process(&mut add_token);
             }
src/tokenizer/ngram_tokenizer.rs (new file, 157 lines)
@@ -0,0 +1,157 @@
use super::{Token, TokenStream, Tokenizer};

/// Tokenize the text by splitting words into n-grams of the given size(s).
///
/// With this tokenizer, the `position` field expresses the starting offset
/// of the ngram rather than a token index.
///
/// Example 1: `hello` would be tokenized as (min_gram: 2, max_gram: 3, prefix_only: false)
///
/// | Term     | he  | hel | el  | ell | ll  | llo | lo  |
/// |----------|-----|-----|-----|-----|-----|-----|-----|
/// | Position | 0   | 0   | 1   | 1   | 2   | 2   | 3   |
/// | Offsets  | 0,2 | 0,3 | 1,3 | 1,4 | 2,4 | 2,5 | 3,5 |
///
/// Example 2: `hello` would be tokenized as (min_gram: 2, max_gram: 5, prefix_only: **true**)
///
/// | Term     | he  | hel | hell | hello |
/// |----------|-----|-----|------|-------|
/// | Position | 0   | 0   | 0    | 0     |
/// | Offsets  | 0,2 | 0,3 | 0,4  | 0,5   |
///
/// # Example
///
/// ```
/// extern crate tantivy;
/// use tantivy::tokenizer::*;
/// use tantivy::tokenizer::assert_token;
///
/// # fn main() {
/// let tokenizer = NgramTokenizer::new(2, 3, false);
/// let mut stream = tokenizer.token_stream("hello");
///
/// assert_token(stream.next().unwrap(), 0, "he", 0, 2);
/// assert_token(stream.next().unwrap(), 0, "hel", 0, 3);
/// assert_token(stream.next().unwrap(), 1, "el", 1, 3);
/// assert_token(stream.next().unwrap(), 1, "ell", 1, 4);
/// assert_token(stream.next().unwrap(), 2, "ll", 2, 4);
/// assert_token(stream.next().unwrap(), 2, "llo", 2, 5);
/// assert_token(stream.next().unwrap(), 3, "lo", 3, 5);
/// assert!(stream.next().is_none());
/// # }
/// ```
#[derive(Clone)]
pub struct NgramTokenizer {
    /// min size of the n-gram
    min_gram: usize,
    /// max size of the n-gram
    max_gram: usize,
    /// if true, will only parse the leading edge of the input
    prefix_only: bool,
}

impl NgramTokenizer {
    /// Configures a new Ngram tokenizer
    pub fn new(min_gram: usize, max_gram: usize, prefix_only: bool) -> NgramTokenizer {
        assert!(min_gram > 0, "min_gram must be greater than 0");
        assert!(
            min_gram <= max_gram,
            "min_gram must not be greater than max_gram"
        );

        NgramTokenizer {
            min_gram,
            max_gram,
            prefix_only,
        }
    }
}

pub struct NgramTokenStream<'a> {
    text: &'a str,
    location: usize,
    text_length: usize,
    token: Token,
    min_gram: usize,
    max_gram: usize,
    gram_size: usize,
    prefix_only: bool,
}

impl<'a> Tokenizer<'a> for NgramTokenizer {
    type TokenStreamImpl = NgramTokenStream<'a>;

    fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {
        NgramTokenStream {
            text,
            location: 0,
            text_length: text.len(),
            token: Token::default(),
            min_gram: self.min_gram,
            max_gram: self.max_gram,
            prefix_only: self.prefix_only,
            gram_size: self.min_gram,
        }
    }
}

impl<'a> NgramTokenStream<'a> {
    /// Get the (location, gram_size) pair for the next token,
    /// cycling gram_size through min_gram..=max_gram at each location,
    /// and returning None if processing should stop.
    fn chomp(&mut self) -> Option<(usize, usize)> {
        // Have we exceeded the bounds of the text we are indexing?
        if self.gram_size > self.max_gram {
            if self.prefix_only {
                return None;
            }

            // since we aren't just processing edges
            // we need to reset the gram size
            self.gram_size = self.min_gram;

            // and move down the chain of letters
            self.location += 1;
        }

        let result = if (self.location + self.gram_size) <= self.text_length {
            Some((self.location, self.gram_size))
        } else {
            None
        };

        // increase the gram size for the next pass
        self.gram_size += 1;

        result
    }
}

impl<'a> TokenStream for NgramTokenStream<'a> {
    fn advance(&mut self) -> bool {
        // clear out the working token text
        self.token.text.clear();

        if let Some((position, size)) = self.chomp() {
            self.token.position = position;
            let offset_from = position;
            let offset_to = offset_from + size;

            self.token.offset_from = offset_from;
            self.token.offset_to = offset_to;

            self.token.text.push_str(&self.text[offset_from..offset_to]);

            true
        } else {
            false
        }
    }

    fn token(&self) -> &Token {
        &self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        &mut self.token
    }
}
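The chomp method above is a small state machine over (location, gram_size) pairs: it grows the gram size at a fixed location until max_gram is passed, then (unless prefix_only) slides the location forward and resets, and it ends the stream at the first window that no longer fits. A standalone rendering of that control flow, assuming byte offsets on ASCII input (the ngrams helper is mine, not a tantivy API):

// Illustration only: mirrors NgramTokenStream::chomp's enumeration order.
fn ngrams(text: &str, min_gram: usize, max_gram: usize, prefix_only: bool) -> Vec<(usize, String)> {
    let mut out = Vec::new();
    let (mut location, mut gram_size) = (0usize, min_gram);
    loop {
        // Past max_gram: either stop (prefix_only) or restart at the next location.
        if gram_size > max_gram {
            if prefix_only {
                break;
            }
            gram_size = min_gram;
            location += 1;
        }
        // The stream ends at the first window that no longer fits.
        if location + gram_size > text.len() {
            break;
        }
        out.push((location, text[location..location + gram_size].to_string()));
        gram_size += 1;
    }
    out
}

fn main() {
    // Matches the expectations of the new test_ngram_tokenizer test:
    // "hello" with (1, 2, false) yields 9 grams, and "Frankenstein" with
    // (2, 5, true) yields the 4 leading edge grams (which the registered
    // "edgegram5" tokenizer additionally lowercases through LowerCaser).
    assert_eq!(ngrams("hello", 1, 2, false).len(), 9);
    let edge: Vec<String> = ngrams("Frankenstein", 2, 5, true)
        .into_iter()
        .map(|(_, gram)| gram)
        .collect();
    assert_eq!(edge, vec!["Fr", "Fra", "Fran", "Frank"]);
    println!("{:?}", ngrams("hello", 2, 3, false));
}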
src/tokenizer/tokenizer.rs
@@ -4,6 +4,7 @@ use std::borrow::{Borrow, BorrowMut};
 use tokenizer::TokenStreamChain;
 
 /// Token
+#[derive(Debug, Clone)]
 pub struct Token {
     /// Offset (byte index) of the first character of the token.
     /// Offsets shall not be modified by token filters.
@@ -260,3 +261,24 @@ pub trait TokenFilter<TailTokenStream: TokenStream>: Clone {
     /// Wraps a token stream and returns the modified one.
     fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream;
 }
+
+#[cfg(test)]
+mod test {
+    use super::Token;
+
+    #[test]
+    fn clone() {
+        let t1 = Token {
+            position: 1,
+            offset_from: 2,
+            offset_to: 3,
+            text: "abc".to_string(),
+        };
+        let t2 = t1.clone();
+
+        assert_eq!(t1.position, t2.position);
+        assert_eq!(t1.offset_from, t2.offset_from);
+        assert_eq!(t1.offset_to, t2.offset_to);
+        assert_eq!(t1.text, t2.text);
+    }
+}
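Why the new Clone derive matters: token streams only lend each Token by reference while process runs, so tests that want to assert on positions and offsets afterwards need owned copies, hence tokens.push(token.clone()) throughout the reworked tests. A self-contained sketch of that pattern with stand-in types (MiniToken and emit are illustrative, not tantivy's API):

// Stand-in for a Token-like struct; Clone is what makes the pattern below possible.
#[derive(Debug, Clone, Default, PartialEq)]
struct MiniToken {
    position: usize,
    offset_from: usize,
    offset_to: usize,
    text: String,
}

// Stand-in for a token stream driving a callback with borrowed tokens.
fn emit<F: FnMut(&MiniToken)>(mut sink: F) {
    let tok = MiniToken { position: 0, offset_from: 0, offset_to: 3, text: "hel".into() };
    sink(&tok);
}

fn main() {
    let mut tokens: Vec<MiniToken> = vec![];
    emit(|t| tokens.push(t.clone())); // owned copies outlive the callback
    assert_eq!(tokens[0].text, "hel");
    println!("{:?}", tokens);
}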