diff --git a/Cargo.toml b/Cargo.toml
index 13f5a2c7c..48c2795f7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -67,3 +67,6 @@ travis-ci = { repository = "tantivy-search/tantivy" }
 [[example]]
 name = "simple_search"
 required-features = ["mmap"]
+
+[[example]]
+name = "custom_tokenizer"
diff --git a/examples/custom_tokenizer.rs b/examples/custom_tokenizer.rs
new file mode 100644
index 000000000..4f498df05
--- /dev/null
+++ b/examples/custom_tokenizer.rs
@@ -0,0 +1,226 @@
+extern crate tantivy;
+extern crate tempdir;
+
+#[macro_use]
+extern crate serde_json;
+
+use std::path::Path;
+use tantivy::collector::TopCollector;
+use tantivy::query::QueryParser;
+use tantivy::schema::*;
+use tantivy::tokenizer::NgramTokenizer;
+use tantivy::Index;
+use tempdir::TempDir;
+
+fn main() {
+    // Let's create a temporary directory for the
+    // sake of this example
+    if let Ok(dir) = TempDir::new("tantivy_token_example_dir") {
+        run_example(dir.path()).unwrap();
+        dir.close().unwrap();
+    }
+}
+
+fn run_example(index_path: &Path) -> tantivy::Result<()> {
+    // # Defining the schema
+    //
+    // The Tantivy index requires a very strict schema.
+    // The schema declares which fields are in the index,
+    // and for each field, its type and "the way it should
+    // be indexed".
+
+    // first we need to define a schema ...
+    let mut schema_builder = SchemaBuilder::default();
+
+    // Our first field is title.
+    // In this example we want to use ngram searching.
+    // We will set the gram size to 3 characters, so any
+    // three-character sequence in the title will be findable.
+    let text_field_indexing = TextFieldIndexing::default()
+        .set_tokenizer("ngram3")
+        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
+    let text_options = TextOptions::default()
+        .set_indexing_options(text_field_indexing)
+        .set_stored();
+    schema_builder.add_text_field("title", text_options);
+
+    // Our second field is body.
+    // We want full-text search for it, but we do not
+    // need to be able to retrieve it
+    // for our application.
+    //
+    // We can make our index lighter
+    // by omitting the `STORED` flag.
+    schema_builder.add_text_field("body", TEXT);
+
+    let schema = schema_builder.build();
+
+    // # Indexing documents
+    //
+    // Let's create a brand new index.
+    //
+    // This will actually just save a meta.json
+    // with our schema in the directory.
+    let index = Index::create(index_path, schema.clone())?;
+
+    // Here we register our custom tokenizer.
+    // It will emit tokens of 3 characters each.
+    index
+        .tokenizers()
+        .register("ngram3", NgramTokenizer::new(3, 3, false));
+
+    // To insert documents we need an index writer.
+    // There must be only one writer at a time.
+    // This single `IndexWriter` is already
+    // multithreaded.
+    //
+    // Here we use a buffer of 50MB per thread. Using a bigger
+    // heap for the indexer can increase its throughput.
+    let mut index_writer = index.writer(50_000_000)?;
+
+    // Let's index our documents!
+    // We first need a handle on the title and the body fields.
+
+    // ### Create a document "manually".
+    //
+    // We can create a document manually, by setting the fields
+    // one by one in a Document object.
+    let title = schema.get_field("title").unwrap();
+    let body = schema.get_field("body").unwrap();
+
+    let mut old_man_doc = Document::default();
+    old_man_doc.add_text(title, "The Old Man and the Sea");
+    old_man_doc.add_text(
+        body,
+        "He was an old man who fished alone in a skiff in the Gulf Stream and \
+         he had gone eighty-four days now without taking a fish.",
+    );
+
+    // ... and add it to the `IndexWriter`.
+    index_writer.add_document(old_man_doc);
+
+    // ### Create a document directly from json.
+    //
+    // Alternatively, we can use our schema to parse a
+    // document object directly from json.
+    // The document is a string, but we use the `json` macro
+    // from `serde_json` for the convenience of multi-line support.
+    let json = json!({
+        "title": "Of Mice and Men",
+        "body": "A few miles south of Soledad, the Salinas River drops in close to the hillside \
+        bank and runs deep and green. The water is warm too, for it has slipped twinkling \
+        over the yellow sands in the sunlight before reaching the narrow pool. On one \
+        side of the river the golden foothill slopes curve up to the strong and rocky \
+        Gabilan Mountains, but on the valley side the water is lined with trees—willows \
+        fresh and green with every spring, carrying in their lower leaf junctures the \
+        debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
+        limbs and branches that arch over the pool"
+    });
+    let mice_and_men_doc = schema.parse_document(&json.to_string())?;
+
+    index_writer.add_document(mice_and_men_doc);
+
+    // Multi-valued fields are allowed; they are
+    // expressed in JSON by an array.
+    // The following document has two titles.
+    let json = json!({
+        "title": ["Frankenstein", "The Modern Prometheus"],
+        "body": "You will rejoice to hear that no disaster has accompanied the commencement of an \
+        enterprise which you have regarded with such evil forebodings. I arrived here \
+        yesterday, and my first task is to assure my dear sister of my welfare and \
+        increasing confidence in the success of my undertaking."
+    });
+    let frankenstein_doc = schema.parse_document(&json.to_string())?;
+
+    index_writer.add_document(frankenstein_doc);
+
+    // This is an example, so we will only index 3 documents
+    // here. You can check out tantivy's tutorial to index
+    // the English Wikipedia. Tantivy's indexing is rather fast.
+    // Indexing 5 million articles of the English Wikipedia takes
+    // around 4 minutes on my computer!
+
+    // ### Committing
+    //
+    // At this point our documents are not searchable.
+    //
+    // We need to call .commit() explicitly to force the
+    // index_writer to finish processing the documents in the queue,
+    // flush the current index to the disk, and advertise
+    // the existence of new documents.
+    //
+    // This call is blocking.
+    index_writer.commit()?;
+
+    // If `.commit()` returns correctly, then all of the
+    // documents that have been added are guaranteed to be
+    // persistently indexed.
+    //
+    // In the scenario of a crash or a power failure,
+    // tantivy behaves as if it had rolled back to its last
+    // commit.
+
+    // # Searching
+    //
+    // Let's search our index. Start by reloading
+    // the searchers in the index. This should be done
+    // after every commit().
+    index.load_searchers()?;
+
+    // Afterwards create one (or more) searchers.
+    //
+    // You should create a searcher
+    // every time you start a "search query".
+    let searcher = index.searcher();
+
+    // The query parser can interpret human queries.
+    // Here, if the user does not specify which
+    // field they want to search, tantivy will search
+    // in both title and body.
+    let query_parser = QueryParser::for_index(&index, vec![title, body]);
+
+    // Here we want to get a hit on the 'ken' in Frankenstein.
+    let query = query_parser.parse_query("ken")?;
+
+    // A query defines a set of documents, as
+    // well as the way they should be scored.
+ // + // A query created by the query parser is scored according + // to a metric called Tf-Idf, and will consider + // any document matching at least one of our terms. + + // ### Collectors + // + // We are not interested in all of the documents but + // only in the top 10. Keeping track of our top 10 best documents + // is the role of the TopCollector. + let mut top_collector = TopCollector::with_limit(10); + + // We can now perform our query. + searcher.search(&*query, &mut top_collector)?; + + // Our top collector now contains the 10 + // most relevant doc ids... + let doc_addresses = top_collector.docs(); + + // The actual documents still need to be + // retrieved from Tantivy's store. + // + // Since the body field was not configured as stored, + // the document returned will only contain + // a title. + + for doc_address in doc_addresses { + let retrieved_doc = searcher.doc(&doc_address)?; + println!("{}", schema.to_json(&retrieved_doc)); + } + + // Wait for indexing and merging threads to shut down. + // Usually this isn't needed, but in `main` we try to + // delete the temporary directory and that fails on + // Windows if the files are still open. + index_writer.wait_merging_threads()?; + + Ok(()) +} diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 70bf35ab7..9cfb437bd 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -132,6 +132,7 @@ mod alphanum_only; mod facet_tokenizer; mod japanese_tokenizer; mod lower_caser; +mod ngram_tokenizer; mod raw_tokenizer; mod remove_long; mod simple_tokenizer; @@ -144,6 +145,7 @@ pub use self::alphanum_only::AlphaNumOnlyFilter; pub use self::facet_tokenizer::FacetTokenizer; pub use self::japanese_tokenizer::JapaneseTokenizer; pub use self::lower_caser::LowerCaser; +pub use self::ngram_tokenizer::NgramTokenizer; pub use self::raw_tokenizer::RawTokenizer; pub use self::remove_long::RemoveLongFilter; pub use self::simple_tokenizer::SimpleTokenizer; @@ -153,8 +155,32 @@ pub use self::tokenizer::BoxedTokenizer; pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer}; pub use self::tokenizer_manager::TokenizerManager; +/// This is a function that can be used in tests and doc tests +/// to assert a token's correctness. +/// TODO: can this be wrapped in #[cfg(test)] so as not to be in the +/// public api? 
+pub fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
+    assert_eq!(
+        token.position, position,
+        "expected position {} but {:?}",
+        position, token
+    );
+    assert_eq!(token.text, text, "expected text {} but {:?}", text, token);
+    assert_eq!(
+        token.offset_from, from,
+        "expected offset_from {} but {:?}",
+        from, token
+    );
+    assert_eq!(
+        token.offset_to, to,
+        "expected offset_to {} but {:?}",
+        to, token
+    );
+}
+
 #[cfg(test)]
-mod test {
+pub mod test {
+    use super::assert_token;
     use super::Token;
     use super::TokenizerManager;
@@ -162,17 +188,17 @@ mod test {
     fn test_raw_tokenizer() {
         let tokenizer_manager = TokenizerManager::default();
         let en_tokenizer = tokenizer_manager.get("raw").unwrap();
-        let mut tokens: Vec<String> = vec![];
+        let mut tokens: Vec<Token> = vec![];
         {
             let mut add_token = |token: &Token| {
-                tokens.push(token.text.clone());
+                tokens.push(token.clone());
             };
             en_tokenizer
                 .token_stream("Hello, happy tax payer!")
                 .process(&mut add_token);
         }
         assert_eq!(tokens.len(), 1);
-        assert_eq!(&tokens[0], "Hello, happy tax payer!");
+        assert_token(&tokens[0], 0, "Hello, happy tax payer!", 0, 23);
     }
 
     #[test]
@@ -180,20 +206,20 @@ mod test {
         let tokenizer_manager = TokenizerManager::default();
         assert!(tokenizer_manager.get("en_doesnotexist").is_none());
         let en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
-        let mut tokens: Vec<String> = vec![];
+        let mut tokens: Vec<Token> = vec![];
         {
             let mut add_token = |token: &Token| {
-                tokens.push(token.text.clone());
+                tokens.push(token.clone());
             };
             en_tokenizer
                 .token_stream("Hello, happy tax payer!")
                 .process(&mut add_token);
         }
         assert_eq!(tokens.len(), 4);
-        assert_eq!(&tokens[0], "hello");
-        assert_eq!(&tokens[1], "happi");
-        assert_eq!(&tokens[2], "tax");
-        assert_eq!(&tokens[3], "payer");
+        assert_token(&tokens[0], 0, "hello", 0, 5);
+        assert_token(&tokens[1], 1, "happi", 7, 12);
+        assert_token(&tokens[2], 2, "tax", 13, 16);
+        assert_token(&tokens[3], 3, "payer", 17, 22);
     }
 
     #[test]
@@ -201,21 +227,87 @@ mod test {
         let tokenizer_manager = TokenizerManager::default();
         let en_tokenizer = tokenizer_manager.get("ja").unwrap();
-        let mut tokens: Vec<String> = vec![];
+        let mut tokens: Vec<Token> = vec![];
         {
             let mut add_token = |token: &Token| {
-                tokens.push(token.text.clone());
+                tokens.push(token.clone());
             };
             en_tokenizer
                 .token_stream("野菜食べないとやばい!")
                 .process(&mut add_token);
         }
         assert_eq!(tokens.len(), 5);
-        assert_eq!(&tokens[0], "野菜");
-        assert_eq!(&tokens[1], "食べ");
-        assert_eq!(&tokens[2], "ない");
-        assert_eq!(&tokens[3], "と");
-        assert_eq!(&tokens[4], "やばい");
+        assert_token(&tokens[0], 0, "野菜", 0, 6);
+        assert_token(&tokens[1], 1, "食べ", 6, 12);
+        assert_token(&tokens[2], 2, "ない", 12, 18);
+        assert_token(&tokens[3], 3, "と", 18, 21);
+        assert_token(&tokens[4], 4, "やばい", 21, 30);
+    }
+
+    #[test]
+    fn test_ngram_tokenizer() {
+        use super::{LowerCaser, NgramTokenizer};
+        use tokenizer::tokenizer::TokenStream;
+        use tokenizer::tokenizer::Tokenizer;
+
+        let tokenizer_manager = TokenizerManager::default();
+        tokenizer_manager.register("ngram12", NgramTokenizer::new(1, 2, false));
+        tokenizer_manager.register(
+            "ngram3",
+            NgramTokenizer::new(3, 3, false).filter(LowerCaser),
+        );
+        tokenizer_manager.register(
+            "edgegram5",
+            NgramTokenizer::new(2, 5, true).filter(LowerCaser),
+        );
+
+        let tokenizer = NgramTokenizer::new(1, 2, false);
+        let mut tokens: Vec<Token> = vec![];
+        {
+            let mut add_token = |token: &Token| {
+                tokens.push(token.clone());
+            };
+            tokenizer.token_stream("hello").process(&mut add_token);
+        }
+        assert_eq!(tokens.len(), 9);
+        assert_token(&tokens[0], 0, "h", 0, 1);
+        assert_token(&tokens[1], 0, "he", 0, 2);
+        assert_token(&tokens[2], 1, "e", 1, 2);
+        assert_token(&tokens[3], 1, "el", 1, 3);
+        assert_token(&tokens[4], 2, "l", 2, 3);
+        assert_token(&tokens[5], 2, "ll", 2, 4);
+        assert_token(&tokens[6], 3, "l", 3, 4);
+        assert_token(&tokens[7], 3, "lo", 3, 5);
+        assert_token(&tokens[8], 4, "o", 4, 5);
+
+        let tokenizer = tokenizer_manager.get("ngram3").unwrap();
+        let mut tokens: Vec<Token> = vec![];
+        {
+            let mut add_token = |token: &Token| {
+                tokens.push(token.clone());
+            };
+            tokenizer.token_stream("Hello").process(&mut add_token);
+        }
+        assert_eq!(tokens.len(), 3);
+        assert_token(&tokens[0], 0, "hel", 0, 3);
+        assert_token(&tokens[1], 1, "ell", 1, 4);
+        assert_token(&tokens[2], 2, "llo", 2, 5);
+
+        let tokenizer = tokenizer_manager.get("edgegram5").unwrap();
+        let mut tokens: Vec<Token> = vec![];
+        {
+            let mut add_token = |token: &Token| {
+                tokens.push(token.clone());
+            };
+            tokenizer
+                .token_stream("Frankenstein")
+                .process(&mut add_token);
+        }
+        assert_eq!(tokens.len(), 4);
+        assert_token(&tokens[0], 0, "fr", 0, 2);
+        assert_token(&tokens[1], 0, "fra", 0, 3);
+        assert_token(&tokens[2], 0, "fran", 0, 4);
+        assert_token(&tokens[3], 0, "frank", 0, 5);
     }
 
     #[test]
@@ -223,20 +315,20 @@ mod test {
         let tokenizer_manager = TokenizerManager::default();
         let en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
         {
-            let mut tokens: Vec<String> = vec![];
+            let mut tokens: Vec<Token> = vec![];
             {
                 let mut add_token = |token: &Token| {
-                    tokens.push(token.text.clone());
+                    tokens.push(token.clone());
                 };
                 en_tokenizer.token_stream(" ").process(&mut add_token);
             }
             assert!(tokens.is_empty());
         }
         {
-            let mut tokens: Vec<String> = vec![];
+            let mut tokens: Vec<Token> = vec![];
             {
                 let mut add_token = |token: &Token| {
-                    tokens.push(token.text.clone());
+                    tokens.push(token.clone());
                 };
                 en_tokenizer.token_stream(" ").process(&mut add_token);
             }
diff --git a/src/tokenizer/ngram_tokenizer.rs b/src/tokenizer/ngram_tokenizer.rs
new file mode 100644
index 000000000..f19fef3e8
--- /dev/null
+++ b/src/tokenizer/ngram_tokenizer.rs
@@ -0,0 +1,157 @@
+use super::{Token, TokenStream, Tokenizer};
+
+/// Tokenize the text by splitting words into n-grams of the given size(s)
+///
+/// With this tokenizer, the `position` field holds the byte offset at which
+/// the ngram starts, rather than a token index.
+///
+/// Example 1: `hello` tokenized with (min_gram: 2, max_gram: 3, prefix_only: false)
+///
+/// | Term     | he  | hel | el  | ell | ll  | llo | lo |
+/// |----------|-----|-----|-----|-----|-----|-----|----|
+/// | Position | 0   | 0   | 1   | 1   | 2   | 2   | 3  |
+/// | Offsets  | 0,2 | 0,3 | 1,3 | 1,4 | 2,4 | 2,5 | 3,5|
+///
+/// Example 2: `hello` tokenized with (min_gram: 2, max_gram: 5, prefix_only: **true**)
+///
+/// | Term     | he  | hel | hell | hello |
+/// |----------|-----|-----|------|-------|
+/// | Position | 0   | 0   | 0    | 0     |
+/// | Offsets  | 0,2 | 0,3 | 0,4  | 0,5   |
+///
+/// # Example
+///
+/// ```
+/// extern crate tantivy;
+/// use tantivy::tokenizer::*;
+/// use tantivy::tokenizer::assert_token;
+///
+/// # fn main() {
+/// let tokenizer = NgramTokenizer::new(2, 3, false);
+/// let mut stream = tokenizer.token_stream("hello");
+///
+/// assert_token(stream.next().unwrap(), 0, "he", 0, 2);
+/// assert_token(stream.next().unwrap(), 0, "hel", 0, 3);
+/// assert_token(stream.next().unwrap(), 1, "el", 1, 3);
+/// assert_token(stream.next().unwrap(), 1, "ell", 1, 4);
+/// assert_token(stream.next().unwrap(), 2, "ll", 2, 4);
+/// assert_token(stream.next().unwrap(), 2, "llo", 2, 5);
+/// assert_token(stream.next().unwrap(), 3, "lo", 3, 5);
+/// assert!(stream.next().is_none());
+/// # }
+/// ```
+#[derive(Clone)]
+pub struct NgramTokenizer {
+    /// min size of the n-gram
+    min_gram: usize,
+    /// max size of the n-gram
+    max_gram: usize,
+    /// if true, will only parse the leading edge of the input
+    prefix_only: bool,
+}
+
+impl NgramTokenizer {
+    /// Configures a new Ngram tokenizer
+    pub fn new(min_gram: usize, max_gram: usize, prefix_only: bool) -> NgramTokenizer {
+        assert!(min_gram > 0, "min_gram must be greater than 0");
+        assert!(
+            min_gram <= max_gram,
+            "min_gram must not be greater than max_gram"
+        );
+
+        NgramTokenizer {
+            min_gram,
+            max_gram,
+            prefix_only,
+        }
+    }
+}
+pub struct NgramTokenStream<'a> {
+    text: &'a str,
+    location: usize,
+    text_length: usize,
+    token: Token,
+    min_gram: usize,
+    max_gram: usize,
+    gram_size: usize,
+    prefix_only: bool,
+}
+
+impl<'a> Tokenizer<'a> for NgramTokenizer {
+    type TokenStreamImpl = NgramTokenStream<'a>;
+
+    fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {
+        NgramTokenStream {
+            text,
+            location: 0,
+            text_length: text.len(),
+            token: Token::default(),
+            min_gram: self.min_gram,
+            max_gram: self.max_gram,
+            prefix_only: self.prefix_only,
+            gram_size: self.min_gram,
+        }
+    }
+}
+
+impl<'a> NgramTokenStream<'a> {
+    /// Returns the `(location, gram_size)` pair describing the next ngram,
+    /// cycling the gram size from `min_gram` to `max_gram` before advancing
+    /// to the next starting location; returns `None` when processing should stop.
+    fn chomp(&mut self) -> Option<(usize, usize)> {
+        // Have we produced every gram size for the current starting location?
+ if self.gram_size > self.max_gram { + if self.prefix_only { + return None; + } + + // since we aren't just processing edges + // we need to reset the gram size + self.gram_size = self.min_gram; + + // and move down the chain of letters + self.location += 1; + } + + let result = if (self.location + self.gram_size) <= self.text_length { + Some((self.location, self.gram_size)) + } else { + None + }; + + // increase the gram size for the next pass + self.gram_size += 1; + + result + } +} + +impl<'a> TokenStream for NgramTokenStream<'a> { + fn advance(&mut self) -> bool { + // clear out working token text + self.token.text.clear(); + + if let Some((position, size)) = self.chomp() { + self.token.position = position; + let offset_from = position; + let offset_to = offset_from + size; + + self.token.offset_from = offset_from; + self.token.offset_to = offset_to; + + self.token.text.push_str(&self.text[offset_from..offset_to]); + + true + } else { + false + } + } + + fn token(&self) -> &Token { + &self.token + } + + fn token_mut(&mut self) -> &mut Token { + &mut self.token + } +} diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs index 104cd0e7e..d38ff7b69 100644 --- a/src/tokenizer/tokenizer.rs +++ b/src/tokenizer/tokenizer.rs @@ -4,6 +4,7 @@ use std::borrow::{Borrow, BorrowMut}; use tokenizer::TokenStreamChain; /// Token +#[derive(Debug, Clone)] pub struct Token { /// Offset (byte index) of the first character of the token. /// Offsets shall not be modified by token filters. @@ -260,3 +261,24 @@ pub trait TokenFilter: Clone { /// Wraps a token stream and returns the modified one. fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream; } + +#[cfg(test)] +mod test { + use super::Token; + + #[test] + fn clone() { + let t1 = Token { + position: 1, + offset_from: 2, + offset_to: 3, + text: "abc".to_string(), + }; + let t2 = t1.clone(); + + assert_eq!(t1.position, t2.position); + assert_eq!(t1.offset_from, t2.offset_from); + assert_eq!(t1.offset_to, t2.offset_to); + assert_eq!(t1.text, t2.text); + } +}