Simple Implementation of NGram Tokenizer (#278)
* Simple implementation of an NGram tokenizer. It does not yet support edges. It could probably be better in many "rusty" ways, but the test is passing, so I'll call this a good stopping point for the day.
* Remove Ngram from the manager: too many variations.
* Basic configuration model. Should the extensive tests exist here?
* Add a sample to provide end-to-end testing.
* Basic edge-gram support.
* Cleanup.
* Code feedback.
* More code review feedback processed.
Committed by: Paul Masurel
Parent: 68ee18e4e8
Commit: ca74c14647
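For orientation before the diff: the commit adds an NgramTokenizer, registers it under a name in the index's TokenizerManager, and points a text field's indexing options at that name. A minimal sketch condensed from the examples/custom_tokenizer.rs file added below (the build_ngram_index helper name is mine; the calls mirror that example and the tantivy APIs of this era):

extern crate tantivy;

use std::path::Path;
use tantivy::schema::*;
use tantivy::tokenizer::NgramTokenizer;
use tantivy::Index;

fn build_ngram_index(index_path: &Path) -> tantivy::Result<Index> {
    let mut schema_builder = SchemaBuilder::default();
    // Index the title with 3-character grams so any 3-character window is findable.
    let text_field_indexing = TextFieldIndexing::default()
        .set_tokenizer("ngram3")
        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
    schema_builder.add_text_field(
        "title",
        TextOptions::default()
            .set_indexing_options(text_field_indexing)
            .set_stored(),
    );
    let schema = schema_builder.build();

    let index = Index::create(index_path, schema)?;
    // Register the tokenizer under the name the schema refers to:
    // min_gram = 3, max_gram = 3, prefix_only = false.
    index
        .tokenizers()
        .register("ngram3", NgramTokenizer::new(3, 3, false));
    Ok(index)
}

With a (3, 3, false) configuration every 3-character window of the title becomes an indexed term, which is what lets the example later match the query "ken" against "Frankenstein".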
Cargo.toml
@@ -67,3 +67,6 @@ travis-ci = { repository = "tantivy-search/tantivy" }
 [[example]]
 name = "simple_search"
 required-features = ["mmap"]
+
+[[example]]
+name = "custom_tokenizer"
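Presumably the new target can then be run with "cargo run --example custom_tokenizer"; whether it also needs "--features mmap", as simple_search does through required-features, depends on Cargo.toml lines outside this hunk.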
examples/custom_tokenizer.rs (new file, 226 lines)
@@ -0,0 +1,226 @@
extern crate tantivy;
extern crate tempdir;

#[macro_use]
extern crate serde_json;

use std::path::Path;
use tantivy::collector::TopCollector;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::tokenizer::NgramTokenizer;
use tantivy::Index;
use tempdir::TempDir;

fn main() {
    // Let's create a temporary directory for the
    // sake of this example.
    if let Ok(dir) = TempDir::new("tantivy_token_example_dir") {
        run_example(dir.path()).unwrap();
        dir.close().unwrap();
    }
}

fn run_example(index_path: &Path) -> tantivy::Result<()> {
    // # Defining the schema
    //
    // The Tantivy index requires a very strict schema.
    // The schema declares which fields are in the index,
    // and for each field, its type and "the way it should
    // be indexed".

    // First we need to define a schema...
    let mut schema_builder = SchemaBuilder::default();

    // Our first field is title.
    // In this example we want to use NGram searching.
    // We will set the gram size to 3 characters, so any three
    // consecutive characters in the title should be findable.
    let text_field_indexing = TextFieldIndexing::default()
        .set_tokenizer("ngram3")
        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
    let text_options = TextOptions::default()
        .set_indexing_options(text_field_indexing)
        .set_stored();
    schema_builder.add_text_field("title", text_options);

    // Our second field is body.
    // We want full-text search for it, but we do not
    // need to be able to retrieve it
    // for our application.
    //
    // We can make our index lighter
    // by omitting the `STORED` flag.
    schema_builder.add_text_field("body", TEXT);

    let schema = schema_builder.build();

    // # Indexing documents
    //
    // Let's create a brand new index.
    //
    // This will actually just save a meta.json
    // with our schema in the directory.
    let index = Index::create(index_path, schema.clone())?;

    // Here we are registering our custom tokenizer.
    // It will emit tokens of 3 characters each.
    index
        .tokenizers()
        .register("ngram3", NgramTokenizer::new(3, 3, false));

    // To insert documents we need an index writer.
    // There must be only one writer at a time.
    // This single `IndexWriter` is already
    // multithreaded.
    //
    // Here we use a buffer of 50MB per thread. Using a bigger
    // heap for the indexer can increase its throughput.
    let mut index_writer = index.writer(50_000_000)?;

    // Let's index our documents!
    // We first need a handle on the title and the body field.

    // ### Create a document "manually".
    //
    // We can create a document manually, by setting the fields
    // one by one in a Document object.
    let title = schema.get_field("title").unwrap();
    let body = schema.get_field("body").unwrap();

    let mut old_man_doc = Document::default();
    old_man_doc.add_text(title, "The Old Man and the Sea");
    old_man_doc.add_text(
        body,
        "He was an old man who fished alone in a skiff in the Gulf Stream and \
         he had gone eighty-four days now without taking a fish.",
    );

    // ... and add it to the `IndexWriter`.
    index_writer.add_document(old_man_doc);

    // ### Create a document directly from JSON.
    //
    // Alternatively, we can use our schema to parse a
    // document object directly from JSON.
    // The document is a string, but we use the `json` macro
    // from `serde_json` for the convenience of multi-line support.
    let json = json!({
        "title": "Of Mice and Men",
        "body": "A few miles south of Soledad, the Salinas River drops in close to the hillside \
                 bank and runs deep and green. The water is warm too, for it has slipped twinkling \
                 over the yellow sands in the sunlight before reaching the narrow pool. On one \
                 side of the river the golden foothill slopes curve up to the strong and rocky \
                 Gabilan Mountains, but on the valley side the water is lined with trees—willows \
                 fresh and green with every spring, carrying in their lower leaf junctures the \
                 debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
                 limbs and branches that arch over the pool"
    });
    let mice_and_men_doc = schema.parse_document(&json.to_string())?;

    index_writer.add_document(mice_and_men_doc);

    // Multi-valued fields are allowed; they are
    // expressed in JSON by an array.
    // The following document has two titles.
    let json = json!({
        "title": ["Frankenstein", "The Modern Prometheus"],
        "body": "You will rejoice to hear that no disaster has accompanied the commencement of an \
                 enterprise which you have regarded with such evil forebodings. I arrived here \
                 yesterday, and my first task is to assure my dear sister of my welfare and \
                 increasing confidence in the success of my undertaking."
    });
    let frankenstein_doc = schema.parse_document(&json.to_string())?;

    index_writer.add_document(frankenstein_doc);

    // This is an example, so we will only index 3 documents
    // here. You can check out tantivy's tutorial to index
    // the English wikipedia. Tantivy's indexing is rather fast.
    // Indexing 5 million articles of the English wikipedia takes
    // around 4 minutes on my computer!

    // ### Committing
    //
    // At this point our documents are not searchable.
    //
    // We need to call .commit() explicitly to force the
    // index_writer to finish processing the documents in the queue,
    // flush the current index to the disk, and advertise
    // the existence of new documents.
    //
    // This call is blocking.
    index_writer.commit()?;

    // If `.commit()` returns correctly, then all of the
    // documents that have been added are guaranteed to be
    // persistently indexed.
    //
    // In the scenario of a crash or a power failure,
    // tantivy behaves as if it has rolled back to its last
    // commit.

    // # Searching
    //
    // Let's search our index. Start by reloading
    // searchers in the index. This should be done
    // after every commit().
    index.load_searchers()?;

    // Afterwards create one (or more) searchers.
    //
    // You should create a searcher
    // every time you start a "search query".
    let searcher = index.searcher();

    // The query parser can interpret human queries.
    // Here, if the user does not specify which
    // field they want to search, tantivy will search
    // in both title and body.
    let query_parser = QueryParser::for_index(&index, vec![title, body]);

    // Here we want to get a hit on the 'ken' in Frankenstein.
    let query = query_parser.parse_query("ken")?;

    // A query defines a set of documents, as
    // well as the way they should be scored.
    //
    // A query created by the query parser is scored according
    // to a metric called Tf-Idf, and will consider
    // any document matching at least one of our terms.

    // ### Collectors
    //
    // We are not interested in all of the documents but
    // only in the top 10. Keeping track of our top 10 best documents
    // is the role of the TopCollector.
    let mut top_collector = TopCollector::with_limit(10);

    // We can now perform our query.
    searcher.search(&*query, &mut top_collector)?;

    // Our top collector now contains the 10
    // most relevant doc ids...
    let doc_addresses = top_collector.docs();

    // The actual documents still need to be
    // retrieved from Tantivy's store.
    //
    // Since the body field was not configured as stored,
    // the documents returned will only contain
    // a title.

    for doc_address in doc_addresses {
        let retrieved_doc = searcher.doc(&doc_address)?;
        println!("{}", schema.to_json(&retrieved_doc));
    }

    // Wait for indexing and merging threads to shut down.
    // Usually this isn't needed, but in `main` we try to
    // delete the temporary directory and that fails on
    // Windows if the files are still open.
    index_writer.wait_merging_threads()?;

    Ok(())
}
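Worth spelling out why parse_query("ken") above can hit "Frankenstein": with NgramTokenizer::new(3, 3, false) every 3-character window of the title is indexed as its own term, and "ken" is one of those windows. A standalone illustration (the trigrams helper is mine, not a tantivy API; it uses byte offsets, which is fine for ASCII input):

// Illustration only: the 3-grams a (3, 3, false) ngram tokenization of a title produces.
fn trigrams(text: &str) -> Vec<&str> {
    (0..text.len().saturating_sub(2))
        .map(|i| &text[i..i + 3])
        .collect()
}

fn main() {
    let grams = trigrams("Frankenstein");
    // ["Fra", "ran", "ank", "nke", "ken", "ens", "nst", "ste", "tei", "ein"]
    println!("{:?}", grams);
    // ...which is why the query "ken" in the example above finds this title.
    assert!(grams.contains(&"ken"));
}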
src/tokenizer/mod.rs
@@ -132,6 +132,7 @@ mod alphanum_only;
 mod facet_tokenizer;
 mod japanese_tokenizer;
 mod lower_caser;
+mod ngram_tokenizer;
 mod raw_tokenizer;
 mod remove_long;
 mod simple_tokenizer;
@@ -144,6 +145,7 @@ pub use self::alphanum_only::AlphaNumOnlyFilter;
 pub use self::facet_tokenizer::FacetTokenizer;
 pub use self::japanese_tokenizer::JapaneseTokenizer;
 pub use self::lower_caser::LowerCaser;
+pub use self::ngram_tokenizer::NgramTokenizer;
 pub use self::raw_tokenizer::RawTokenizer;
 pub use self::remove_long::RemoveLongFilter;
 pub use self::simple_tokenizer::SimpleTokenizer;
@@ -153,8 +155,32 @@ pub use self::tokenizer::BoxedTokenizer;
 pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
 pub use self::tokenizer_manager::TokenizerManager;
+
+/// This is a function that can be used in tests and doc tests
+/// to assert a token's correctness.
+/// TODO: can this be wrapped in #[cfg(test)] so as not to be in the
+/// public api?
+pub fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
+    assert_eq!(
+        token.position, position,
+        "expected position {} but {:?}",
+        position, token
+    );
+    assert_eq!(token.text, text, "expected text {} but {:?}", text, token);
+    assert_eq!(
+        token.offset_from, from,
+        "expected offset_from {} but {:?}",
+        from, token
+    );
+    assert_eq!(
+        token.offset_to, to,
+        "expected offset_to {} but {:?}",
+        to, token
+    );
+}
 
 #[cfg(test)]
-mod test {
+pub mod test {
+    use super::assert_token;
     use super::Token;
     use super::TokenizerManager;
 
@@ -162,17 +188,17 @@ mod test {
     fn test_raw_tokenizer() {
         let tokenizer_manager = TokenizerManager::default();
         let en_tokenizer = tokenizer_manager.get("raw").unwrap();
-        let mut tokens: Vec<String> = vec![];
+        let mut tokens: Vec<Token> = vec![];
         {
             let mut add_token = |token: &Token| {
-                tokens.push(token.text.clone());
+                tokens.push(token.clone());
             };
             en_tokenizer
                 .token_stream("Hello, happy tax payer!")
                 .process(&mut add_token);
         }
         assert_eq!(tokens.len(), 1);
-        assert_eq!(&tokens[0], "Hello, happy tax payer!");
+        assert_token(&tokens[0], 0, "Hello, happy tax payer!", 0, 23);
     }
 
     #[test]
@@ -180,20 +206,20 @@ mod test {
         let tokenizer_manager = TokenizerManager::default();
         assert!(tokenizer_manager.get("en_doesnotexist").is_none());
         let en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
-        let mut tokens: Vec<String> = vec![];
+        let mut tokens: Vec<Token> = vec![];
         {
             let mut add_token = |token: &Token| {
-                tokens.push(token.text.clone());
+                tokens.push(token.clone());
             };
             en_tokenizer
                 .token_stream("Hello, happy tax payer!")
                 .process(&mut add_token);
         }
         assert_eq!(tokens.len(), 4);
-        assert_eq!(&tokens[0], "hello");
-        assert_eq!(&tokens[1], "happi");
-        assert_eq!(&tokens[2], "tax");
-        assert_eq!(&tokens[3], "payer");
+        assert_token(&tokens[0], 0, "hello", 0, 5);
+        assert_token(&tokens[1], 1, "happi", 7, 12);
+        assert_token(&tokens[2], 2, "tax", 13, 16);
+        assert_token(&tokens[3], 3, "payer", 17, 22);
     }
 
     #[test]
@@ -201,21 +227,87 @@ mod test {
         let tokenizer_manager = TokenizerManager::default();
         let en_tokenizer = tokenizer_manager.get("ja").unwrap();
 
-        let mut tokens: Vec<String> = vec![];
+        let mut tokens: Vec<Token> = vec![];
         {
             let mut add_token = |token: &Token| {
-                tokens.push(token.text.clone());
+                tokens.push(token.clone());
             };
             en_tokenizer
                 .token_stream("野菜食べないとやばい!")
                 .process(&mut add_token);
         }
         assert_eq!(tokens.len(), 5);
-        assert_eq!(&tokens[0], "野菜");
-        assert_eq!(&tokens[1], "食べ");
-        assert_eq!(&tokens[2], "ない");
-        assert_eq!(&tokens[3], "と");
-        assert_eq!(&tokens[4], "やばい");
+        assert_token(&tokens[0], 0, "野菜", 0, 6);
+        assert_token(&tokens[1], 1, "食べ", 6, 12);
+        assert_token(&tokens[2], 2, "ない", 12, 18);
+        assert_token(&tokens[3], 3, "と", 18, 21);
+        assert_token(&tokens[4], 4, "やばい", 21, 30);
+    }
+
+    #[test]
+    fn test_ngram_tokenizer() {
+        use super::{LowerCaser, NgramTokenizer};
+        use tokenizer::tokenizer::TokenStream;
+        use tokenizer::tokenizer::Tokenizer;
+
+        let tokenizer_manager = TokenizerManager::default();
+        tokenizer_manager.register("ngram12", NgramTokenizer::new(1, 2, false));
+        tokenizer_manager.register(
+            "ngram3",
+            NgramTokenizer::new(3, 3, false).filter(LowerCaser),
+        );
+        tokenizer_manager.register(
+            "edgegram5",
+            NgramTokenizer::new(2, 5, true).filter(LowerCaser),
+        );
+
+        let tokenizer = NgramTokenizer::new(1, 2, false);
+        let mut tokens: Vec<Token> = vec![];
+        {
+            let mut add_token = |token: &Token| {
+                tokens.push(token.clone());
+            };
+            tokenizer.token_stream("hello").process(&mut add_token);
+        }
+        assert_eq!(tokens.len(), 9);
+        assert_token(&tokens[0], 0, "h", 0, 1);
+        assert_token(&tokens[1], 0, "he", 0, 2);
+        assert_token(&tokens[2], 1, "e", 1, 2);
+        assert_token(&tokens[3], 1, "el", 1, 3);
+        assert_token(&tokens[4], 2, "l", 2, 3);
+        assert_token(&tokens[5], 2, "ll", 2, 4);
+        assert_token(&tokens[6], 3, "l", 3, 4);
+        assert_token(&tokens[7], 3, "lo", 3, 5);
+        assert_token(&tokens[8], 4, "o", 4, 5);
+
+        let tokenizer = tokenizer_manager.get("ngram3").unwrap();
+        let mut tokens: Vec<Token> = vec![];
+        {
+            let mut add_token = |token: &Token| {
+                tokens.push(token.clone());
+            };
+            tokenizer.token_stream("Hello").process(&mut add_token);
+        }
+        assert_eq!(tokens.len(), 3);
+        assert_token(&tokens[0], 0, "hel", 0, 3);
+        assert_token(&tokens[1], 1, "ell", 1, 4);
+        assert_token(&tokens[2], 2, "llo", 2, 5);
+
+        let tokenizer = tokenizer_manager.get("edgegram5").unwrap();
+        let mut tokens: Vec<Token> = vec![];
+        {
+            let mut add_token = |token: &Token| {
+                tokens.push(token.clone());
+            };
+            tokenizer
+                .token_stream("Frankenstein")
+                .process(&mut add_token);
+        }
+        assert_eq!(tokens.len(), 4);
+        assert_token(&tokens[0], 0, "fr", 0, 2);
+        assert_token(&tokens[1], 0, "fra", 0, 3);
+        assert_token(&tokens[2], 0, "fran", 0, 4);
+        assert_token(&tokens[3], 0, "frank", 0, 5);
     }
 
     #[test]
@@ -223,20 +315,20 @@ mod test {
         let tokenizer_manager = TokenizerManager::default();
         let en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
         {
-            let mut tokens: Vec<String> = vec![];
+            let mut tokens: Vec<Token> = vec![];
             {
                 let mut add_token = |token: &Token| {
-                    tokens.push(token.text.clone());
+                    tokens.push(token.clone());
                 };
                 en_tokenizer.token_stream(" ").process(&mut add_token);
             }
             assert!(tokens.is_empty());
         }
         {
-            let mut tokens: Vec<String> = vec![];
+            let mut tokens: Vec<Token> = vec![];
             {
                 let mut add_token = |token: &Token| {
-                    tokens.push(token.text.clone());
+                    tokens.push(token.clone());
                 };
                 en_tokenizer.token_stream(" ").process(&mut add_token);
             }
src/tokenizer/ngram_tokenizer.rs (new file, 157 lines)
@@ -0,0 +1,157 @@
use super::{Token, TokenStream, Tokenizer};

/// Tokenize the text by splitting words into n-grams of the given size(s).
///
/// With this tokenizer, the `position` field expresses the starting offset
/// of the ngram rather than a token index.
///
/// Example 1: `hello` would be tokenized as (min_gram: 2, max_gram: 3, prefix_only: false)
///
/// | Term     | he  | hel | el  | ell | ll  | llo | lo  |
/// |----------|-----|-----|-----|-----|-----|-----|-----|
/// | Position | 0   | 0   | 1   | 1   | 2   | 2   | 3   |
/// | Offsets  | 0,2 | 0,3 | 1,3 | 1,4 | 2,4 | 2,5 | 3,5 |
///
/// Example 2: `hello` would be tokenized as (min_gram: 2, max_gram: 5, prefix_only: **true**)
///
/// | Term     | he  | hel | hell | hello |
/// |----------|-----|-----|------|-------|
/// | Position | 0   | 0   | 0    | 0     |
/// | Offsets  | 0,2 | 0,3 | 0,4  | 0,5   |
///
/// # Example
///
/// ```
/// extern crate tantivy;
/// use tantivy::tokenizer::*;
/// use tantivy::tokenizer::assert_token;
///
/// # fn main() {
/// let tokenizer = NgramTokenizer::new(2, 3, false);
/// let mut stream = tokenizer.token_stream("hello");
///
/// assert_token(stream.next().unwrap(), 0, "he", 0, 2);
/// assert_token(stream.next().unwrap(), 0, "hel", 0, 3);
/// assert_token(stream.next().unwrap(), 1, "el", 1, 3);
/// assert_token(stream.next().unwrap(), 1, "ell", 1, 4);
/// assert_token(stream.next().unwrap(), 2, "ll", 2, 4);
/// assert_token(stream.next().unwrap(), 2, "llo", 2, 5);
/// assert_token(stream.next().unwrap(), 3, "lo", 3, 5);
/// assert!(stream.next().is_none());
/// # }
/// ```
#[derive(Clone)]
pub struct NgramTokenizer {
    /// min size of the n-gram
    min_gram: usize,
    /// max size of the n-gram
    max_gram: usize,
    /// if true, will only parse the leading edge of the input
    prefix_only: bool,
}

impl NgramTokenizer {
    /// Configures a new Ngram tokenizer
    pub fn new(min_gram: usize, max_gram: usize, prefix_only: bool) -> NgramTokenizer {
        assert!(min_gram > 0, "min_gram must be greater than 0");
        assert!(
            min_gram <= max_gram,
            "min_gram must not be greater than max_gram"
        );

        NgramTokenizer {
            min_gram,
            max_gram,
            prefix_only,
        }
    }
}

pub struct NgramTokenStream<'a> {
    text: &'a str,
    location: usize,
    text_length: usize,
    token: Token,
    min_gram: usize,
    max_gram: usize,
    gram_size: usize,
    prefix_only: bool,
}

impl<'a> Tokenizer<'a> for NgramTokenizer {
    type TokenStreamImpl = NgramTokenStream<'a>;

    fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {
        NgramTokenStream {
            text,
            location: 0,
            text_length: text.len(),
            token: Token::default(),
            min_gram: self.min_gram,
            max_gram: self.max_gram,
            prefix_only: self.prefix_only,
            gram_size: self.min_gram,
        }
    }
}

impl<'a> NgramTokenStream<'a> {
    /// Get the (location, gram_size) pair for the next token,
    /// cycling gram_size through min_gram..=max_gram at each location,
    /// and returning None if processing should stop.
    fn chomp(&mut self) -> Option<(usize, usize)> {
        // Have we exceeded the bounds of the text we are indexing?
        if self.gram_size > self.max_gram {
            if self.prefix_only {
                return None;
            }

            // since we aren't just processing edges
            // we need to reset the gram size
            self.gram_size = self.min_gram;

            // and move down the chain of letters
            self.location += 1;
        }

        let result = if (self.location + self.gram_size) <= self.text_length {
            Some((self.location, self.gram_size))
        } else {
            None
        };

        // increase the gram size for the next pass
        self.gram_size += 1;

        result
    }
}

impl<'a> TokenStream for NgramTokenStream<'a> {
    fn advance(&mut self) -> bool {
        // clear out the working token text
        self.token.text.clear();

        if let Some((position, size)) = self.chomp() {
            self.token.position = position;
            let offset_from = position;
            let offset_to = offset_from + size;

            self.token.offset_from = offset_from;
            self.token.offset_to = offset_to;

            self.token.text.push_str(&self.text[offset_from..offset_to]);

            true
        } else {
            false
        }
    }

    fn token(&self) -> &Token {
        &self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        &mut self.token
    }
}
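The chomp method above is a small state machine over (location, gram_size) pairs: it grows the gram size at a fixed location until max_gram is passed, then (unless prefix_only) slides the location forward and resets, and it ends the stream at the first window that no longer fits. A standalone rendering of that control flow, assuming byte offsets on ASCII input (the ngrams helper is mine, not a tantivy API):

// Illustration only: mirrors NgramTokenStream::chomp's enumeration order.
fn ngrams(text: &str, min_gram: usize, max_gram: usize, prefix_only: bool) -> Vec<(usize, String)> {
    let mut out = Vec::new();
    let (mut location, mut gram_size) = (0usize, min_gram);
    loop {
        // Past max_gram: either stop (prefix_only) or restart at the next location.
        if gram_size > max_gram {
            if prefix_only {
                break;
            }
            gram_size = min_gram;
            location += 1;
        }
        // The stream ends at the first window that no longer fits.
        if location + gram_size > text.len() {
            break;
        }
        out.push((location, text[location..location + gram_size].to_string()));
        gram_size += 1;
    }
    out
}

fn main() {
    // Matches the expectations of the new test_ngram_tokenizer test:
    // "hello" with (1, 2, false) yields 9 grams, and "Frankenstein" with
    // (2, 5, true) yields the 4 leading edge grams (which the registered
    // "edgegram5" tokenizer additionally lowercases through LowerCaser).
    assert_eq!(ngrams("hello", 1, 2, false).len(), 9);
    let edge: Vec<String> = ngrams("Frankenstein", 2, 5, true)
        .into_iter()
        .map(|(_, gram)| gram)
        .collect();
    assert_eq!(edge, vec!["Fr", "Fra", "Fran", "Frank"]);
    println!("{:?}", ngrams("hello", 2, 3, false));
}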
src/tokenizer/tokenizer.rs
@@ -4,6 +4,7 @@ use std::borrow::{Borrow, BorrowMut};
 use tokenizer::TokenStreamChain;
 
 /// Token
+#[derive(Debug, Clone)]
 pub struct Token {
     /// Offset (byte index) of the first character of the token.
     /// Offsets shall not be modified by token filters.
@@ -260,3 +261,24 @@ pub trait TokenFilter<TailTokenStream: TokenStream>: Clone {
     /// Wraps a token stream and returns the modified one.
     fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream;
 }
+
+#[cfg(test)]
+mod test {
+    use super::Token;
+
+    #[test]
+    fn clone() {
+        let t1 = Token {
+            position: 1,
+            offset_from: 2,
+            offset_to: 3,
+            text: "abc".to_string(),
+        };
+        let t2 = t1.clone();
+
+        assert_eq!(t1.position, t2.position);
+        assert_eq!(t1.offset_from, t2.offset_from);
+        assert_eq!(t1.offset_to, t2.offset_to);
+        assert_eq!(t1.text, t2.text);
+    }
+}
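Why the new Clone derive matters: token streams only lend each Token by reference while process runs, so tests that want to assert on positions and offsets afterwards need owned copies, hence tokens.push(token.clone()) throughout the reworked tests. A self-contained sketch of that pattern with stand-in types (MiniToken and emit are illustrative, not tantivy's API):

// Stand-in for a Token-like struct; Clone is what makes the pattern below possible.
#[derive(Debug, Clone, Default, PartialEq)]
struct MiniToken {
    position: usize,
    offset_from: usize,
    offset_to: usize,
    text: String,
}

// Stand-in for a token stream driving a callback with borrowed tokens.
fn emit<F: FnMut(&MiniToken)>(mut sink: F) {
    let tok = MiniToken { position: 0, offset_from: 0, offset_to: 3, text: "hel".into() };
    sink(&tok);
}

fn main() {
    let mut tokens: Vec<MiniToken> = vec![];
    emit(|t| tokens.push(t.clone())); // owned copies outlive the callback
    assert_eq!(tokens[0].text, "hel");
    println!("{:?}", tokens);
}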