tantivy/examples/pre_tokenized_text.rs
Harrison Burt 1c7c6fd591 POC: Tantivy documents as a trait (#2071)
Co-authored-by: Pascal Seitz <pascal.seitz@gmail.com>
2023-10-02 10:01:16 +02:00

// # Pre-tokenized text example
//
// This example shows how to use pre-tokenized text. Sometimes you might
// want to index and search through text which is already split into
// tokens by some external tool.
//
// In this example we will:
// - use a tantivy tokenizer to create tokens and load them directly into tantivy,
// - import tokenized text straight from JSON,
// - perform a search on documents with pre-tokenized text.
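//
// Running it from a checkout of the tantivy repository would look roughly like
// `cargo run --example pre_tokenized_text` (assuming the example keeps this name).
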
use tantivy::collector::{Count, TopDocs};
use tantivy::query::TermQuery;
use tantivy::schema::*;
use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, TokenStream, Tokenizer};
use tantivy::{doc, Index, IndexWriter, ReloadPolicy};
use tempfile::TempDir;

fn pre_tokenize_text(text: &str) -> Vec<Token> {
    let mut tokenizer = SimpleTokenizer::default();
    let mut token_stream = tokenizer.token_stream(text);
    let mut tokens = vec![];
    while token_stream.advance() {
        tokens.push(token_stream.token().clone());
    }
    tokens
}
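
// As a rough sketch, `pre_tokenize_text("The Old Man")` would produce tokens along
// these lines, assuming SimpleTokenizer's whitespace/punctuation splitting with no
// lowercasing:
//   Token { offset_from: 0, offset_to: 3,  position: 0, text: "The", position_length: 1 }
//   Token { offset_from: 4, offset_to: 7,  position: 1, text: "Old", position_length: 1 }
//   Token { offset_from: 8, offset_to: 11, position: 2, text: "Man", position_length: 1 }
// This mirrors the JSON representation used further below.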

fn main() -> tantivy::Result<()> {
    let index_path = TempDir::new()?;

    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("title", TEXT | STORED);
    schema_builder.add_text_field("body", TEXT);
    let schema = schema_builder.build();

    let index = Index::create_in_dir(&index_path, schema.clone())?;
    let mut index_writer: IndexWriter = index.writer(50_000_000)?;

    // We can create a document manually by setting the fields
    // one by one in a document object.
    let title = schema.get_field("title").unwrap();
    let body = schema.get_field("body").unwrap();

    let title_text = "The Old Man and the Sea";
    let body_text = "He was an old man who fished alone in a skiff in the Gulf Stream";

    // Content of our first document.
    // We create a `PreTokenizedString`, which contains the original text and a vector of tokens.
    let title_tok = PreTokenizedString {
        text: String::from(title_text),
        tokens: pre_tokenize_text(title_text),
    };

    println!(
        "Original text: \"{}\" and tokens: {:?}",
        title_tok.text, title_tok.tokens
    );

    let body_tok = PreTokenizedString {
        text: String::from(body_text),
        tokens: pre_tokenize_text(body_text),
    };

    // Now let's create a document and add our `PreTokenizedString`s ...
    let old_man_doc = doc!(title => title_tok, body => body_tok);

    // ... now let's just add it to the IndexWriter.
    index_writer.add_document(old_man_doc)?;

    // Pre-tokenized text can also be fed as JSON.
    let short_man_json = r#"{
        "title":[{
            "text":"The Old Man",
            "tokens":[
                {"offset_from":0,"offset_to":3,"position":0,"text":"The","position_length":1},
                {"offset_from":4,"offset_to":7,"position":1,"text":"Old","position_length":1},
                {"offset_from":8,"offset_to":11,"position":2,"text":"Man","position_length":1}
            ]
        }]
    }"#;

    let short_man_doc = TantivyDocument::parse_json(&schema, short_man_json)?;
    index_writer.add_document(short_man_doc)?;

    // Let's commit the changes.
    index_writer.commit()?;

    // ... and now is the time to query our index.
    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommit)
        .try_into()?;

    let searcher = reader.searcher();

    // We want the documents containing the token "Man", so we use a TermQuery to find them.
    // Because we indexed a `PreTokenizedString`, the tokens were stored as-is, with no
    // stemming and no lowercasing, which preserves full words in their original form.
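    // (For comparison, a field indexed through tantivy's default text analyzer, which
    // lowercases tokens, would contain "man" rather than "Man", and this exact query
    // would then match nothing.)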
    let query = TermQuery::new(
        Term::from_field_text(title, "Man"),
        IndexRecordOption::Basic,
    );

    let (top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count))?;
    assert_eq!(count, 2);

    // Now let's print out the results.
    // Note that the tokens are not stored along with the original text
    // in the document store.
    for (_score, doc_address) in top_docs {
        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
        println!("{}", retrieved_doc.to_json(&schema));
    }

    // In contrast to the previous query, searching for the term "man" should
    // return no results, as "man" is not one of the indexed tokens: SimpleTokenizer
    // only splits text on whitespace / punctuation.
    let query = TermQuery::new(
        Term::from_field_text(title, "man"),
        IndexRecordOption::Basic,
    );

    let (_top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count))?;
    assert_eq!(count, 0);

    Ok(())
}