From 0519056bd8dbb6e88587db169980ab38fd990f2e Mon Sep 17 00:00:00 2001
From: kkoziara
Date: Thu, 7 Nov 2019 02:10:56 +0100
Subject: [PATCH] Added handling of pre-tokenized text fields (#642). (#669)

* Added handling of pre-tokenized text fields (#642).

* * Updated changelog and examples concerning #642.

* Added tokenized_text method to Value implementation.

* Implemented From for TokenizedStream.

* * Removed tokenized flag from TextOptions and code reliance on the flag.

* Changed naming to use word "pre-tokenized" instead of "tokenized".

* Updated example code.

* Fixed comments.

* Minor code refactoring. Test improvements.
---
 CHANGELOG.md                      |   2 +
 examples/pre_tokenized_text.rs    | 140 ++++++++++++++++++++++
 src/indexer/segment_writer.rs     |  55 ++++++---
 src/schema/document.rs            |  11 ++
 src/schema/field_type.rs          |  95 ++++++++++++++-
 src/schema/value.rs               |  58 +++++++++
 src/tokenizer/mod.rs              |   3 +
 src/tokenizer/tokenized_string.rs | 189 ++++++++++++++++++++++++++++++
 src/tokenizer/tokenizer.rs        |   2 +-
 9 files changed, 534 insertions(+), 21 deletions(-)
 create mode 100644 examples/pre_tokenized_text.rs
 create mode 100644 src/tokenizer/tokenized_string.rs

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 268e89493..0db60c6ec 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,8 @@ Tantivy 0.11.0
 - Avoid rebuilding Regex automaton whenever a regex query is reused. #639 (@brainlock)
 - Add footer with some metadata to index files. #605 (@fdb-hiroshima)
 - TopDocs collector: ensure stable sorting on equal score. #671 (@brainlock)
+- Added handling of pre-tokenized text fields (#642), which will enable users to
+  load tokens created outside tantivy. See usage in examples/pre_tokenized_text. (@kkoziara)
 - Fix crash when committing multiple times with deleted documents. #681 (@brainlock)

 ## How to update?
diff --git a/examples/pre_tokenized_text.rs b/examples/pre_tokenized_text.rs
new file mode 100644
index 000000000..af3f3a981
--- /dev/null
+++ b/examples/pre_tokenized_text.rs
@@ -0,0 +1,140 @@
+// # Pre-tokenized text example
+//
+// This example shows how to use pre-tokenized text. Sometimes you might
+// want to index and search through text which is already split into
+// tokens by some external tool.
+//
+// In this example we will:
+//   - use a tantivy tokenizer to create tokens and load them directly into tantivy,
+//   - import pre-tokenized text straight from JSON,
+//   - perform a search on documents with pre-tokenized text.
+
+use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, TokenStream, Tokenizer};
+
+use tantivy::collector::{Count, TopDocs};
+use tantivy::query::TermQuery;
+use tantivy::schema::*;
+use tantivy::{doc, Index, ReloadPolicy};
+use tempfile::TempDir;
+
+fn pre_tokenize_text(text: &str) -> Vec<Token> {
+    let mut token_stream = SimpleTokenizer.token_stream(text);
+    let mut tokens = vec![];
+    while token_stream.advance() {
+        tokens.push(token_stream.token().clone());
+    }
+    tokens
+}
+
+fn main() -> tantivy::Result<()> {
+    let index_path = TempDir::new()?;
+
+    let mut schema_builder = Schema::builder();
+
+    schema_builder.add_text_field("title", TEXT | STORED);
+    schema_builder.add_text_field("body", TEXT);
+
+    let schema = schema_builder.build();
+
+    let index = Index::create_in_dir(&index_path, schema.clone())?;
+
+    let mut index_writer = index.writer(50_000_000)?;
+
+    // We can create a document manually, by setting the fields
+    // one by one in a Document object.
+    let title = schema.get_field("title").unwrap();
+    let body = schema.get_field("body").unwrap();
+
+    let title_text = "The Old Man and the Sea";
+    let body_text = "He was an old man who fished alone in a skiff in the Gulf Stream";
+
+    // Content of our first document.
+    // We create a `PreTokenizedString` which contains the original text and a vector of tokens.
+    let title_tok = PreTokenizedString {
+        text: String::from(title_text),
+        tokens: pre_tokenize_text(title_text),
+    };
+
+    println!(
+        "Original text: \"{}\" and tokens: {:?}",
+        title_tok.text, title_tok.tokens
+    );
+
+    let body_tok = PreTokenizedString {
+        text: String::from(body_text),
+        tokens: pre_tokenize_text(body_text),
+    };
+
+    // Now let's create a document and add our `PreTokenizedString` values using
+    // the `add_pre_tokenized_text` method of `Document`.
+    let mut old_man_doc = Document::default();
+    old_man_doc.add_pre_tokenized_text(title, &title_tok);
+    old_man_doc.add_pre_tokenized_text(body, &body_tok);
+
+    // ... now let's just add it to the IndexWriter
+    index_writer.add_document(old_man_doc);
+
+    // Pre-tokenized text can also be fed in as JSON.
+    let short_man_json = r#"{
+        "title":[{
+            "text":"The Old Man",
+            "tokens":[
+                {"offset_from":0,"offset_to":3,"position":0,"text":"The","position_length":1},
+                {"offset_from":4,"offset_to":7,"position":1,"text":"Old","position_length":1},
+                {"offset_from":8,"offset_to":11,"position":2,"text":"Man","position_length":1}
+            ]
+        }]
+    }"#;
+
+    let short_man_doc = schema.parse_document(&short_man_json)?;
+
+    index_writer.add_document(short_man_doc);
+
+    // Let's commit changes
+    index_writer.commit()?;
+
+    // ... and now is the time to query our index
+
+    let reader = index
+        .reader_builder()
+        .reload_policy(ReloadPolicy::OnCommit)
+        .try_into()?;
+
+    let searcher = reader.searcher();
+
+    // We want to get documents with the token "Man"; we will use a TermQuery to do it.
+    // Using `PreTokenizedString` means the tokens are stored as-is, without stemming
+    // or lowercasing, which preserves full words in their original form.
+    let query = TermQuery::new(
+        Term::from_field_text(title, "Man"),
+        IndexRecordOption::Basic,
+    );
+
+    let (top_docs, count) = searcher
+        .search(&query, &(TopDocs::with_limit(2), Count))
+        .unwrap();
+
+    assert_eq!(count, 2);
+
+    for (_score, doc_address) in top_docs {
+        let retrieved_doc = searcher.doc(doc_address)?;
+        println!("Document: {}", schema.to_json(&retrieved_doc));
+    }
+
+    // Contrary to the previous query, searching for the "man" term should yield
+    // no results, as it's not one of the indexed tokens: SimpleTokenizer only
+    // splits text on whitespace / punctuation and does not lowercase it.
+
+    let query = TermQuery::new(
+        Term::from_field_text(title, "man"),
+        IndexRecordOption::Basic,
+    );
+
+    let (_top_docs, count) = searcher
+        .search(&query, &(TopDocs::with_limit(2), Count))
+        .unwrap();
+
+    assert_eq!(count, 0);
+
+    Ok(())
+}
diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs
index 638e9de4d..5cfdccdcd 100644
--- a/src/indexer/segment_writer.rs
+++ b/src/indexer/segment_writer.rs
@@ -13,7 +13,8 @@ use crate::schema::Value;
 use crate::schema::{Field, FieldEntry};
 use crate::tokenizer::BoxedTokenizer;
 use crate::tokenizer::FacetTokenizer;
-use crate::tokenizer::{TokenStream, Tokenizer};
+use crate::tokenizer::PreTokenizedStream;
+use crate::tokenizer::{TokenStream, TokenStreamChain, Tokenizer};
 use crate::DocId;
 use crate::Opstamp;
 use crate::Result;
@@ -158,26 +159,44 @@ impl SegmentWriter {
                }
            }
            FieldType::Str(_) => {
-                let num_tokens = if let Some(ref mut tokenizer) =
-                    self.tokenizers[field.field_id() as usize]
-                {
-                    let texts: Vec<&str> = field_values
-                        .iter()
-                        .flat_map(|field_value| match *field_value.value() {
-                            Value::Str(ref text) => Some(text.as_str()),
-                            _ => None,
-                        })
-                        .collect();
-                    if texts.is_empty() {
-                        0
-                    } else {
-                        let mut token_stream = tokenizer.token_stream_texts(&texts[..]);
-                        self.multifield_postings
-                            .index_text(doc_id, field, &mut token_stream)
+                let mut token_streams: Vec<Box<dyn TokenStream>> = vec![];
+                let mut offsets = vec![];
+                let mut total_offset = 0;
+
+                for field_value in field_values {
+                    match field_value.value() {
+                        Value::PreTokStr(tok_str) => {
+                            offsets.push(total_offset);
+                            if let Some(last_token) = tok_str.tokens.last() {
+                                total_offset += last_token.offset_to;
+                            }
+
+                            token_streams
+                                .push(Box::new(PreTokenizedStream::from(tok_str.clone())));
+                        }
+                        Value::Str(ref text) => {
+                            if let Some(ref mut tokenizer) =
+                                self.tokenizers[field.field_id() as usize]
+                            {
+                                offsets.push(total_offset);
+                                total_offset += text.len();
+
+                                token_streams.push(tokenizer.token_stream(text));
+                            }
+                        }
+                        _ => (),
                    }
-                } else {
+                }
+
+                let num_tokens = if token_streams.is_empty() {
                    0
+                } else {
+                    let mut token_stream: Box<dyn TokenStream> =
+                        Box::new(TokenStreamChain::new(offsets, token_streams));
+                    self.multifield_postings
+                        .index_text(doc_id, field, &mut token_stream)
                };
+
                self.fieldnorms_writer.record(doc_id, field, num_tokens);
            }
            FieldType::U64(ref int_option) => {
diff --git a/src/schema/document.rs b/src/schema/document.rs
index 055e9bc7a..6cab58bfd 100644
--- a/src/schema/document.rs
+++ b/src/schema/document.rs
@@ -1,6 +1,7 @@
 use super::*;
 use crate::common::BinarySerializable;
 use crate::common::VInt;
+use crate::tokenizer::PreTokenizedString;
 use crate::DateTime;
 use itertools::Itertools;
 use std::io::{self, Read, Write};
@@ -78,6 +79,16 @@ impl Document {
        self.add(FieldValue::new(field, value));
    }

+    /// Add a pre-tokenized text field.
+    pub fn add_pre_tokenized_text(
+        &mut self,
+        field: Field,
+        pre_tokenized_text: &PreTokenizedString,
+    ) {
+        let value = Value::PreTokStr(pre_tokenized_text.clone());
+        self.add(FieldValue::new(field, value));
+    }
+
    /// Add a u64 field
    pub fn add_u64(&mut self, field: Field, value: u64) {
        self.add(FieldValue::new(field, Value::U64(value)));
diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs
index 72e5c0604..27aaee8fc 100644
--- a/src/schema/field_type.rs
+++ b/src/schema/field_type.rs
@@ -1,11 +1,11 @@
 use base64::decode;

-use crate::schema::{IntOptions, TextOptions};
-
 use crate::schema::Facet;
 use crate::schema::IndexRecordOption;
 use crate::schema::TextFieldIndexing;
 use crate::schema::Value;
+use crate::schema::{IntOptions, TextOptions};
+use crate::tokenizer::PreTokenizedString;
 use serde_json::Value as JsonValue;

 /// Possible error that may occur while parsing a field value
@@ -169,6 +169,28 @@
                    Err(ValueParsingError::TypeError(msg))
                }
            },
+            JsonValue::Object(_) => match *self {
+                FieldType::Str(_) => {
+                    if let Ok(tok_str_val) =
+                        serde_json::from_value::<PreTokenizedString>(json.clone())
+                    {
+                        Ok(Value::PreTokStr(tok_str_val))
+                    } else {
+                        let msg = format!(
+                            "Json value {:?} cannot be translated to PreTokenizedString.",
+                            json
+                        );
+                        Err(ValueParsingError::TypeError(msg))
+                    }
+                }
+                _ => {
+                    let msg = format!(
+                        "Json value not supported error {:?}. Expected {:?}",
+                        json, self
+                    );
+                    Err(ValueParsingError::TypeError(msg))
+                }
+            },
            _ => {
                let msg = format!(
                    "Json value not supported error {:?}. Expected {:?}",
                    json, self
                );
                Err(ValueParsingError::TypeError(msg))
            }
        }
    }
@@ -184,7 +206,9 @@
 mod tests {
    use super::FieldType;
    use crate::schema::field_type::ValueParsingError;
+    use crate::schema::TextOptions;
    use crate::schema::Value;
+    use crate::tokenizer::{PreTokenizedString, Token};

    #[test]
    fn test_bytes_value_from_json() {
@@ -205,4 +229,71 @@
            _ => panic!("Expected parse failure for invalid base64"),
        }
    }
+
+    #[test]
+    fn test_pre_tok_str_value_from_json() {
+        let pre_tokenized_string_json = r#"{
+  "text": "The Old Man",
+  "tokens": [
+    {
+      "offset_from": 0,
+      "offset_to": 3,
+      "position": 0,
+      "text": "The",
+      "position_length": 1
+    },
+    {
+      "offset_from": 4,
+      "offset_to": 7,
+      "position": 1,
+      "text": "Old",
+      "position_length": 1
+    },
+    {
+      "offset_from": 8,
+      "offset_to": 11,
+      "position": 2,
+      "text": "Man",
+      "position_length": 1
+    }
+  ]
+}"#;
+
+        let expected_value = Value::PreTokStr(PreTokenizedString {
+            text: String::from("The Old Man"),
+            tokens: vec![
+                Token {
+                    offset_from: 0,
+                    offset_to: 3,
+                    position: 0,
+                    text: String::from("The"),
+                    position_length: 1,
+                },
+                Token {
+                    offset_from: 4,
+                    offset_to: 7,
+                    position: 1,
+                    text: String::from("Old"),
+                    position_length: 1,
+                },
+                Token {
+                    offset_from: 8,
+                    offset_to: 11,
+                    position: 2,
+                    text: String::from("Man"),
+                    position_length: 1,
+                },
+            ],
+        });
+
+        let deserialized_value = FieldType::Str(TextOptions::default())
+            .value_from_json(&serde_json::from_str(pre_tokenized_string_json).unwrap())
+            .unwrap();
+
+        assert_eq!(deserialized_value, expected_value);
+
+        let serialized_value_json = serde_json::to_string_pretty(&expected_value).unwrap();
+
+        assert_eq!(serialized_value_json, pre_tokenized_string_json);
+    }
 }
diff --git a/src/schema/value.rs b/src/schema/value.rs
index 8333f6ac2..e8fedd68c 100644
--- a/src/schema/value.rs
+++ b/src/schema/value.rs
@@ -1,4 +1,5 @@
 use crate::schema::Facet;
+use crate::tokenizer::PreTokenizedString;
 use crate::DateTime;
 use serde::de::Visitor;
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
@@ -10,6 +11,8 @@ use std::{cmp::Ordering, fmt};
 pub enum Value {
    /// The str type is used for any text information.
    Str(String),
+    /// Pre-tokenized str type.
+    PreTokStr(PreTokenizedString),
    /// Unsigned 64-bits Integer `u64`
    U64(u64),
    /// Signed 64-bits Integer `i64`
@@ -29,6 +32,7 @@ impl Ord for Value {
    fn cmp(&self, other: &Self) -> Ordering {
        match (self, other) {
            (Value::Str(l), Value::Str(r)) => l.cmp(r),
+            (Value::PreTokStr(l), Value::PreTokStr(r)) => l.cmp(r),
            (Value::U64(l), Value::U64(r)) => l.cmp(r),
            (Value::I64(l), Value::I64(r)) => l.cmp(r),
            (Value::Date(l), Value::Date(r)) => l.cmp(r),
@@ -44,6 +48,8 @@
            }
            (Value::Str(_), _) => Ordering::Less,
            (_, Value::Str(_)) => Ordering::Greater,
+            (Value::PreTokStr(_), _) => Ordering::Less,
+            (_, Value::PreTokStr(_)) => Ordering::Greater,
            (Value::U64(_), _) => Ordering::Less,
            (_, Value::U64(_)) => Ordering::Greater,
            (Value::I64(_), _) => Ordering::Less,
            (_, Value::I64(_)) => Ordering::Greater,
@@ -65,6 +71,7 @@ impl Serialize for Value {
    {
        match *self {
            Value::Str(ref v) => serializer.serialize_str(v),
+            Value::PreTokStr(ref v) => v.serialize(serializer),
            Value::U64(u) => serializer.serialize_u64(u),
            Value::I64(u) => serializer.serialize_i64(u),
            Value::F64(u) => serializer.serialize_f64(u),
@@ -124,6 +131,15 @@
        }
    }

+    /// Returns the pre-tokenized text, provided the value is of the `PreTokStr` type.
+    /// (Returns None if the value is not of the `PreTokStr` type.)
+    pub fn tokenized_text(&self) -> Option<&PreTokenizedString> {
+        match *self {
+            Value::PreTokStr(ref tok_text) => Some(tok_text),
+            _ => None,
+        }
+    }
+
    /// Returns the u64-value, provided the value is of the `U64` type.
    ///
    /// # Panics
@@ -221,6 +237,7 @@
    use super::Value;
    use crate::common::{f64_to_u64, u64_to_f64, BinarySerializable};
    use crate::schema::Facet;
+    use crate::tokenizer::PreTokenizedString;
    use chrono::{TimeZone, Utc};
    use std::io::{self, Read, Write};
@@ -231,6 +248,11 @@
    const BYTES_CODE: u8 = 4;
    const DATE_CODE: u8 = 5;
    const F64_CODE: u8 = 6;
+    const EXT_CODE: u8 = 7;
+
+    // extended types
+
+    const TOK_STR_CODE: u8 = 0;

    impl BinarySerializable for Value {
        fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
@@ -239,6 +261,18 @@
                    TEXT_CODE.serialize(writer)?;
                    text.serialize(writer)
                }
+                Value::PreTokStr(ref tok_str) => {
+                    EXT_CODE.serialize(writer)?;
+                    TOK_STR_CODE.serialize(writer)?;
+                    if let Ok(text) = serde_json::to_string(tok_str) {
+                        text.serialize(writer)
+                    } else {
+                        Err(io::Error::new(
+                            io::ErrorKind::Other,
+                            "Failed to dump Value::PreTokStr(_) to json.",
+                        ))
+                    }
+                }
                Value::U64(ref val) => {
                    U64_CODE.serialize(writer)?;
                    val.serialize(writer)
                }
@@ -290,6 +324,30 @@
                }
                HIERARCHICAL_FACET_CODE => Ok(Value::Facet(Facet::deserialize(reader)?)),
                BYTES_CODE => Ok(Value::Bytes(Vec::<u8>::deserialize(reader)?)),
+                EXT_CODE => {
+                    let ext_type_code = u8::deserialize(reader)?;
+                    match ext_type_code {
+                        TOK_STR_CODE => {
+                            let str_val = String::deserialize(reader)?;
+                            if let Ok(value) = serde_json::from_str::<PreTokenizedString>(&str_val)
+                            {
+                                Ok(Value::PreTokStr(value))
+                            } else {
+                                Err(io::Error::new(
+                                    io::ErrorKind::Other,
+                                    "Failed to parse string data as Value::PreTokStr(_).",
+                                ))
+                            }
+                        }
+                        _ => Err(io::Error::new(
+                            io::ErrorKind::InvalidData,
+                            format!(
+                                "No extended field type is associated with code {:?}",
+                                ext_type_code
+                            ),
+                        )),
+                    }
+                }
                _ => Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    format!("No field type is associated with code {:?}", type_code),
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index 7e0f5d1f9..72cf980a4 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -136,6 +136,7 @@ mod simple_tokenizer;
 mod stemmer;
 mod stop_word_filter;
 mod token_stream_chain;
+mod tokenized_string;
 mod tokenizer;
 mod tokenizer_manager;

@@ -152,7 +153,9 @@ pub use self::stop_word_filter::StopWordFilter;
 pub(crate) use self::token_stream_chain::TokenStreamChain;
 pub use self::tokenizer::BoxedTokenizer;
+pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
 pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
+
 pub use self::tokenizer_manager::TokenizerManager;

 /// Maximum authorized len (in bytes) for a token.
diff --git a/src/tokenizer/tokenized_string.rs b/src/tokenizer/tokenized_string.rs
new file mode 100644
index 000000000..50da55e40
--- /dev/null
+++ b/src/tokenizer/tokenized_string.rs
@@ -0,0 +1,189 @@
+use crate::tokenizer::{Token, TokenStream, TokenStreamChain};
+use std::cmp::Ordering;
+
+/// Struct representing pre-tokenized text
+#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
+pub struct PreTokenizedString {
+    /// Original text
+    pub text: String,
+    /// Tokens derived from the text
+    pub tokens: Vec<Token>,
+}
+
+impl Ord for PreTokenizedString {
+    fn cmp(&self, other: &Self) -> Ordering {
+        self.text.cmp(&other.text)
+    }
+}
+
+impl PartialOrd for PreTokenizedString {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+/// TokenStream implementation which wraps PreTokenizedString
+pub struct PreTokenizedStream {
+    tokenized_string: PreTokenizedString,
+    current_token: i64,
+}
+
+impl From<PreTokenizedString> for PreTokenizedStream {
+    fn from(s: PreTokenizedString) -> PreTokenizedStream {
+        PreTokenizedStream {
+            tokenized_string: s,
+            current_token: -1,
+        }
+    }
+}
+
+impl PreTokenizedStream {
+    /// Creates a TokenStream from a PreTokenizedString array
+    pub fn chain_tokenized_strings<'a>(
+        tok_strings: &'a [&'a PreTokenizedString],
+    ) -> Box<dyn TokenStream> {
+        if tok_strings.len() == 1 {
+            Box::new(PreTokenizedStream::from((*tok_strings[0]).clone()))
+        } else {
+            let mut offsets = vec![];
+            let mut total_offset = 0;
+            for &tok_string in tok_strings {
+                offsets.push(total_offset);
+                if let Some(last_token) = tok_string.tokens.last() {
+                    total_offset += last_token.offset_to;
+                }
+            }
+            let token_streams: Vec<_> = tok_strings
+                .iter()
+                .map(|tok_string| PreTokenizedStream::from((*tok_string).clone()))
+                .collect();
+            Box::new(TokenStreamChain::new(offsets, token_streams))
+        }
+    }
+}
+
+impl TokenStream for PreTokenizedStream {
+    fn advance(&mut self) -> bool {
+        self.current_token += 1;
+        self.current_token < self.tokenized_string.tokens.len() as i64
+    }
+
+    fn token(&self) -> &Token {
+        assert!(
+            self.current_token >= 0,
+            "TokenStream not initialized. You should call advance() at least once."
+        );
+        &self.tokenized_string.tokens[self.current_token as usize]
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        assert!(
+            self.current_token >= 0,
+            "TokenStream not initialized. You should call advance() at least once."
+        );
+        &mut self.tokenized_string.tokens[self.current_token as usize]
+    }
+}
+
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+
+    use crate::tokenizer::Token;
+
+    #[test]
+    fn test_tokenized_stream() {
+        let tok_text = PreTokenizedString {
+            text: String::from("A a"),
+            tokens: vec![
+                Token {
+                    offset_from: 0,
+                    offset_to: 1,
+                    position: 0,
+                    text: String::from("A"),
+                    position_length: 1,
+                },
+                Token {
+                    offset_from: 2,
+                    offset_to: 3,
+                    position: 1,
+                    text: String::from("a"),
+                    position_length: 1,
+                },
+            ],
+        };
+
+        let mut token_stream = PreTokenizedStream::from(tok_text.clone());
+
+        for expected_token in tok_text.tokens {
+            assert!(token_stream.advance());
+            assert_eq!(token_stream.token(), &expected_token);
+        }
+        assert!(!token_stream.advance());
+    }
+
+    #[test]
+    fn test_chain_tokenized_strings() {
+        let tok_text = PreTokenizedString {
+            text: String::from("A a"),
+            tokens: vec![
+                Token {
+                    offset_from: 0,
+                    offset_to: 1,
+                    position: 0,
+                    text: String::from("A"),
+                    position_length: 1,
+                },
+                Token {
+                    offset_from: 2,
+                    offset_to: 3,
+                    position: 1,
+                    text: String::from("a"),
+                    position_length: 1,
+                },
+            ],
+        };
+
+        let chain_parts = vec![&tok_text, &tok_text];
+
+        let mut token_stream = PreTokenizedStream::chain_tokenized_strings(&chain_parts[..]);
+
+        let expected_tokens = vec![
+            Token {
+                offset_from: 0,
+                offset_to: 1,
+                position: 0,
+                text: String::from("A"),
+                position_length: 1,
+            },
+            Token {
+                offset_from: 2,
+                offset_to: 3,
+                position: 1,
+                text: String::from("a"),
+                position_length: 1,
+            },
+            Token {
+                offset_from: 3,
+                offset_to: 4,
+                position: 3,
+                text: String::from("A"),
+                position_length: 1,
+            },
+            Token {
+                offset_from: 5,
+                offset_to: 6,
+                position: 4,
+                text: String::from("a"),
+                position_length: 1,
+            },
+        ];
+
+        for expected_token in expected_tokens {
+            assert!(token_stream.advance());
+            assert_eq!(token_stream.token(), &expected_token);
+        }
+        assert!(!token_stream.advance());
+    }
+}
diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs
index 9557a7247..4b36c317c 100644
--- a/src/tokenizer/tokenizer.rs
+++ b/src/tokenizer/tokenizer.rs
@@ -4,7 +4,7 @@ use crate::tokenizer::TokenStreamChain;
 use std::borrow::{Borrow, BorrowMut};

 /// Token
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
 pub struct Token {
    /// Offset (byte index) of the first character of the token.
    /// Offsets shall not be modified by token filters.
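The example shipped with this patch produces its tokens with tantivy's own SimpleTokenizer; the use case named in the changelog entry, however, is loading tokens created by an external tool. The following is a minimal sketch of that path, not part of the patch itself: the whitespace splitter and the helper names (externally_tokenized, add_external_text) are illustrative assumptions, while PreTokenizedString, Token and Document::add_pre_tokenized_text are the API introduced above.

use tantivy::schema::{Document, Field};
use tantivy::tokenizer::{PreTokenizedString, Token};

/// Builds a `PreTokenizedString` from tokens produced outside tantivy.
/// A plain whitespace split stands in here for the external tokenizer.
fn externally_tokenized(text: &str) -> PreTokenizedString {
    let mut tokens = Vec::new();
    let mut scan_from = 0;
    for (position, word) in text.split_whitespace().enumerate() {
        // Locate the word starting from the current scan position so that
        // repeated words each get their own byte offsets.
        let offset_from = scan_from + text[scan_from..].find(word).expect("word comes from text");
        let offset_to = offset_from + word.len();
        scan_from = offset_to;
        tokens.push(Token {
            offset_from,
            offset_to,
            position,
            text: word.to_string(),
            position_length: 1,
        });
    }
    PreTokenizedString {
        text: text.to_string(),
        tokens,
    }
}

/// Adds externally tokenized text to a document under the given field.
fn add_external_text(doc: &mut Document, field: Field, text: &str) {
    doc.add_pre_tokenized_text(field, &externally_tokenized(text));
}

Because pre-tokenized values bypass the field's configured tokenizer entirely (see the segment_writer.rs change above), any normalization such as lowercasing or stemming must already have been applied by the external tool before the tokens are handed to tantivy.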