* Removed tokenized flag from TextOptions and code reliance on the flag.

* Changed naming to use word "pre-tokenized" instead of "tokenized".
* Updated example code.
* Fixed comments.
This commit is contained in:
kkoziara
2019-10-26 16:31:52 +02:00
parent 20d2235d4d
commit d87b7f230d
13 changed files with 190 additions and 160 deletions

View File

@@ -11,7 +11,7 @@ Tantivy 0.11.0
- Add footer with some metadata to index files. #605 (@fdb-hiroshima)
- TopDocs collector: ensure stable sorting on equal score. #671 (@brainlock)
- Added handling of pre-tokenized text fields (#642), which will enable users to
load tokens created outside tantivy. See usage in examples/pre_tokenized_text.
load tokens created outside tantivy. See usage in examples/pre_tokenized_text. (@kkoziara)
## How to update?

View File

@@ -9,7 +9,7 @@
// - import tokenized text straight from json,
// - perform a search on documents with pre-tokenized text
use tantivy::tokenizer::{SimpleTokenizer, Token, TokenStream, TokenizedString, Tokenizer};
use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, TokenStream, Tokenizer};
use tantivy::collector::{Count, TopDocs};
use tantivy::query::TermQuery;
@@ -17,11 +17,11 @@ use tantivy::schema::*;
use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir;
fn tokenize_it(text: &str) -> Vec<Token> {
let mut ts = SimpleTokenizer.token_stream(text);
fn pre_tokenize_text(text: &str) -> Vec<Token> {
let mut token_stream = SimpleTokenizer.token_stream(text);
let mut tokens = vec![];
while ts.advance() {
tokens.push(ts.token().clone());
while token_stream.advance() {
tokens.push(token_stream.token().clone());
}
tokens
}
@@ -31,11 +31,8 @@ fn main() -> tantivy::Result<()> {
let mut schema_builder = Schema::builder();
// now we add `TOKENIZED` `TextOptions` to mark field as pre-tokenized
// in addition the title will be also stored, so we can see it in
// returned results
schema_builder.add_text_field("title", TEXT | STORED | TOKENIZED);
schema_builder.add_text_field("body", TEXT | TOKENIZED);
schema_builder.add_text_field("title", TEXT | STORED);
schema_builder.add_text_field("body", TEXT);
let schema = schema_builder.build();
@@ -52,10 +49,10 @@ fn main() -> tantivy::Result<()> {
let body_text = "He was an old man who fished alone in a skiff in the Gulf Stream";
// Content of our first document
// We create `TokenizedString` which contains original text and vector of tokens
let title_tok = TokenizedString {
// We create `PreTokenizedString` which contains original text and vector of tokens
let title_tok = PreTokenizedString {
text: String::from(title_text),
tokens: tokenize_it(title_text),
tokens: pre_tokenize_text(title_text),
};
println!(
@@ -63,21 +60,21 @@ fn main() -> tantivy::Result<()> {
title_tok.text, title_tok.tokens
);
let body_tok = TokenizedString {
let body_tok = PreTokenizedString {
text: String::from(body_text),
tokens: tokenize_it(body_text),
tokens: pre_tokenize_text(body_text),
};
// Now lets create a document and add our `TokenizedString` using
// `add_tokenized_text` method of `Document`
// Now let's create a document and add our `PreTokenizedString` using
// `add_pre_tokenized_text` method of `Document`
let mut old_man_doc = Document::default();
old_man_doc.add_tokenized_text(title, &title_tok);
old_man_doc.add_tokenized_text(body, &body_tok);
old_man_doc.add_pre_tokenized_text(title, &title_tok);
old_man_doc.add_pre_tokenized_text(body, &body_tok);
// ... now let's just add it to the IndexWriter
index_writer.add_document(old_man_doc);
// `Document` can be obtained directly from JSON:
// Pre-tokenized text can also be fed as JSON
let short_man_json = r#"{
"title":[{
"text":"The Old Man",
@@ -106,6 +103,8 @@ fn main() -> tantivy::Result<()> {
let searcher = reader.searcher();
// We want to get documents with token "Man", we will use TermQuery to do it
// Using `PreTokenizedString` means the tokens are stored as is, avoiding stemming
// and lowercasing, which preserves full words in their original form
let query = TermQuery::new(
Term::from_field_text(title, "Man"),
IndexRecordOption::Basic,
@@ -124,14 +123,14 @@ fn main() -> tantivy::Result<()> {
// Contrary to the previous query, when we search for the "man" term we
// should get no results, as it's not one of the indexed tokens. SimpleTokenizer
// only splits text on whitespace / interpunction.
// only splits text on whitespace / punctuation.
let query = TermQuery::new(
Term::from_field_text(title, "nan"),
Term::from_field_text(title, "man"),
IndexRecordOption::Basic,
);
let (top_docs, count) = searcher
let (_top_docs, count) = searcher
.search(&query, &(TopDocs::with_limit(2), Count))
.unwrap();

View File

@@ -285,6 +285,6 @@ mod tests {
payload: None,
};
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
assert_eq!(json, r#"{"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false,"tokenized":false}}],"opstamp":0}"#);
assert_eq!(json, r#"{"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false}}],"opstamp":0}"#);
}
}

View File

@@ -13,8 +13,8 @@ use crate::schema::Value;
use crate::schema::{Field, FieldEntry};
use crate::tokenizer::BoxedTokenizer;
use crate::tokenizer::FacetTokenizer;
use crate::tokenizer::{TokenStream, Tokenizer};
use crate::tokenizer::{TokenizedStream, TokenizedString};
use crate::tokenizer::PreTokenizedStream;
use crate::tokenizer::{TokenStream, TokenStreamChain, Tokenizer};
use crate::DocId;
use crate::Opstamp;
use crate::Result;
@@ -158,47 +158,46 @@ impl SegmentWriter {
}
}
}
FieldType::Str(ref text_options) => {
let num_tokens = if text_options.is_tokenized() {
let tok_strings: Vec<&TokenizedString> = field_values
.iter()
.flat_map(|field_value| match *field_value.value() {
Value::TokStr(ref tok_str) => Some(tok_str),
_ => None,
})
.collect();
if tok_strings.is_empty() {
0
} else {
let mut token_stream =
TokenizedStream::chain_tokenized_strings(&tok_strings[..]);
self.multifield_postings
.index_text(doc_id, field, &mut token_stream)
}
} else {
if let Some(ref mut tokenizer) = self.tokenizers[field.field_id() as usize]
{
let texts: Vec<&str> = field_values
.iter()
.flat_map(|field_value| match *field_value.value() {
Value::Str(ref text) => Some(text.as_str()),
_ => None,
})
.collect();
if texts.is_empty() {
0
} else {
let mut token_stream = tokenizer.token_stream_texts(&texts[..]);
self.multifield_postings.index_text(
doc_id,
field,
&mut token_stream,
)
FieldType::Str(_) => {
let mut token_streams: Vec<Box<dyn TokenStream>> = vec![];
let mut offsets = vec![];
let mut total_offset = 0;
for field_value in field_values {
match field_value.value() {
Value::PreTokStr(tok_str) => {
offsets.push(total_offset);
total_offset += match tok_str.tokens.last() {
Some(token) => token.offset_to,
None => 0,
};
token_streams
.push(Box::new(PreTokenizedStream::from(tok_str.clone())));
}
} else {
0
Value::Str(ref text) => {
if let Some(ref mut tokenizer) =
self.tokenizers[field.field_id() as usize]
{
offsets.push(total_offset);
total_offset += text.len();
token_streams.push(tokenizer.token_stream(text));
}
}
_ => (),
}
}
let num_tokens = if token_streams.is_empty() {
0
} else {
let mut token_stream: Box<dyn TokenStream> =
Box::new(TokenStreamChain::new(offsets, token_streams));
self.multifield_postings
.index_text(doc_id, field, &mut token_stream)
};
self.fieldnorms_writer.record(doc_id, field, num_tokens);
}
FieldType::U64(ref int_option) => {

View File

@@ -1,7 +1,7 @@
use super::*;
use crate::common::BinarySerializable;
use crate::common::VInt;
use crate::tokenizer::TokenizedString;
use crate::tokenizer::PreTokenizedString;
use crate::DateTime;
use itertools::Itertools;
use std::io::{self, Read, Write};
@@ -79,9 +79,13 @@ impl Document {
self.add(FieldValue::new(field, value));
}
/// Add a text field with tokens.
pub fn add_tokenized_text(&mut self, field: Field, tokenized_text: &TokenizedString) {
let value = Value::TokStr(tokenized_text.clone());
/// Add a pre-tokenized text field.
pub fn add_pre_tokenized_text(
&mut self,
field: Field,
pre_tokenized_text: &PreTokenizedString,
) {
let value = Value::PreTokStr(pre_tokenized_text.clone());
self.add(FieldValue::new(field, value));
}

View File

@@ -280,8 +280,7 @@ mod tests {
"record": "position",
"tokenizer": "default"
},
"stored": false,
"tokenized": false
"stored": false
}
}"#;
let field_value_json = serde_json::to_string_pretty(&field_value).unwrap();

View File

@@ -5,7 +5,7 @@ use crate::schema::IndexRecordOption;
use crate::schema::TextFieldIndexing;
use crate::schema::Value;
use crate::schema::{IntOptions, TextOptions};
use crate::tokenizer::TokenizedString;
use crate::tokenizer::PreTokenizedString;
use serde_json::Value as JsonValue;
/// Possible error that may occur while parsing a field value
@@ -170,23 +170,15 @@ impl FieldType {
}
},
JsonValue::Object(_) => match *self {
FieldType::Str(ref text_options) => {
if text_options.is_tokenized() {
if let Ok(tok_str_val) =
serde_json::from_value::<TokenizedString>(json.clone())
{
Ok(Value::TokStr(tok_str_val))
} else {
let msg = format!(
"Json value {:?} cannot be translated to TokenizedString.",
json
);
Err(ValueParsingError::TypeError(msg))
}
FieldType::Str(_) => {
if let Ok(tok_str_val) =
serde_json::from_value::<PreTokenizedString>(json.clone())
{
Ok(Value::PreTokStr(tok_str_val))
} else {
let msg = format!(
"Json value not supported error {:?}. Expected {:?}",
json, self
"Json value {:?} cannot be translated to PreTokenizedString.",
json
);
Err(ValueParsingError::TypeError(msg))
}
@@ -214,7 +206,9 @@ impl FieldType {
mod tests {
use super::FieldType;
use crate::schema::field_type::ValueParsingError;
use crate::schema::TextOptions;
use crate::schema::Value;
use crate::tokenizer::{PreTokenizedString, Token};
#[test]
fn test_bytes_value_from_json() {
@@ -235,4 +229,71 @@ mod tests {
_ => panic!("Expected parse failure for invalid base64"),
}
}
/// Round-trip test for pre-tokenized strings expressed as JSON.
///
/// Parses a JSON object with `text` and `tokens` keys through
/// `FieldType::Str(..).value_from_json` and checks that the result is the
/// expected `Value::PreTokStr`, then serializes the expected value back with
/// `serde_json::to_string_pretty` and checks it matches the input text exactly.
#[test]
fn test_pre_tok_str_value_from_json() {
// JSON fixture: the text "The Old Man" with three tokens. It is also the
// expected output of the pretty serialization asserted at the end of the test.
let pre_tokenized_string_json = r#"{
"text": "The Old Man",
"tokens": [
{
"offset_from": 0,
"offset_to": 3,
"position": 0,
"text": "The",
"position_length": 1
},
{
"offset_from": 4,
"offset_to": 7,
"position": 1,
"text": "Old",
"position_length": 1
},
{
"offset_from": 8,
"offset_to": 11,
"position": 2,
"text": "Man",
"position_length": 1
}
]
}"#;
// The same content built directly as a `Value::PreTokStr`; field values
// mirror the JSON fixture token by token.
let expected_value = Value::PreTokStr(PreTokenizedString {
text: String::from("The Old Man"),
tokens: vec![
Token {
offset_from: 0,
offset_to: 3,
position: 0,
text: String::from("The"),
position_length: 1,
},
Token {
offset_from: 4,
offset_to: 7,
position: 1,
text: String::from("Old"),
position_length: 1,
},
Token {
offset_from: 8,
offset_to: 11,
position: 2,
text: String::from("Man"),
position_length: 1,
},
],
});
// Deserialization path: a text field built from default `TextOptions`
// must parse the JSON object into the pre-tokenized value.
let deserialized_value = FieldType::Str(TextOptions::default())
.value_from_json(&serde_json::from_str(pre_tokenized_string_json).unwrap())
.unwrap();
assert_eq!(deserialized_value, expected_value);
// Serialization path: pretty-printing the value must reproduce the fixture
// string exactly, so this comparison is sensitive to key order and whitespace.
let serialized_value_json = serde_json::to_string_pretty(&expected_value).unwrap();
assert_eq!(serialized_value_json, pre_tokenized_string_json);
}
}

View File

@@ -141,7 +141,6 @@ pub use self::text_options::TextFieldIndexing;
pub use self::text_options::TextOptions;
pub use self::text_options::STRING;
pub use self::text_options::TEXT;
pub use self::text_options::TOKENIZED;
pub use self::flags::{FAST, INDEXED, STORED};
pub use self::int_options::Cardinality;

View File

@@ -443,8 +443,7 @@ mod tests {
"record": "position",
"tokenizer": "default"
},
"stored": false,
"tokenized": false
"stored": false
}
},
{
@@ -455,8 +454,7 @@ mod tests {
"record": "basic",
"tokenizer": "raw"
},
"stored": false,
"tokenized": false
"stored": false
}
},
{

View File

@@ -9,7 +9,6 @@ use std::ops::BitOr;
pub struct TextOptions {
indexing: Option<TextFieldIndexing>,
stored: bool,
tokenized: bool,
}
impl TextOptions {
@@ -34,17 +33,6 @@ impl TextOptions {
self.indexing = Some(indexing);
self
}
/// Returns true if the text is already tokenized in the form of TokenString
pub fn is_tokenized(&self) -> bool {
self.tokenized
}
/// Sets the field as already tokenized
pub fn set_tokenized(mut self) -> TextOptions {
self.tokenized = true;
self
}
}
impl Default for TextOptions {
@@ -52,7 +40,6 @@ impl Default for TextOptions {
TextOptions {
indexing: None,
stored: false,
tokenized: false,
}
}
}
@@ -113,7 +100,6 @@ pub const STRING: TextOptions = TextOptions {
record: IndexRecordOption::Basic,
}),
stored: false,
tokenized: false,
};
/// The field will be tokenized and indexed
@@ -123,14 +109,6 @@ pub const TEXT: TextOptions = TextOptions {
record: IndexRecordOption::WithFreqsAndPositions,
}),
stored: false,
tokenized: false,
};
/// The field is already tokenized, should come as TokenizedString
pub const TOKENIZED: TextOptions = TextOptions {
indexing: None,
stored: false,
tokenized: true,
};
impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
@@ -141,7 +119,6 @@ impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
let mut res = TextOptions::default();
res.indexing = self.indexing.or(other.indexing);
res.stored = self.stored | other.stored;
res.tokenized = self.tokenized | other.tokenized;
res
}
}
@@ -157,7 +134,6 @@ impl From<StoredFlag> for TextOptions {
TextOptions {
indexing: None,
stored: true,
tokenized: false,
}
}
}
@@ -182,14 +158,8 @@ mod tests {
{
let field_options = STORED | TEXT;
assert!(field_options.is_stored());
assert!(!field_options.is_tokenized());
assert!(field_options.get_indexing_options().is_some());
}
{
let field_options = STORED | TOKENIZED;
assert!(field_options.is_stored());
assert!(field_options.is_tokenized());
}
{
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("body", TEXT);

View File

@@ -1,5 +1,5 @@
use crate::schema::Facet;
use crate::tokenizer::TokenizedString;
use crate::tokenizer::PreTokenizedString;
use crate::DateTime;
use serde::de::Visitor;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
@@ -11,8 +11,8 @@ use std::{cmp::Ordering, fmt};
pub enum Value {
/// The str type is used for any text information.
Str(String),
/// Tokenized str type,
TokStr(TokenizedString),
/// Pre-tokenized str type,
PreTokStr(PreTokenizedString),
/// Unsigned 64-bits Integer `u64`
U64(u64),
/// Signed 64-bits Integer `i64`
@@ -32,7 +32,7 @@ impl Ord for Value {
fn cmp(&self, other: &Self) -> Ordering {
match (self, other) {
(Value::Str(l), Value::Str(r)) => l.cmp(r),
(Value::TokStr(l), Value::TokStr(r)) => l.cmp(r),
(Value::PreTokStr(l), Value::PreTokStr(r)) => l.cmp(r),
(Value::U64(l), Value::U64(r)) => l.cmp(r),
(Value::I64(l), Value::I64(r)) => l.cmp(r),
(Value::Date(l), Value::Date(r)) => l.cmp(r),
@@ -48,8 +48,8 @@ impl Ord for Value {
}
(Value::Str(_), _) => Ordering::Less,
(_, Value::Str(_)) => Ordering::Greater,
(Value::TokStr(_), _) => Ordering::Less,
(_, Value::TokStr(_)) => Ordering::Greater,
(Value::PreTokStr(_), _) => Ordering::Less,
(_, Value::PreTokStr(_)) => Ordering::Greater,
(Value::U64(_), _) => Ordering::Less,
(_, Value::U64(_)) => Ordering::Greater,
(Value::I64(_), _) => Ordering::Less,
@@ -71,7 +71,7 @@ impl Serialize for Value {
{
match *self {
Value::Str(ref v) => serializer.serialize_str(v),
Value::TokStr(ref v) => v.serialize(serializer),
Value::PreTokStr(ref v) => v.serialize(serializer),
Value::U64(u) => serializer.serialize_u64(u),
Value::I64(u) => serializer.serialize_i64(u),
Value::F64(u) => serializer.serialize_f64(u),
@@ -131,11 +131,11 @@ impl Value {
}
}
/// Returns the tokenized text, provided the value is of the `TokStr` type.
/// (Returns None if the value is not of the `TokStr` type).
pub fn tokenized_text(&self) -> Option<&TokenizedString> {
/// Returns the pre-tokenized text, provided the value is of the `PreTokStr` type.
/// (Returns None if the value is not of the `PreTokStr` type).
pub fn tokenized_text(&self) -> Option<&PreTokenizedString> {
match *self {
Value::TokStr(ref tok_text) => Some(tok_text),
Value::PreTokStr(ref tok_text) => Some(tok_text),
_ => None,
}
}
@@ -237,7 +237,7 @@ mod binary_serialize {
use super::Value;
use crate::common::{f64_to_u64, u64_to_f64, BinarySerializable};
use crate::schema::Facet;
use crate::tokenizer::TokenizedString;
use crate::tokenizer::PreTokenizedString;
use chrono::{TimeZone, Utc};
use std::io::{self, Read, Write};
@@ -261,7 +261,7 @@ mod binary_serialize {
TEXT_CODE.serialize(writer)?;
text.serialize(writer)
}
Value::TokStr(ref tok_str) => {
Value::PreTokStr(ref tok_str) => {
EXT_CODE.serialize(writer)?;
TOK_STR_CODE.serialize(writer)?;
if let Ok(text) = serde_json::to_string(tok_str) {
@@ -269,7 +269,7 @@ mod binary_serialize {
} else {
Err(io::Error::new(
io::ErrorKind::Other,
"Failed to dump Value::TokStr(_) to json.",
"Failed to dump Value::PreTokStr(_) to json.",
))
}
}
@@ -329,12 +329,13 @@ mod binary_serialize {
match ext_type_code {
TOK_STR_CODE => {
let str_val = String::deserialize(reader)?;
if let Ok(value) = serde_json::from_str::<TokenizedString>(&str_val) {
Ok(Value::TokStr(value))
if let Ok(value) = serde_json::from_str::<PreTokenizedString>(&str_val)
{
Ok(Value::PreTokStr(value))
} else {
Err(io::Error::new(
io::ErrorKind::Other,
"Failed to parse string data as Value::TokStr(_).",
"Failed to parse string data as Value::PreTokStr(_).",
))
}
}

View File

@@ -153,7 +153,7 @@ pub use self::stop_word_filter::StopWordFilter;
pub(crate) use self::token_stream_chain::TokenStreamChain;
pub use self::tokenizer::BoxedTokenizer;
pub use self::tokenized_string::{TokenizedStream, TokenizedString};
pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
pub use self::tokenizer_manager::TokenizerManager;

View File

@@ -1,49 +1,49 @@
use crate::tokenizer::{Token, TokenStream, TokenStreamChain};
use std::cmp::Ordering;
/// Struct representing tokenized text
/// Struct representing pre-tokenized text
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
pub struct TokenizedString {
pub struct PreTokenizedString {
/// Original text
pub text: String,
/// Tokens derived from the text
pub tokens: Vec<Token>,
}
impl Ord for TokenizedString {
impl Ord for PreTokenizedString {
fn cmp(&self, other: &Self) -> Ordering {
self.text.cmp(&other.text)
}
}
impl PartialOrd for TokenizedString {
impl PartialOrd for PreTokenizedString {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
/// TokenStream implementation which wraps TokenizedString
pub struct TokenizedStream {
tokenized_string: TokenizedString,
/// TokenStream implementation which wraps PreTokenizedString
pub struct PreTokenizedStream {
tokenized_string: PreTokenizedString,
current_token: i64,
}
impl From<TokenizedString> for TokenizedStream {
fn from(s: TokenizedString) -> TokenizedStream {
TokenizedStream {
impl From<PreTokenizedString> for PreTokenizedStream {
fn from(s: PreTokenizedString) -> PreTokenizedStream {
PreTokenizedStream {
tokenized_string: s,
current_token: -1,
}
}
}
impl TokenizedStream {
/// Creates a TokenStream from TokenizedString array
impl PreTokenizedStream {
/// Creates a TokenStream from PreTokenizedString array
pub fn chain_tokenized_strings<'a>(
tok_strings: &'a [&'a TokenizedString],
tok_strings: &'a [&'a PreTokenizedString],
) -> Box<dyn TokenStream + 'a> {
if tok_strings.len() == 1 {
Box::new(TokenizedStream::from((*tok_strings[0]).clone()))
Box::new(PreTokenizedStream::from((*tok_strings[0]).clone()))
} else {
let mut offsets = vec![];
let mut total_offset = 0;
@@ -57,14 +57,14 @@ impl TokenizedStream {
}
let token_streams: Vec<_> = tok_strings
.iter()
.map(|tok_string| TokenizedStream::from((*tok_string).clone()))
.map(|tok_string| PreTokenizedStream::from((*tok_string).clone()))
.collect();
Box::new(TokenStreamChain::new(offsets, token_streams))
}
}
}
impl TokenStream for TokenizedStream {
impl TokenStream for PreTokenizedStream {
fn advance(&mut self) -> bool {
self.current_token += 1;
self.current_token < self.tokenized_string.tokens.len() as i64
@@ -94,7 +94,7 @@ mod tests {
#[test]
fn test_tokenized_stream() {
let tok_text = TokenizedString {
let tok_text = PreTokenizedString {
text: String::from("A a"),
tokens: vec![
Token {
@@ -114,7 +114,7 @@ mod tests {
],
};
let mut tok_stream = TokenizedStream::from(tok_text.clone());
let mut tok_stream = PreTokenizedStream::from(tok_text.clone());
let mut i = 0;
while tok_stream.advance() {
@@ -125,7 +125,7 @@ mod tests {
#[test]
fn test_chain_tokenized_strings() {
let tok_text = TokenizedString {
let tok_text = PreTokenizedString {
text: String::from("A a"),
tokens: vec![
Token {
@@ -147,7 +147,7 @@ mod tests {
let chain_parts = vec![&tok_text, &tok_text];
let mut tok_stream = TokenizedStream::chain_tokenized_strings(&chain_parts[..]);
let mut tok_stream = PreTokenizedStream::chain_tokenized_strings(&chain_parts[..]);
let expected_tokens = vec![
Token {