diff --git a/CHANGELOG.md b/CHANGELOG.md
index cd0948ded..758449373 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,7 +11,7 @@ Tantivy 0.11.0
 - Add footer with some metadata to index files. #605 (@fdb-hiroshima)
 - TopDocs collector: ensure stable sorting on equal score. #671 (@brainlock)
 - Added handling of pre-tokenized text fields (#642), which will enable users to
-  load tokens created outside tantivy. See usage in examples/pre_tokenized_text.
+  load tokens created outside tantivy. See usage in examples/pre_tokenized_text. (@kkoziara)
 
 ## How to update?
diff --git a/examples/pre_tokenized_text.rs b/examples/pre_tokenized_text.rs
index 601064980..af3f3a981 100644
--- a/examples/pre_tokenized_text.rs
+++ b/examples/pre_tokenized_text.rs
@@ -9,7 +9,7 @@
 // - import tokenized text straight from json,
 // - perform a search on documents with pre-tokenized text
 
-use tantivy::tokenizer::{SimpleTokenizer, Token, TokenStream, TokenizedString, Tokenizer};
+use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, TokenStream, Tokenizer};
 
 use tantivy::collector::{Count, TopDocs};
 use tantivy::query::TermQuery;
@@ -17,11 +17,11 @@ use tantivy::schema::*;
 use tantivy::{doc, Index, ReloadPolicy};
 use tempfile::TempDir;
 
-fn tokenize_it(text: &str) -> Vec<Token> {
-    let mut ts = SimpleTokenizer.token_stream(text);
+fn pre_tokenize_text(text: &str) -> Vec<Token> {
+    let mut token_stream = SimpleTokenizer.token_stream(text);
     let mut tokens = vec![];
-    while ts.advance() {
-        tokens.push(ts.token().clone());
+    while token_stream.advance() {
+        tokens.push(token_stream.token().clone());
     }
     tokens
 }
@@ -31,11 +31,8 @@ fn main() -> tantivy::Result<()> {
 
     let mut schema_builder = Schema::builder();
 
-    // now we add `TOKENIZED` `TextOptions` to mark field as pre-tokenized
-    // in addition the title will be also stored, so we can see it in
-    // returned results
-    schema_builder.add_text_field("title", TEXT | STORED | TOKENIZED);
-    schema_builder.add_text_field("body", TEXT | TOKENIZED);
+    schema_builder.add_text_field("title", TEXT | STORED);
+    schema_builder.add_text_field("body", TEXT);
 
     let schema = schema_builder.build();
@@ -52,10 +49,10 @@ fn main() -> tantivy::Result<()> {
     let body_text = "He was an old man who fished alone in a skiff in the Gulf Stream";
 
     // Content of our first document
-    // We create `TokenizedString` which contains original text and vector of tokens
-    let title_tok = TokenizedString {
+    // We create `PreTokenizedString` which contains original text and vector of tokens
+    let title_tok = PreTokenizedString {
         text: String::from(title_text),
-        tokens: tokenize_it(title_text),
+        tokens: pre_tokenize_text(title_text),
     };
 
     println!(
@@ -63,21 +60,21 @@ fn main() -> tantivy::Result<()> {
         title_tok.text, title_tok.tokens
    );
 
-    let body_tok = TokenizedString {
+    let body_tok = PreTokenizedString {
         text: String::from(body_text),
-        tokens: tokenize_it(body_text),
+        tokens: pre_tokenize_text(body_text),
     };
 
-    // Now lets create a document and add our `TokenizedString` using
-    // `add_tokenized_text` method of `Document`
+    // Now let's create a document and add our `PreTokenizedString` using
+    // the `add_pre_tokenized_text` method of `Document`
     let mut old_man_doc = Document::default();
-    old_man_doc.add_tokenized_text(title, &title_tok);
-    old_man_doc.add_tokenized_text(body, &body_tok);
+    old_man_doc.add_pre_tokenized_text(title, &title_tok);
+    old_man_doc.add_pre_tokenized_text(body, &body_tok);
 
     // ... now let's just add it to the IndexWriter
     index_writer.add_document(old_man_doc);
 
-    // `Document` can be obtained directly from JSON:
+    // Pretokenized text can also be fed as JSON
     let short_man_json = r#"{
   "title":[{
      "text":"The Old Man",
@@ -106,6 +103,8 @@ fn main() -> tantivy::Result<()> {
     let searcher = reader.searcher();
 
     // We want to get documents with token "Man", we will use TermQuery to do it
+    // Using PreTokenizedString means the tokens are stored as is avoiding stemming
+    // and lowercasing, which preserves full words in their original form
     let query = TermQuery::new(
         Term::from_field_text(title, "Man"),
         IndexRecordOption::Basic,
     );
@@ -124,14 +123,14 @@ fn main() -> tantivy::Result<()> {
 
     // In contrary to the previous query, when we search for the "man" term we
     // should get no results, as it's not one of the indexed tokens. SimpleTokenizer
-    // only splits text on whitespace / interpunction.
+    // only splits text on whitespace / punctuation.
     let query = TermQuery::new(
-        Term::from_field_text(title, "nan"),
+        Term::from_field_text(title, "man"),
         IndexRecordOption::Basic,
     );
 
-    let (top_docs, count) = searcher
+    let (_top_docs, count) = searcher
         .search(&query, &(TopDocs::with_limit(2), Count))
         .unwrap();
 
diff --git a/src/core/index_meta.rs b/src/core/index_meta.rs
index 1a992bc66..a1be67fee 100644
--- a/src/core/index_meta.rs
+++ b/src/core/index_meta.rs
@@ -285,6 +285,6 @@ mod tests {
             payload: None,
         };
         let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
-        assert_eq!(json, r#"{"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false,"tokenized":false}}],"opstamp":0}"#);
+        assert_eq!(json, r#"{"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false}}],"opstamp":0}"#);
     }
 }
diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs
index 0b42f7e58..1d4f900fa 100644
--- a/src/indexer/segment_writer.rs
+++ b/src/indexer/segment_writer.rs
@@ -13,8 +13,8 @@ use crate::schema::Value;
 use crate::schema::{Field, FieldEntry};
 use crate::tokenizer::BoxedTokenizer;
 use crate::tokenizer::FacetTokenizer;
-use crate::tokenizer::{TokenStream, Tokenizer};
-use crate::tokenizer::{TokenizedStream, TokenizedString};
+use crate::tokenizer::PreTokenizedStream;
+use crate::tokenizer::{TokenStream, TokenStreamChain, Tokenizer};
 use crate::DocId;
 use crate::Opstamp;
 use crate::Result;
@@ -158,47 +158,46 @@ impl SegmentWriter {
                         }
                     }
                 }
-                FieldType::Str(ref text_options) => {
-                    let num_tokens = if text_options.is_tokenized() {
-                        let tok_strings: Vec<&TokenizedString> = field_values
-                            .iter()
-                            .flat_map(|field_value| match *field_value.value() {
-                                Value::TokStr(ref tok_str) => Some(tok_str),
-                                _ => None,
-                            })
-                            .collect();
-                        if tok_strings.is_empty() {
-                            0
-                        } else {
-                            let mut token_stream =
-                                TokenizedStream::chain_tokenized_strings(&tok_strings[..]);
-                            self.multifield_postings
-                                .index_text(doc_id, field, &mut token_stream)
-                        }
-                    } else {
-                        if let Some(ref mut tokenizer) = self.tokenizers[field.field_id() as usize]
-                        {
-                            let texts: Vec<&str> = field_values
-                                .iter()
-                                .flat_map(|field_value| match *field_value.value() {
-                                    Value::Str(ref text) => Some(text.as_str()),
-                                    _ => None,
-                                })
-                                .collect();
-                            if texts.is_empty() {
-                                0
-                            } else {
-                                let mut token_stream = tokenizer.token_stream_texts(&texts[..]);
-                                self.multifield_postings.index_text(
-                                    doc_id,
-                                    field,
-                                    &mut token_stream,
-                                )
+                FieldType::Str(_) => {
+                    let mut token_streams: Vec<Box<dyn TokenStream>> = vec![];
+                    let mut offsets = vec![];
+                    let mut total_offset = 0;
+
+                    for field_value in field_values {
+                        match field_value.value() {
+                            Value::PreTokStr(tok_str) => {
+                                offsets.push(total_offset);
+                                total_offset += match tok_str.tokens.last() {
+                                    Some(token) => token.offset_to,
+                                    None => 0,
+                                };
+
+                                token_streams
+                                    .push(Box::new(PreTokenizedStream::from(tok_str.clone())));
                             }
-                        } else {
-                            0
+                            Value::Str(ref text) => {
+                                if let Some(ref mut tokenizer) =
+                                    self.tokenizers[field.field_id() as usize]
+                                {
+                                    offsets.push(total_offset);
+                                    total_offset += text.len();
+
+                                    token_streams.push(tokenizer.token_stream(text));
+                                }
+                            }
+                            _ => (),
                         }
+                    }
+
+                    let num_tokens = if token_streams.is_empty() {
+                        0
+                    } else {
+                        let mut token_stream: Box<dyn TokenStream> =
+                            Box::new(TokenStreamChain::new(offsets, token_streams));
+                        self.multifield_postings
+                            .index_text(doc_id, field, &mut token_stream)
                     };
+
                     self.fieldnorms_writer.record(doc_id, field, num_tokens);
                 }
                 FieldType::U64(ref int_option) => {
diff --git a/src/schema/document.rs b/src/schema/document.rs
index 6e6798cc3..f9d0ceb98 100644
--- a/src/schema/document.rs
+++ b/src/schema/document.rs
@@ -1,7 +1,7 @@
 use super::*;
 use crate::common::BinarySerializable;
 use crate::common::VInt;
-use crate::tokenizer::TokenizedString;
+use crate::tokenizer::PreTokenizedString;
 use crate::DateTime;
 use itertools::Itertools;
 use std::io::{self, Read, Write};
@@ -79,9 +79,13 @@ impl Document {
         self.add(FieldValue::new(field, value));
     }
 
-    /// Add a text field with tokens.
-    pub fn add_tokenized_text(&mut self, field: Field, tokenized_text: &TokenizedString) {
-        let value = Value::TokStr(tokenized_text.clone());
+    /// Add a pre-tokenized text field.
+    pub fn add_pre_tokenized_text(
+        &mut self,
+        field: Field,
+        pre_tokenized_text: &PreTokenizedString,
+    ) {
+        let value = Value::PreTokStr(pre_tokenized_text.clone());
         self.add(FieldValue::new(field, value));
     }
 
diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs
index e4630fbf8..0c3e5f849 100644
--- a/src/schema/field_entry.rs
+++ b/src/schema/field_entry.rs
@@ -280,8 +280,7 @@ mod tests {
            "record": "position",
            "tokenizer": "default"
          },
-         "stored": false,
-         "tokenized": false
+         "stored": false
        }
      }"#;
         let field_value_json = serde_json::to_string_pretty(&field_value).unwrap();
diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs
index d311d4934..27aaee8fc 100644
--- a/src/schema/field_type.rs
+++ b/src/schema/field_type.rs
@@ -5,7 +5,7 @@ use crate::schema::IndexRecordOption;
 use crate::schema::TextFieldIndexing;
 use crate::schema::Value;
 use crate::schema::{IntOptions, TextOptions};
-use crate::tokenizer::TokenizedString;
+use crate::tokenizer::PreTokenizedString;
 use serde_json::Value as JsonValue;
 
 /// Possible error that may occur while parsing a field value
@@ -170,23 +170,15 @@ impl FieldType {
                 }
             },
             JsonValue::Object(_) => match *self {
-                FieldType::Str(ref text_options) => {
-                    if text_options.is_tokenized() {
-                        if let Ok(tok_str_val) =
-                            serde_json::from_value::<TokenizedString>(json.clone())
-                        {
-                            Ok(Value::TokStr(tok_str_val))
-                        } else {
-                            let msg = format!(
-                                "Json value {:?} cannot be translated to TokenizedString.",
-                                json
-                            );
-                            Err(ValueParsingError::TypeError(msg))
-                        }
+                FieldType::Str(_) => {
+                    if let Ok(tok_str_val) =
+                        serde_json::from_value::<PreTokenizedString>(json.clone())
+                    {
+                        Ok(Value::PreTokStr(tok_str_val))
                     } else {
                         let msg = format!(
-                            "Json value not supported error {:?}. Expected {:?}",
-                            json, self
+                            "Json value {:?} cannot be translated to PreTokenizedString.",
+                            json
                         );
                         Err(ValueParsingError::TypeError(msg))
                     }
@@ -214,7 +206,9 @@ mod tests {
     use super::FieldType;
     use crate::schema::field_type::ValueParsingError;
+    use crate::schema::TextOptions;
     use crate::schema::Value;
+    use crate::tokenizer::{PreTokenizedString, Token};
 
     #[test]
     fn test_bytes_value_from_json() {
@@ -235,4 +229,71 @@ mod tests {
             _ => panic!("Expected parse failure for invalid base64"),
         }
     }
+
+    #[test]
+    fn test_pre_tok_str_value_from_json() {
+        let pre_tokenized_string_json = r#"{
+  "text": "The Old Man",
+  "tokens": [
+    {
+      "offset_from": 0,
+      "offset_to": 3,
+      "position": 0,
+      "text": "The",
+      "position_length": 1
+    },
+    {
+      "offset_from": 4,
+      "offset_to": 7,
+      "position": 1,
+      "text": "Old",
+      "position_length": 1
+    },
+    {
+      "offset_from": 8,
+      "offset_to": 11,
+      "position": 2,
+      "text": "Man",
+      "position_length": 1
+    }
+  ]
+}"#;
+
+        let expected_value = Value::PreTokStr(PreTokenizedString {
+            text: String::from("The Old Man"),
+            tokens: vec![
+                Token {
+                    offset_from: 0,
+                    offset_to: 3,
+                    position: 0,
+                    text: String::from("The"),
+                    position_length: 1,
+                },
+                Token {
+                    offset_from: 4,
+                    offset_to: 7,
+                    position: 1,
+                    text: String::from("Old"),
+                    position_length: 1,
+                },
+                Token {
+                    offset_from: 8,
+                    offset_to: 11,
+                    position: 2,
+                    text: String::from("Man"),
+                    position_length: 1,
+                },
+            ],
+        });
+
+        let deserialized_value = FieldType::Str(TextOptions::default())
+            .value_from_json(&serde_json::from_str(pre_tokenized_string_json).unwrap())
+            .unwrap();
+
+        assert_eq!(deserialized_value, expected_value);
+
+        let serialized_value_json = serde_json::to_string_pretty(&expected_value).unwrap();
+
+        assert_eq!(serialized_value_json, pre_tokenized_string_json);
+    }
 }
diff --git a/src/schema/mod.rs b/src/schema/mod.rs
index 374f1c83b..0e669d115 100644
--- a/src/schema/mod.rs
+++ b/src/schema/mod.rs
@@ -141,7 +141,6 @@ pub use self::text_options::TextFieldIndexing;
 pub use self::text_options::TextOptions;
 pub use self::text_options::STRING;
 pub use self::text_options::TEXT;
-pub use self::text_options::TOKENIZED;
 
 pub use self::flags::{FAST, INDEXED, STORED};
 pub use self::int_options::Cardinality;
diff --git a/src/schema/schema.rs b/src/schema/schema.rs
index 165426359..db10036ae 100644
--- a/src/schema/schema.rs
+++ b/src/schema/schema.rs
@@ -443,8 +443,7 @@ mod tests {
            "record": "position",
            "tokenizer": "default"
          },
-         "stored": false,
-         "tokenized": false
+         "stored": false
        }
      },
      {
@@ -455,8 +454,7 @@ mod tests {
            "record": "basic",
            "tokenizer": "raw"
          },
-         "stored": false,
-         "tokenized": false
+         "stored": false
        }
      },
      {
diff --git a/src/schema/text_options.rs b/src/schema/text_options.rs
index f0322f6a8..11ab8accd 100644
--- a/src/schema/text_options.rs
+++ b/src/schema/text_options.rs
@@ -9,7 +9,6 @@ use std::ops::BitOr;
 pub struct TextOptions {
     indexing: Option<TextFieldIndexing>,
     stored: bool,
-    tokenized: bool,
 }
 
 impl TextOptions {
@@ -34,17 +33,6 @@ impl TextOptions {
         self.indexing = Some(indexing);
         self
     }
-
-    /// Returns true if the text is already tokenized in the form of TokenString
-    pub fn is_tokenized(&self) -> bool {
-        self.tokenized
-    }
-
-    /// Sets the field as already tokenized
-    pub fn set_tokenized(mut self) -> TextOptions {
-        self.tokenized = true;
-        self
-    }
 }
 
 impl Default for TextOptions {
@@ -52,7 +40,6 @@ impl Default for TextOptions {
         TextOptions {
             indexing: None,
             stored: false,
-            tokenized: false,
         }
     }
 }
@@ -113,7 +100,6 @@ pub const STRING: TextOptions = TextOptions {
         record: IndexRecordOption::Basic,
     }),
     stored: false,
-    tokenized: false,
 };
 
 /// The field will be tokenized and indexed
@@ -123,14 +109,6 @@ pub const TEXT: TextOptions = TextOptions {
         record: IndexRecordOption::WithFreqsAndPositions,
     }),
     stored: false,
-    tokenized: false,
-};
-
-/// The field is already tokenized, should come as TokenizedString
-pub const TOKENIZED: TextOptions = TextOptions {
-    indexing: None,
-    stored: false,
-    tokenized: true,
 };
 
 impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
@@ -141,7 +119,6 @@ impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
         let mut res = TextOptions::default();
         res.indexing = self.indexing.or(other.indexing);
         res.stored = self.stored | other.stored;
-        res.tokenized = self.tokenized | other.tokenized;
         res
     }
 }
@@ -157,7 +134,6 @@ impl From<StoredFlag> for TextOptions {
         TextOptions {
             indexing: None,
             stored: true,
-            tokenized: false,
         }
     }
 }
@@ -182,14 +158,8 @@ mod tests {
         {
             let field_options = STORED | TEXT;
             assert!(field_options.is_stored());
-            assert!(!field_options.is_tokenized());
             assert!(field_options.get_indexing_options().is_some());
         }
-        {
-            let field_options = STORED | TOKENIZED;
-            assert!(field_options.is_stored());
-            assert!(field_options.is_tokenized());
-        }
         {
             let mut schema_builder = Schema::builder();
             schema_builder.add_text_field("body", TEXT);
diff --git a/src/schema/value.rs b/src/schema/value.rs
index 7d4c1b73a..e8fedd68c 100644
--- a/src/schema/value.rs
+++ b/src/schema/value.rs
@@ -1,5 +1,5 @@
 use crate::schema::Facet;
-use crate::tokenizer::TokenizedString;
+use crate::tokenizer::PreTokenizedString;
 use crate::DateTime;
 use serde::de::Visitor;
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
 use std::{cmp::Ordering, fmt};
@@ -11,8 +11,8 @@
 pub enum Value {
     /// The str type is used for any text information.
     Str(String),
-    /// Tokenized str type,
-    TokStr(TokenizedString),
+    /// Pre-tokenized str type,
+    PreTokStr(PreTokenizedString),
     /// Unsigned 64-bits Integer `u64`
     U64(u64),
     /// Signed 64-bits Integer `i64`
@@ -32,7 +32,7 @@ impl Ord for Value {
     fn cmp(&self, other: &Self) -> Ordering {
         match (self, other) {
             (Value::Str(l), Value::Str(r)) => l.cmp(r),
-            (Value::TokStr(l), Value::TokStr(r)) => l.cmp(r),
+            (Value::PreTokStr(l), Value::PreTokStr(r)) => l.cmp(r),
             (Value::U64(l), Value::U64(r)) => l.cmp(r),
             (Value::I64(l), Value::I64(r)) => l.cmp(r),
             (Value::Date(l), Value::Date(r)) => l.cmp(r),
@@ -48,8 +48,8 @@
             }
             (Value::Str(_), _) => Ordering::Less,
             (_, Value::Str(_)) => Ordering::Greater,
-            (Value::TokStr(_), _) => Ordering::Less,
-            (_, Value::TokStr(_)) => Ordering::Greater,
+            (Value::PreTokStr(_), _) => Ordering::Less,
+            (_, Value::PreTokStr(_)) => Ordering::Greater,
             (Value::U64(_), _) => Ordering::Less,
             (_, Value::U64(_)) => Ordering::Greater,
             (Value::I64(_), _) => Ordering::Less,
             (_, Value::I64(_)) => Ordering::Greater,
@@ -71,7 +71,7 @@ impl Serialize for Value {
     {
         match *self {
             Value::Str(ref v) => serializer.serialize_str(v),
-            Value::TokStr(ref v) => v.serialize(serializer),
+            Value::PreTokStr(ref v) => v.serialize(serializer),
             Value::U64(u) => serializer.serialize_u64(u),
             Value::I64(u) => serializer.serialize_i64(u),
             Value::F64(u) => serializer.serialize_f64(u),
@@ -131,11 +131,11 @@ impl Value {
         }
     }
 
-    /// Returns the tokenized text, provided the value is of the `TokStr` type.
-    /// (Returns None if the value is not of the `TokStr` type).
-    pub fn tokenized_text(&self) -> Option<&TokenizedString> {
+    /// Returns the tokenized text, provided the value is of the `PreTokStr` type.
+    /// (Returns None if the value is not of the `PreTokStr` type).
+    pub fn tokenized_text(&self) -> Option<&PreTokenizedString> {
         match *self {
-            Value::TokStr(ref tok_text) => Some(tok_text),
+            Value::PreTokStr(ref tok_text) => Some(tok_text),
             _ => None,
         }
     }
@@ -237,7 +237,7 @@ mod binary_serialize {
     use super::Value;
     use crate::common::{f64_to_u64, u64_to_f64, BinarySerializable};
     use crate::schema::Facet;
-    use crate::tokenizer::TokenizedString;
+    use crate::tokenizer::PreTokenizedString;
     use chrono::{TimeZone, Utc};
     use std::io::{self, Read, Write};
 
@@ -261,7 +261,7 @@ mod binary_serialize {
                 TEXT_CODE.serialize(writer)?;
                 text.serialize(writer)
             }
-            Value::TokStr(ref tok_str) => {
+            Value::PreTokStr(ref tok_str) => {
                 EXT_CODE.serialize(writer)?;
                 TOK_STR_CODE.serialize(writer)?;
                 if let Ok(text) = serde_json::to_string(tok_str) {
@@ -269,7 +269,7 @@ mod binary_serialize {
                 } else {
                     Err(io::Error::new(
                         io::ErrorKind::Other,
-                        "Failed to dump Value::TokStr(_) to json.",
+                        "Failed to dump Value::PreTokStr(_) to json.",
                     ))
                 }
             }
@@ -329,12 +329,13 @@ mod binary_serialize {
                 match ext_type_code {
                     TOK_STR_CODE => {
                         let str_val = String::deserialize(reader)?;
-                        if let Ok(value) = serde_json::from_str::<TokenizedString>(&str_val) {
-                            Ok(Value::TokStr(value))
+                        if let Ok(value) = serde_json::from_str::<PreTokenizedString>(&str_val)
+                        {
+                            Ok(Value::PreTokStr(value))
                         } else {
                             Err(io::Error::new(
                                 io::ErrorKind::Other,
-                                "Failed to parse string data as Value::TokStr(_).",
+                                "Failed to parse string data as Value::PreTokStr(_).",
                             ))
                         }
                     }
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index 5c1d70a71..72cf980a4 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -153,7 +153,7 @@ pub use self::stop_word_filter::StopWordFilter;
 pub(crate) use self::token_stream_chain::TokenStreamChain;
 pub use self::tokenizer::BoxedTokenizer;
-pub use self::tokenized_string::{TokenizedStream, TokenizedString};
+pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
 pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
 pub use self::tokenizer_manager::TokenizerManager;
diff --git a/src/tokenizer/tokenized_string.rs b/src/tokenizer/tokenized_string.rs
index dfb548956..86bb74c94 100644
--- a/src/tokenizer/tokenized_string.rs
+++ b/src/tokenizer/tokenized_string.rs
@@ -1,49 +1,49 @@
 use crate::tokenizer::{Token, TokenStream, TokenStreamChain};
 use std::cmp::Ordering;
 
-/// Struct representing tokenized text
+/// Struct representing pre-tokenized text
 #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
-pub struct TokenizedString {
+pub struct PreTokenizedString {
     /// Original text
     pub text: String,
     /// Tokens derived from the text
     pub tokens: Vec<Token>,
 }
 
-impl Ord for TokenizedString {
+impl Ord for PreTokenizedString {
     fn cmp(&self, other: &Self) -> Ordering {
         self.text.cmp(&other.text)
     }
 }
 
-impl PartialOrd for TokenizedString {
+impl PartialOrd for PreTokenizedString {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         Some(self.cmp(other))
     }
 }
 
-/// TokenStream implementation which wraps TokenizedString
-pub struct TokenizedStream {
-    tokenized_string: TokenizedString,
+/// TokenStream implementation which wraps PreTokenizedString
+pub struct PreTokenizedStream {
+    tokenized_string: PreTokenizedString,
     current_token: i64,
 }
 
-impl From<TokenizedString> for TokenizedStream {
-    fn from(s: TokenizedString) -> TokenizedStream {
-        TokenizedStream {
+impl From<PreTokenizedString> for PreTokenizedStream {
+    fn from(s: PreTokenizedString) -> PreTokenizedStream {
+        PreTokenizedStream {
             tokenized_string: s,
             current_token: -1,
         }
     }
 }
 
-impl TokenizedStream {
-    /// Creates a TokenStream from TokenizedString array
+impl PreTokenizedStream {
+    /// Creates a TokenStream from PreTokenizedString array
     pub fn chain_tokenized_strings<'a>(
-        tok_strings: &'a [&'a TokenizedString],
+        tok_strings: &'a [&'a PreTokenizedString],
     ) -> Box<dyn TokenStream> {
         if tok_strings.len() == 1 {
-            Box::new(TokenizedStream::from((*tok_strings[0]).clone()))
+            Box::new(PreTokenizedStream::from((*tok_strings[0]).clone()))
         } else {
             let mut offsets = vec![];
             let mut total_offset = 0;
@@ -57,14 +57,14 @@ impl TokenizedStream {
             }
             let token_streams: Vec<_> = tok_strings
                 .iter()
-                .map(|tok_string| TokenizedStream::from((*tok_string).clone()))
+                .map(|tok_string| PreTokenizedStream::from((*tok_string).clone()))
                 .collect();
             Box::new(TokenStreamChain::new(offsets, token_streams))
         }
     }
 }
 
-impl TokenStream for TokenizedStream {
+impl TokenStream for PreTokenizedStream {
     fn advance(&mut self) -> bool {
         self.current_token += 1;
         self.current_token < self.tokenized_string.tokens.len() as i64
@@ -94,7 +94,7 @@ mod tests {
 
     #[test]
     fn test_tokenized_stream() {
-        let tok_text = TokenizedString {
+        let tok_text = PreTokenizedString {
             text: String::from("A a"),
             tokens: vec![
                 Token {
@@ -114,7 +114,7 @@ mod tests {
             ],
         };
 
-        let mut tok_stream = TokenizedStream::from(tok_text.clone());
+        let mut tok_stream = PreTokenizedStream::from(tok_text.clone());
 
         let mut i = 0;
         while tok_stream.advance() {
@@ -125,7 +125,7 @@ mod tests {
 
     #[test]
     fn test_chain_tokenized_strings() {
-        let tok_text = TokenizedString {
+        let tok_text = PreTokenizedString {
             text: String::from("A a"),
             tokens: vec![
                 Token {
@@ -147,7 +147,7 @@ mod tests {
 
         let chain_parts = vec![&tok_text, &tok_text];
 
-        let mut tok_stream = TokenizedStream::chain_tokenized_strings(&chain_parts[..]);
+        let mut tok_stream = PreTokenizedStream::chain_tokenized_strings(&chain_parts[..]);
 
         let expected_tokens = vec![
             Token {