Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2025-12-27 20:42:54 +00:00)

Compare commits: 5 commits (raphael_op...kkoziara-p)
| Author | SHA1 | Date |
|---|---|---|
| | 69e8495724 | |
| | 7ddc6041a6 | |
| | d87b7f230d | |
| | 20d2235d4d | |
| | faaecad476 | |
CHANGELOG.md

@@ -10,6 +10,8 @@ Tantivy 0.11.0
- Avoid rebuilding Regex automaton whenever a regex query is reused. #639 (@brainlock)
- Add footer with some metadata to index files. #605 (@fdb-hiroshima)
- TopDocs collector: ensure stable sorting on equal score. #671 (@brainlock)
- Added handling of pre-tokenized text fields (#642), which will enable users to
  load tokens created outside tantivy. See usage in examples/pre_tokenized_text. (@kkoziara)
- Fix crash when committing multiple times with deleted documents. #681 (@brainlock)

## How to update?
examples/pre_tokenized_text.rs (new file, 140 lines)
@@ -0,0 +1,140 @@
// # Pre-tokenized text example
//
// This example shows how to use pre-tokenized text. Sometimes you might
// want to index and search through text which is already split into
// tokens by some external tool.
//
// In this example we will:
// - use the tantivy tokenizer to create tokens and load them directly into tantivy,
// - import tokenized text straight from JSON,
// - perform a search on documents with pre-tokenized text.

use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, TokenStream, Tokenizer};

use tantivy::collector::{Count, TopDocs};
use tantivy::query::TermQuery;
use tantivy::schema::*;
use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir;

fn pre_tokenize_text(text: &str) -> Vec<Token> {
    let mut token_stream = SimpleTokenizer.token_stream(text);
    let mut tokens = vec![];
    while token_stream.advance() {
        tokens.push(token_stream.token().clone());
    }
    tokens
}

fn main() -> tantivy::Result<()> {
    let index_path = TempDir::new()?;

    let mut schema_builder = Schema::builder();

    schema_builder.add_text_field("title", TEXT | STORED);
    schema_builder.add_text_field("body", TEXT);

    let schema = schema_builder.build();

    let index = Index::create_in_dir(&index_path, schema.clone())?;

    let mut index_writer = index.writer(50_000_000)?;

    // We can create a document manually, by setting the fields
    // one by one in a Document object.
    let title = schema.get_field("title").unwrap();
    let body = schema.get_field("body").unwrap();

    let title_text = "The Old Man and the Sea";
    let body_text = "He was an old man who fished alone in a skiff in the Gulf Stream";

    // Content of our first document.
    // We create a `PreTokenizedString`, which contains the original text and a vector of tokens.
    let title_tok = PreTokenizedString {
        text: String::from(title_text),
        tokens: pre_tokenize_text(title_text),
    };

    println!(
        "Original text: \"{}\" and tokens: {:?}",
        title_tok.text, title_tok.tokens
    );

    let body_tok = PreTokenizedString {
        text: String::from(body_text),
        tokens: pre_tokenize_text(body_text),
    };

    // Now let's create a document and add our `PreTokenizedString` using
    // the `add_pre_tokenized_text` method of `Document`.
    let mut old_man_doc = Document::default();
    old_man_doc.add_pre_tokenized_text(title, &title_tok);
    old_man_doc.add_pre_tokenized_text(body, &body_tok);

    // ... now let's just add it to the IndexWriter.
    index_writer.add_document(old_man_doc);

    // Pre-tokenized text can also be fed in as JSON.
    let short_man_json = r#"{
        "title":[{
            "text":"The Old Man",
            "tokens":[
                {"offset_from":0,"offset_to":3,"position":0,"text":"The","position_length":1},
                {"offset_from":4,"offset_to":7,"position":1,"text":"Old","position_length":1},
                {"offset_from":8,"offset_to":11,"position":2,"text":"Man","position_length":1}
            ]
        }]
    }"#;

    let short_man_doc = schema.parse_document(&short_man_json)?;

    index_writer.add_document(short_man_doc);

    // Let's commit changes.
    index_writer.commit()?;

    // ... and now is the time to query our index.

    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommit)
        .try_into()?;

    let searcher = reader.searcher();

    // We want to get documents with the token "Man"; we will use a TermQuery to do it.
    // Using PreTokenizedString means the tokens are stored as-is, avoiding stemming
    // and lowercasing, which preserves full words in their original form.
    let query = TermQuery::new(
        Term::from_field_text(title, "Man"),
        IndexRecordOption::Basic,
    );

    let (top_docs, count) = searcher
        .search(&query, &(TopDocs::with_limit(2), Count))
        .unwrap();

    assert_eq!(count, 2);

    for (_score, doc_address) in top_docs {
        let retrieved_doc = searcher.doc(doc_address)?;
        println!("Document: {}", schema.to_json(&retrieved_doc));
    }

    // In contrast to the previous query, when we search for the "man" term we
    // should get no results, as it's not one of the indexed tokens. SimpleTokenizer
    // only splits text on whitespace / punctuation.

    let query = TermQuery::new(
        Term::from_field_text(title, "man"),
        IndexRecordOption::Basic,
    );

    let (_top_docs, count) = searcher
        .search(&query, &(TopDocs::with_limit(2), Count))
        .unwrap();

    assert_eq!(count, 0);

    Ok(())
}
src/indexer/segment_writer.rs

@@ -13,7 +13,8 @@ use crate::schema::Value;
use crate::schema::{Field, FieldEntry};
use crate::tokenizer::BoxedTokenizer;
use crate::tokenizer::FacetTokenizer;
use crate::tokenizer::{TokenStream, Tokenizer};
use crate::tokenizer::PreTokenizedStream;
use crate::tokenizer::{TokenStream, TokenStreamChain, Tokenizer};
use crate::DocId;
use crate::Opstamp;
use crate::Result;

@@ -158,26 +159,43 @@ impl SegmentWriter {
                }
            }
            FieldType::Str(_) => {
                let num_tokens = if let Some(ref mut tokenizer) =
                    self.tokenizers[field.field_id() as usize]
                {
                    let texts: Vec<&str> = field_values
                        .iter()
                        .flat_map(|field_value| match *field_value.value() {
                            Value::Str(ref text) => Some(text.as_str()),
                            _ => None,
                        })
                        .collect();
                    if texts.is_empty() {
                        0
                    } else {
                        let mut token_stream = tokenizer.token_stream_texts(&texts[..]);
                        self.multifield_postings
                            .index_text(doc_id, field, &mut token_stream)
                let mut token_streams: Vec<Box<dyn TokenStream>> = vec![];
                let mut offsets = vec![];
                let mut total_offset = 0;

                for field_value in field_values {
                    match field_value.value() {
                        Value::PreTokStr(tok_str) => {
                            offsets.push(total_offset);
                            if let Some(last_token) = tok_str.tokens.last() {
                                total_offset += last_token.offset_to;
                            }
                            token_streams
                                .push(Box::new(PreTokenizedStream::from(tok_str.clone())));
                        }
                        Value::Str(ref text) => {
                            if let Some(ref mut tokenizer) =
                                self.tokenizers[field.field_id() as usize]
                            {
                                offsets.push(total_offset);
                                total_offset += text.len();

                                token_streams.push(tokenizer.token_stream(text));
                            }
                        }
                        _ => (),
                    }
                } else {
                }

                let num_tokens = if token_streams.is_empty() {
                    0
                } else {
                    let mut token_stream: Box<dyn TokenStream> =
                        Box::new(TokenStreamChain::new(offsets, token_streams));
                    self.multifield_postings
                        .index_text(doc_id, field, &mut token_stream)
                };

                self.fieldnorms_writer.record(doc_id, field, num_tokens);
            }
            FieldType::U64(ref int_option) => {
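The new indexing path above builds one token stream per field value and chains them with `TokenStreamChain`, recording a starting offset for each value so that tokens from later values land after the previous value's last token. A minimal standalone sketch of that offset bookkeeping for pre-tokenized values (the helper function is illustrative and not part of this diff):

```rust
use tantivy::tokenizer::PreTokenizedString;

// Illustrative: compute the starting offset of each chained value, advancing
// by the `offset_to` of the previous value's last token, as the loop above
// does for `Value::PreTokStr` field values.
fn chain_offsets(values: &[PreTokenizedString]) -> Vec<usize> {
    let mut offsets = Vec::with_capacity(values.len());
    let mut total_offset = 0;
    for value in values {
        offsets.push(total_offset);
        if let Some(last_token) = value.tokens.last() {
            total_offset += last_token.offset_to;
        }
    }
    offsets
}
```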
src/schema/document.rs

@@ -1,6 +1,7 @@
use super::*;
use crate::common::BinarySerializable;
use crate::common::VInt;
use crate::tokenizer::PreTokenizedString;
use crate::DateTime;
use itertools::Itertools;
use std::io::{self, Read, Write};

@@ -78,6 +79,16 @@ impl Document {
        self.add(FieldValue::new(field, value));
    }

    /// Add a pre-tokenized text field.
    pub fn add_pre_tokenized_text(
        &mut self,
        field: Field,
        pre_tokenized_text: &PreTokenizedString,
    ) {
        let value = Value::PreTokStr(pre_tokenized_text.clone());
        self.add(FieldValue::new(field, value));
    }

    /// Add a u64 field
    pub fn add_u64(&mut self, field: Field, value: u64) {
        self.add(FieldValue::new(field, Value::U64(value)));
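For reference, a condensed sketch of calling the new method, mirroring the usage in examples/pre_tokenized_text.rs above (the field and token values are illustrative):

```rust
use tantivy::schema::{Document, Field};
use tantivy::tokenizer::{PreTokenizedString, Token};

// Illustrative: attach a hand-built PreTokenizedString to a document field.
fn add_pre_tokenized_title(doc: &mut Document, title: Field) {
    let title_tok = PreTokenizedString {
        text: String::from("The Old Man"),
        tokens: vec![Token {
            offset_from: 0,
            offset_to: 3,
            position: 0,
            text: String::from("The"),
            position_length: 1,
        }],
    };
    doc.add_pre_tokenized_text(title, &title_tok);
}
```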
src/schema/field_type.rs

@@ -1,11 +1,11 @@
use base64::decode;

use crate::schema::{IntOptions, TextOptions};

use crate::schema::Facet;
use crate::schema::IndexRecordOption;
use crate::schema::TextFieldIndexing;
use crate::schema::Value;
use crate::schema::{IntOptions, TextOptions};
use crate::tokenizer::PreTokenizedString;
use serde_json::Value as JsonValue;

/// Possible error that may occur while parsing a field value

@@ -169,6 +169,28 @@ impl FieldType {
                    Err(ValueParsingError::TypeError(msg))
                }
            },
            JsonValue::Object(_) => match *self {
                FieldType::Str(_) => {
                    if let Ok(tok_str_val) =
                        serde_json::from_value::<PreTokenizedString>(json.clone())
                    {
                        Ok(Value::PreTokStr(tok_str_val))
                    } else {
                        let msg = format!(
                            "Json value {:?} cannot be translated to PreTokenizedString.",
                            json
                        );
                        Err(ValueParsingError::TypeError(msg))
                    }
                }
                _ => {
                    let msg = format!(
                        "Json value not supported error {:?}. Expected {:?}",
                        json, self
                    );
                    Err(ValueParsingError::TypeError(msg))
                }
            },
            _ => {
                let msg = format!(
                    "Json value not supported error {:?}. Expected {:?}",

@@ -184,7 +206,9 @@ impl FieldType {
mod tests {
    use super::FieldType;
    use crate::schema::field_type::ValueParsingError;
    use crate::schema::TextOptions;
    use crate::schema::Value;
    use crate::tokenizer::{PreTokenizedString, Token};

    #[test]
    fn test_bytes_value_from_json() {

@@ -205,4 +229,71 @@ mod tests {
            _ => panic!("Expected parse failure for invalid base64"),
        }
    }

    #[test]
    fn test_pre_tok_str_value_from_json() {
        let pre_tokenized_string_json = r#"{
  "text": "The Old Man",
  "tokens": [
    {
      "offset_from": 0,
      "offset_to": 3,
      "position": 0,
      "text": "The",
      "position_length": 1
    },
    {
      "offset_from": 4,
      "offset_to": 7,
      "position": 1,
      "text": "Old",
      "position_length": 1
    },
    {
      "offset_from": 8,
      "offset_to": 11,
      "position": 2,
      "text": "Man",
      "position_length": 1
    }
  ]
}"#;

        let expected_value = Value::PreTokStr(PreTokenizedString {
            text: String::from("The Old Man"),
            tokens: vec![
                Token {
                    offset_from: 0,
                    offset_to: 3,
                    position: 0,
                    text: String::from("The"),
                    position_length: 1,
                },
                Token {
                    offset_from: 4,
                    offset_to: 7,
                    position: 1,
                    text: String::from("Old"),
                    position_length: 1,
                },
                Token {
                    offset_from: 8,
                    offset_to: 11,
                    position: 2,
                    text: String::from("Man"),
                    position_length: 1,
                },
            ],
        });

        let deserialized_value = FieldType::Str(TextOptions::default())
            .value_from_json(&serde_json::from_str(pre_tokenized_string_json).unwrap())
            .unwrap();

        assert_eq!(deserialized_value, expected_value);

        let serialized_value_json = serde_json::to_string_pretty(&expected_value).unwrap();

        assert_eq!(serialized_value_json, pre_tokenized_string_json);
    }
}
src/schema/value.rs

@@ -1,4 +1,5 @@
use crate::schema::Facet;
use crate::tokenizer::PreTokenizedString;
use crate::DateTime;
use serde::de::Visitor;
use serde::{Deserialize, Deserializer, Serialize, Serializer};

@@ -10,6 +11,8 @@ use std::{cmp::Ordering, fmt};
pub enum Value {
    /// The str type is used for any text information.
    Str(String),
    /// Pre-tokenized str type.
    PreTokStr(PreTokenizedString),
    /// Unsigned 64-bits Integer `u64`
    U64(u64),
    /// Signed 64-bits Integer `i64`

@@ -29,6 +32,7 @@ impl Ord for Value {
    fn cmp(&self, other: &Self) -> Ordering {
        match (self, other) {
            (Value::Str(l), Value::Str(r)) => l.cmp(r),
            (Value::PreTokStr(l), Value::PreTokStr(r)) => l.cmp(r),
            (Value::U64(l), Value::U64(r)) => l.cmp(r),
            (Value::I64(l), Value::I64(r)) => l.cmp(r),
            (Value::Date(l), Value::Date(r)) => l.cmp(r),

@@ -44,6 +48,8 @@ impl Ord for Value {
            }
            (Value::Str(_), _) => Ordering::Less,
            (_, Value::Str(_)) => Ordering::Greater,
            (Value::PreTokStr(_), _) => Ordering::Less,
            (_, Value::PreTokStr(_)) => Ordering::Greater,
            (Value::U64(_), _) => Ordering::Less,
            (_, Value::U64(_)) => Ordering::Greater,
            (Value::I64(_), _) => Ordering::Less,

@@ -65,6 +71,7 @@ impl Serialize for Value {
    {
        match *self {
            Value::Str(ref v) => serializer.serialize_str(v),
            Value::PreTokStr(ref v) => v.serialize(serializer),
            Value::U64(u) => serializer.serialize_u64(u),
            Value::I64(u) => serializer.serialize_i64(u),
            Value::F64(u) => serializer.serialize_f64(u),

@@ -124,6 +131,15 @@ impl Value {
        }
    }

    /// Returns the tokenized text, provided the value is of the `PreTokStr` type.
    /// (Returns None if the value is not of the `PreTokStr` type).
    pub fn tokenized_text(&self) -> Option<&PreTokenizedString> {
        match *self {
            Value::PreTokStr(ref tok_text) => Some(tok_text),
            _ => None,
        }
    }

    /// Returns the u64-value, provided the value is of the `U64` type.
    ///
    /// # Panics

@@ -221,6 +237,7 @@ mod binary_serialize {
    use super::Value;
    use crate::common::{f64_to_u64, u64_to_f64, BinarySerializable};
    use crate::schema::Facet;
    use crate::tokenizer::PreTokenizedString;
    use chrono::{TimeZone, Utc};
    use std::io::{self, Read, Write};

@@ -231,6 +248,11 @@ mod binary_serialize {
    const BYTES_CODE: u8 = 4;
    const DATE_CODE: u8 = 5;
    const F64_CODE: u8 = 6;
    const EXT_CODE: u8 = 7;

    // extended types

    const TOK_STR_CODE: u8 = 0;

    impl BinarySerializable for Value {
        fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {

@@ -239,6 +261,18 @@ mod binary_serialize {
                    TEXT_CODE.serialize(writer)?;
                    text.serialize(writer)
                }
                Value::PreTokStr(ref tok_str) => {
                    EXT_CODE.serialize(writer)?;
                    TOK_STR_CODE.serialize(writer)?;
                    if let Ok(text) = serde_json::to_string(tok_str) {
                        text.serialize(writer)
                    } else {
                        Err(io::Error::new(
                            io::ErrorKind::Other,
                            "Failed to dump Value::PreTokStr(_) to json.",
                        ))
                    }
                }
                Value::U64(ref val) => {
                    U64_CODE.serialize(writer)?;
                    val.serialize(writer)

@@ -290,6 +324,30 @@ mod binary_serialize {
            }
            HIERARCHICAL_FACET_CODE => Ok(Value::Facet(Facet::deserialize(reader)?)),
            BYTES_CODE => Ok(Value::Bytes(Vec::<u8>::deserialize(reader)?)),
            EXT_CODE => {
                let ext_type_code = u8::deserialize(reader)?;
                match ext_type_code {
                    TOK_STR_CODE => {
                        let str_val = String::deserialize(reader)?;
                        if let Ok(value) = serde_json::from_str::<PreTokenizedString>(&str_val)
                        {
                            Ok(Value::PreTokStr(value))
                        } else {
                            Err(io::Error::new(
                                io::ErrorKind::Other,
                                "Failed to parse string data as Value::PreTokStr(_).",
                            ))
                        }
                    }
                    _ => Err(io::Error::new(
                        io::ErrorKind::InvalidData,
                        format!(
                            "No extended field type is associated with code {:?}",
                            ext_type_code
                        ),
                    )),
                }
            }
            _ => Err(io::Error::new(
                io::ErrorKind::InvalidData,
                format!("No field type is associated with code {:?}", type_code),
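Both directions of the PreTokStr codec above lean on serde_json for the payload: the writer stores the struct as a JSON string after the two tag bytes, and the reader parses that string back. A minimal sketch of that payload round trip in isolation (illustrative; it does not reproduce tantivy's BinarySerializable framing):

```rust
use tantivy::tokenizer::PreTokenizedString;

// Illustrative: the binary codec stores the PreTokenizedString as a JSON string,
// so encoding and decoding the payload is plain serde_json.
fn json_round_trip(value: &PreTokenizedString) -> serde_json::Result<PreTokenizedString> {
    let json = serde_json::to_string(value)?;
    serde_json::from_str(&json)
}
```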
src/tokenizer/mod.rs

@@ -136,6 +136,7 @@ mod simple_tokenizer;
mod stemmer;
mod stop_word_filter;
mod token_stream_chain;
mod tokenized_string;
mod tokenizer;
mod tokenizer_manager;

@@ -152,7 +153,9 @@ pub use self::stop_word_filter::StopWordFilter;
pub(crate) use self::token_stream_chain::TokenStreamChain;
pub use self::tokenizer::BoxedTokenizer;

pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};

pub use self::tokenizer_manager::TokenizerManager;

/// Maximum authorized len (in bytes) for a token.
src/tokenizer/tokenized_string.rs (new file, 191 lines)
@@ -0,0 +1,191 @@
use crate::tokenizer::{Token, TokenStream, TokenStreamChain};
use std::cmp::Ordering;

/// Struct representing pre-tokenized text
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
pub struct PreTokenizedString {
    /// Original text
    pub text: String,
    /// Tokens derived from the text
    pub tokens: Vec<Token>,
}

impl Ord for PreTokenizedString {
    fn cmp(&self, other: &Self) -> Ordering {
        self.text.cmp(&other.text)
    }
}

impl PartialOrd for PreTokenizedString {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

/// TokenStream implementation which wraps PreTokenizedString
pub struct PreTokenizedStream {
    tokenized_string: PreTokenizedString,
    current_token: i64,
}

impl From<PreTokenizedString> for PreTokenizedStream {
    fn from(s: PreTokenizedString) -> PreTokenizedStream {
        PreTokenizedStream {
            tokenized_string: s,
            current_token: -1,
        }
    }
}

impl PreTokenizedStream {
    /// Creates a TokenStream from PreTokenizedString array
    pub fn chain_tokenized_strings<'a>(
        tok_strings: &'a [&'a PreTokenizedString],
    ) -> Box<dyn TokenStream + 'a> {
        if tok_strings.len() == 1 {
            return Box::new(PreTokenizedStream::from((*tok_strings[0]).clone()));
        }
        let mut offsets = vec![];
        let mut total_offset = 0;
        for &tok_string in tok_strings {
            offsets.push(total_offset);
            if let Some(last_token) = tok_string.tokens.last() {
                total_offset += last_token.offset_to;
            }
        }
        let token_streams: Vec<_> = tok_strings
            .iter()
            .map(|tok_string| PreTokenizedStream::from((*tok_string).clone()))
            .collect();
        Box::new(TokenStreamChain::new(offsets, token_streams))
    }
}

impl TokenStream for PreTokenizedStream {
    fn advance(&mut self) -> bool {
        if self.current_token >= self.tokenized_string.tokens.len() as i64 - 1 {
            // This was our last token.
            return false;
        }
        self.current_token += 1;
        true
    }

    fn token(&self) -> &Token {
        assert!(
            self.current_token >= 0,
            "TokenStream not initialized. You should call advance() at least once."
        );
        &self.tokenized_string.tokens[self.current_token as usize]
    }

    fn token_mut(&mut self) -> &mut Token {
        assert!(
            self.current_token >= 0,
            "TokenStream not initialized. You should call advance() at least once."
        );
        &mut self.tokenized_string.tokens[self.current_token as usize]
    }
}

#[cfg(test)]
mod tests {

    use super::*;

    use crate::tokenizer::Token;

    #[test]
    fn test_tokenized_stream() {
        let tok_text = PreTokenizedString {
            text: String::from("A a"),
            tokens: vec![
                Token {
                    offset_from: 0,
                    offset_to: 1,
                    position: 0,
                    text: String::from("A"),
                    position_length: 1,
                },
                Token {
                    offset_from: 2,
                    offset_to: 3,
                    position: 1,
                    text: String::from("a"),
                    position_length: 1,
                },
            ],
        };

        let mut tok_stream = PreTokenizedStream::from(tok_text.clone());

        let mut i = 0;
        while tok_stream.advance() {
            assert!(*tok_stream.token() == tok_text.tokens[i]);
            i += 1;
        }
    }

    #[test]
    fn test_chain_tokenized_strings() {
        let tok_text = PreTokenizedString {
            text: String::from("A a"),
            tokens: vec![
                Token {
                    offset_from: 0,
                    offset_to: 1,
                    position: 0,
                    text: String::from("A"),
                    position_length: 1,
                },
                Token {
                    offset_from: 2,
                    offset_to: 3,
                    position: 1,
                    text: String::from("a"),
                    position_length: 1,
                },
            ],
        };

        let chain_parts = vec![&tok_text, &tok_text];

        let mut token_stream = PreTokenizedStream::chain_tokenized_strings(&chain_parts[..]);

        let expected_tokens = vec![
            Token {
                offset_from: 0,
                offset_to: 1,
                position: 0,
                text: String::from("A"),
                position_length: 1,
            },
            Token {
                offset_from: 2,
                offset_to: 3,
                position: 1,
                text: String::from("a"),
                position_length: 1,
            },
            Token {
                offset_from: 3,
                offset_to: 4,
                position: 3,
                text: String::from("A"),
                position_length: 1,
            },
            Token {
                offset_from: 5,
                offset_to: 6,
                position: 4,
                text: String::from("a"),
                position_length: 1,
            },
        ];
        for expected_token in expected_tokens {
            assert!(token_stream.advance());
            assert_eq!(token_stream.token(), &expected_token);
        }
        assert!(!token_stream.advance());
    }
}
src/tokenizer/tokenizer.rs

@@ -4,7 +4,7 @@ use crate::tokenizer::TokenStreamChain;
use std::borrow::{Borrow, BorrowMut};

/// Token
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
pub struct Token {
    /// Offset (byte index) of the first character of the token.
    /// Offsets shall not be modified by token filters.