diff --git a/examples/pre_tokenized_text.rs b/examples/pre_tokenized_text.rs index 0c6d6424a..5cf309ef1 100644 --- a/examples/pre_tokenized_text.rs +++ b/examples/pre_tokenized_text.rs @@ -65,8 +65,7 @@ fn main() -> tantivy::Result<()> { tokens: pre_tokenize_text(body_text), }; - // Now lets create a document and add our `PreTokenizedString` using - // `add_pre_tokenized_text` method of `Document` + // Now let's create a document and add our `PreTokenizedString` let old_man_doc = doc!(title => title_tok, body => body_tok); // ... now let's just add it to the IndexWriter @@ -114,6 +113,9 @@ fn main() -> tantivy::Result<()> { assert_eq!(count, 2); + // Now let's print out the results. + // Note that the tokens are not stored along with the original text + // in the document store for (_score, doc_address) in top_docs { let retrieved_doc = searcher.doc(doc_address)?; println!("Document: {}", schema.to_json(&retrieved_doc)); diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 5cfdccdcd..8ed1025ba 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -249,6 +249,7 @@ impl SegmentWriter { } } doc.filter_fields(|field| schema.get_field_entry(field).is_stored()); + doc.prepare_for_store(); let doc_writer = self.segment_serializer.get_store_writer(); doc_writer.store(&doc)?; self.max_doc += 1; diff --git a/src/schema/document.rs b/src/schema/document.rs index 6cab58bfd..4ec5e3549 100644 --- a/src/schema/document.rs +++ b/src/schema/document.rs @@ -155,6 +155,21 @@ impl Document { .find(|field_value| field_value.field() == field) .map(FieldValue::value) } + + /// Prepares Document for being stored in the document store + /// + /// This method transforms PreTokenizedString values into String + /// values. 
+ pub fn prepare_for_store(&mut self) { + for field_value in &mut self.field_values { + if let Value::PreTokStr(pre_tokenized_text) = field_value.value() { + *field_value = FieldValue::new( + field_value.field(), + Value::Str(pre_tokenized_text.text.clone()), //< TODO somehow remove .clone() + ); + } + } + } } impl BinarySerializable for Document { @@ -180,6 +195,7 @@ impl BinarySerializable for Document { mod tests { use crate::schema::*; + use crate::tokenizer::{PreTokenizedString, Token}; #[test] fn test_doc() { @@ -189,4 +205,38 @@ mod tests { doc.add_text(text_field, "My title"); assert_eq!(doc.field_values().len(), 1); } + + #[test] + fn test_prepare_for_store() { + let mut schema_builder = Schema::builder(); + let text_field = schema_builder.add_text_field("title", TEXT); + let mut doc = Document::default(); + + let pre_tokenized_text = PreTokenizedString { + text: String::from("A"), + tokens: vec![Token { + offset_from: 0, + offset_to: 1, + position: 0, + text: String::from("A"), + position_length: 1, + }], + }; + + doc.add_pre_tokenized_text(text_field, &pre_tokenized_text); + doc.add_text(text_field, "title"); + doc.prepare_for_store(); + + assert_eq!(doc.field_values().len(), 2); + + match doc.field_values()[0].value() { + Value::Str(ref text) => assert_eq!(text, "A"), + _ => panic!("Incorrect variant of Value"), + } + + match doc.field_values()[1].value() { + Value::Str(ref text) => assert_eq!(text, "title"), + _ => panic!("Incorrect variant of Value"), + } + } }