From afe0134d0f8f18817c910f7acc440973b202b5ba Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 25 Nov 2019 22:39:12 +0900 Subject: [PATCH] Kkoziara remove tokens from doc store (#715) * Prevent tokens from being stored in the document store. This commit adds a prepare_for_store method to Document, which changes all PreTokenizedString values into String values. The method is called before adding a document to the document store to prevent tokens from being saved there. The commit also makes small changes to the comments in the pre_tokenized_text example. * Avoid storing the pretokenized text. --- examples/pre_tokenized_text.rs | 6 ++-- src/indexer/segment_writer.rs | 1 + src/schema/document.rs | 50 ++++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/examples/pre_tokenized_text.rs b/examples/pre_tokenized_text.rs index 0c6d6424a..5cf309ef1 100644 --- a/examples/pre_tokenized_text.rs +++ b/examples/pre_tokenized_text.rs @@ -65,8 +65,7 @@ fn main() -> tantivy::Result<()> { tokens: pre_tokenize_text(body_text), }; - // Now lets create a document and add our `PreTokenizedString` using - // `add_pre_tokenized_text` method of `Document` + // Now let's create a document and add our `PreTokenizedString` let old_man_doc = doc!(title => title_tok, body => body_tok); // ... now let's just add it to the IndexWriter @@ -114,6 +113,9 @@ fn main() -> tantivy::Result<()> { assert_eq!(count, 2); + // Now let's print out the results. 
+ // Note that the tokens are not stored along with the original text + // in the document store for (_score, doc_address) in top_docs { let retrieved_doc = searcher.doc(doc_address)?; println!("Document: {}", schema.to_json(&retrieved_doc)); diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 5cfdccdcd..8ed1025ba 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -249,6 +249,7 @@ impl SegmentWriter { } } doc.filter_fields(|field| schema.get_field_entry(field).is_stored()); + doc.prepare_for_store(); let doc_writer = self.segment_serializer.get_store_writer(); doc_writer.store(&doc)?; self.max_doc += 1; diff --git a/src/schema/document.rs b/src/schema/document.rs index 6cab58bfd..4ec5e3549 100644 --- a/src/schema/document.rs +++ b/src/schema/document.rs @@ -155,6 +155,21 @@ impl Document { .find(|field_value| field_value.field() == field) .map(FieldValue::value) } + + /// Prepares the `Document` for being stored in the document store. + /// + /// Every `Value::PreTokStr` field value is replaced by a `Value::Str` + /// holding only its text. 
+ pub fn prepare_for_store(&mut self) { + for field_value in &mut self.field_values { + if let Value::PreTokStr(pre_tokenized_text) = field_value.value() { + *field_value = FieldValue::new( + field_value.field(), + Value::Str(pre_tokenized_text.text.clone()), // TODO: find a way to move the text out instead of cloning it + ); + } + } + } } impl BinarySerializable for Document { @@ -180,6 +195,7 @@ impl BinarySerializable for Document { mod tests { use crate::schema::*; + use crate::tokenizer::{PreTokenizedString, Token}; #[test] fn test_doc() { @@ -189,4 +205,38 @@ mod tests { doc.add_text(text_field, "My title"); assert_eq!(doc.field_values().len(), 1); } + + #[test] + fn test_prepare_for_store() { + let mut schema_builder = Schema::builder(); + let text_field = schema_builder.add_text_field("title", TEXT); + let mut doc = Document::default(); + + let pre_tokenized_text = PreTokenizedString { + text: String::from("A"), + tokens: vec![Token { + offset_from: 0, + offset_to: 1, + position: 0, + text: String::from("A"), + position_length: 1, + }], + }; + + doc.add_pre_tokenized_text(text_field, &pre_tokenized_text); + doc.add_text(text_field, "title"); + doc.prepare_for_store(); + + assert_eq!(doc.field_values().len(), 2); + + match doc.field_values()[0].value() { + Value::Str(ref text) => assert_eq!(text, "A"), + _ => panic!("Incorrect variant of Value"), + } + + match doc.field_values()[1].value() { + Value::Str(ref text) => assert_eq!(text, "title"), + _ => panic!("Incorrect variant of Value"), + } + } }