Kkoziara remove tokens from doc store (#715)

* Prevent tokens from being stored in the document store.

Commit adds prepare_for_store method to Document, which changes all
PreTokenizedString values into String values. The method is called
before adding document to the document store to prevent tokens from
being saved there. Commit also adds small changes to comments in
pre_tokenized_text example.

* Avoid storing the pretokenized text.
This commit is contained in:
Paul Masurel
2019-11-25 22:39:12 +09:00
committed by GitHub
parent db9e81d0f9
commit afe0134d0f
3 changed files with 55 additions and 2 deletions

View File

@@ -65,8 +65,7 @@ fn main() -> tantivy::Result<()> {
tokens: pre_tokenize_text(body_text),
};
// Now lets create a document and add our `PreTokenizedString` using
// `add_pre_tokenized_text` method of `Document`
// Now lets create a document and add our `PreTokenizedString`
let old_man_doc = doc!(title => title_tok, body => body_tok);
// ... now let's just add it to the IndexWriter
@@ -114,6 +113,9 @@ fn main() -> tantivy::Result<()> {
assert_eq!(count, 2);
// Now let's print out the results.
// Note that the tokens are not stored along with the original text
// in the document store
for (_score, doc_address) in top_docs {
let retrieved_doc = searcher.doc(doc_address)?;
println!("Document: {}", schema.to_json(&retrieved_doc));

View File

@@ -249,6 +249,7 @@ impl SegmentWriter {
}
}
doc.filter_fields(|field| schema.get_field_entry(field).is_stored());
doc.prepare_for_store();
let doc_writer = self.segment_serializer.get_store_writer();
doc_writer.store(&doc)?;
self.max_doc += 1;

View File

@@ -155,6 +155,21 @@ impl Document {
.find(|field_value| field_value.field() == field)
.map(FieldValue::value)
}
/// Prepares Document for being stored in the document store
///
/// Method transforms PreTokenizedString values into String
/// values.
pub fn prepare_for_store(&mut self) {
for field_value in &mut self.field_values {
if let Value::PreTokStr(pre_tokenized_text) = field_value.value() {
*field_value = FieldValue::new(
field_value.field(),
Value::Str(pre_tokenized_text.text.clone()), //< TODO somehow remove .clone()
);
}
}
}
}
impl BinarySerializable for Document {
@@ -180,6 +195,7 @@ impl BinarySerializable for Document {
mod tests {
use crate::schema::*;
use crate::tokenizer::{PreTokenizedString, Token};
#[test]
fn test_doc() {
@@ -189,4 +205,38 @@ mod tests {
doc.add_text(text_field, "My title");
assert_eq!(doc.field_values().len(), 1);
}
#[test]
fn test_prepare_for_store() {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("title", TEXT);
let mut doc = Document::default();
let pre_tokenized_text = PreTokenizedString {
text: String::from("A"),
tokens: vec![Token {
offset_from: 0,
offset_to: 1,
position: 0,
text: String::from("A"),
position_length: 1,
}],
};
doc.add_pre_tokenized_text(text_field, &pre_tokenized_text);
doc.add_text(text_field, "title");
doc.prepare_for_store();
assert_eq!(doc.field_values().len(), 2);
match doc.field_values()[0].value() {
Value::Str(ref text) => assert_eq!(text, "A"),
_ => panic!("Incorrect variant of Value"),
}
match doc.field_values()[1].value() {
Value::Str(ref text) => assert_eq!(text, "title"),
_ => panic!("Incorrect variant of Value"),
}
}
}