From 3ecfc36e5384ef0ba3b8342c3731b519a4bdec9d Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 21 Mar 2018 19:21:21 +0900 Subject: [PATCH] Total field norm fixed. --- src/fastfield/writer.rs | 10 --- src/fieldnorm/serializer.rs | 2 +- src/fieldnorm/writer.rs | 11 ++- src/indexer/merger.rs | 106 ++++++++++++++++------------ src/indexer/segment_writer.rs | 2 - src/postings/postings_writer.rs | 1 + src/query/term_query/mod.rs | 1 - src/query/term_query/term_scorer.rs | 1 - 8 files changed, 70 insertions(+), 64 deletions(-) diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 57e887be0..371041d59 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -1,7 +1,6 @@ use schema::{Cardinality, Document, Field, Schema}; use fastfield::FastFieldSerializer; use std::io; -use DocId; use schema::FieldType; use common; use common::VInt; @@ -57,15 +56,6 @@ impl FastFieldsWriter { } } - /// Returns a `FastFieldsWriter with a `u64` `IntFastFieldWriter` for each - /// of the field given in argument. - pub(crate) fn new(fields: Vec) -> FastFieldsWriter { - FastFieldsWriter { - single_value_writers: fields.into_iter().map(IntFastFieldWriter::new).collect(), - multi_values_writers: vec![], - } - } - /// Get the `FastFieldWriter` associated to a field. pub fn get_field_writer(&mut self, field: Field) -> Option<&mut IntFastFieldWriter> { // TODO optimize diff --git a/src/fieldnorm/serializer.rs b/src/fieldnorm/serializer.rs index 236ec39a9..e0f413ae2 100644 --- a/src/fieldnorm/serializer.rs +++ b/src/fieldnorm/serializer.rs @@ -28,7 +28,7 @@ impl FieldNormsSerializer { Ok(()) } - pub fn close(mut self) -> io::Result<()> { + pub fn close(self) -> io::Result<()> { self.composite_write.close()?; Ok(()) } diff --git a/src/fieldnorm/writer.rs b/src/fieldnorm/writer.rs index 3816e3ed8..77f7d5e62 100644 --- a/src/fieldnorm/writer.rs +++ b/src/fieldnorm/writer.rs @@ -12,8 +12,9 @@ pub struct FieldNormsWriter { } impl FieldNormsWriter { - pub fn for_schema(schema: &Schema) -> FieldNormsWriter { - let fields = schema + + pub fn fields_with_fieldnorm(schema: &Schema) -> Vec { + schema .fields() .iter() .enumerate() @@ -21,7 +22,11 @@ impl FieldNormsWriter { field_entry.is_indexed() }) .map(|(field, _)| Field(field as u32)) - .collect::>(); + .collect::>() + } + + pub fn for_schema(schema: &Schema) -> FieldNormsWriter { + let fields = FieldNormsWriter::fields_with_fieldnorm(schema); let max_field = fields .iter() .map(|field| field.0) diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 0efb262df..34e3c019a 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -18,6 +18,36 @@ use std::cmp::{max, min}; use termdict::TermDictionary; use termdict::TermStreamer; use fieldnorm::FieldNormsSerializer; +use fieldnorm::FieldNormsWriter; +use fieldnorm::FieldNormReader; + + +fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 { + let mut total_tokens = 0u64; + let mut count: [usize; 256] = [0; 256]; + for reader in readers { + if reader.delete_bitset().has_deletes() { + // if there are deletes, then we use an approximation + // using the fieldnorm + if let Some(fieldnorms_reader) = reader.get_fieldnorms_reader(field) { + for doc in 0..reader.max_doc() { + if !reader.is_deleted(doc) { + let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc); + count[fieldnorm_id as usize] += 1; + } + } + } + } else { + total_tokens += reader.inverted_index(field).total_num_tokens(); + } + } + total_tokens + count + .iter() + .cloned() + .enumerate() + .map(|(fieldnorm_ord, count)| count as u64 * FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8) as u64) + .sum::() +} pub struct IndexMerger { schema: Schema, @@ -47,14 +77,6 @@ fn compute_min_max_val( } } - -fn extract_fast_field_reader( - segment_reader: &SegmentReader, - field: Field, -) -> Option> { - segment_reader.fast_field_reader(field).ok() -} - struct DeltaComputer { buffer: Vec, } @@ -97,20 +119,26 @@ impl IndexMerger { }) } - fn write_fieldnorms(&self, fast_field_serializer: &mut FieldNormsSerializer) -> Result<()> { - unimplemented!("Not implemented yet"); -// let fieldnorm_fastfields: Vec = self.schema -// .fields() -// .iter() -// .enumerate() -// .filter(|&(_, field_entry)| field_entry.is_indexed()) -// .map(|(field_id, _)| Field(field_id as u32)) -// .collect(); -// self.generic_write_fast_field( -// fieldnorm_fastfields, -// &extract_fieldnorm_reader, -// fast_field_serializer, -// ) + fn write_fieldnorms(&self, fieldnorms_serializer: &mut FieldNormsSerializer) -> Result<()> { + let fields = FieldNormsWriter::fields_with_fieldnorm(&self.schema); + let mut fieldnorms_data = Vec::with_capacity(self.max_doc as usize); + for field in fields { + fieldnorms_data.clear(); + for reader in &self.readers { + let fieldnorms_reader_opt = reader.get_fieldnorms_reader(field); + for doc_id in 0..reader.max_doc() { + if !reader.is_deleted(doc_id) { + let fieldnorm_id = fieldnorms_reader_opt + .as_ref() + .map(|reader| reader.fieldnorm_id(doc_id)) + .unwrap_or(0u8); + fieldnorms_data.push(fieldnorm_id); + } + } + } + fieldnorms_serializer.serialize_field(field, &fieldnorms_data[..])?; + } + Ok(()) } fn write_fast_fields(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> { @@ -121,28 +149,15 @@ impl IndexMerger { .filter(|&(_, field_entry)| field_entry.is_int_fast()) .map(|(field_id, _)| Field(field_id as u32)) .collect(); - self.generic_write_fast_field( - fast_fields, - &extract_fast_field_reader, - fast_field_serializer, - ) - } - // used both to merge field norms and regular u64 fast fields. - fn generic_write_fast_field( - &self, - fields: Vec, - field_reader_extractor: &Fn(&SegmentReader, Field) -> Option>, - fast_field_serializer: &mut FastFieldSerializer, - ) -> Result<()> { - for field in fields { + for field in fast_fields { let mut u64_readers = vec![]; let mut min_val = u64::max_value(); let mut max_val = u64::min_value(); for reader in &self.readers { - match field_reader_extractor(reader, field) { - Some(u64_reader) => { + match reader.fast_field_reader::(field) { + Ok(u64_reader) => { if let Some((seg_min_val, seg_max_val)) = compute_min_max_val( &u64_reader, reader.max_doc(), @@ -158,9 +173,10 @@ impl IndexMerger { )); } } - None => { + Err(_) => { + let fieldname = self.schema.get_field_name(field); let error_msg = - format!("Failed to find a u64_reader for field {:?}", field); + format!("Failed to find a fast field reader for field {:?}", fieldname); error!("{}", error_msg); bail!(ErrorKind::SchemaError(error_msg)); } @@ -232,12 +248,10 @@ impl IndexMerger { merged_doc_id_map.push(segment_local_map); } - - let total_num_tokens = self.readers - .iter() - .map(|reader| reader.inverted_index(indexed_field)) - .map(|inverted_index| inverted_index.total_num_tokens()) - .sum(); + // The total number of tokens will only be exact when there has been no deletes. + // + // Otherwise, we approximate by removing deleted documents proportionally. + let total_num_tokens: u64 = compute_total_num_tokens(&self.readers, indexed_field); // Create the total list of doc ids // by stacking the doc ids from the different segment. diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 4e6cfed73..4436bf09d 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -7,10 +7,8 @@ use schema::Term; use core::Segment; use core::SerializableSegment; use fastfield::FastFieldsWriter; -use schema::Field; use schema::FieldType; use indexer::segment_serializer::SegmentSerializer; -use std::collections::HashMap; use datastruct::stacker::Heap; use indexer::index_writer::MARGIN_IN_BYTES; use super::operation::AddOperation; diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index f7571dd08..698c2c795 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -261,6 +261,7 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<' fn add_num_tokens(&mut self, num_tokens: u32) { self.total_num_tokens += num_tokens as u64; } + fn total_num_tokens(&self) -> u64 { self.total_num_tokens } diff --git a/src/query/term_query/mod.rs b/src/query/term_query/mod.rs index 34ff31c19..3f7fb6a2a 100644 --- a/src/query/term_query/mod.rs +++ b/src/query/term_query/mod.rs @@ -18,7 +18,6 @@ mod tests { use schema::*; use fieldnorm::FieldNormReader; use schema::IndexRecordOption; - use fastfield::FastFieldReader; fn abs_diff(left: f32, right: f32) -> f32 { (right - left).abs() diff --git a/src/query/term_query/term_scorer.rs b/src/query/term_query/term_scorer.rs index a4ffaa411..2cea41623 100644 --- a/src/query/term_query/term_scorer.rs +++ b/src/query/term_query/term_scorer.rs @@ -4,7 +4,6 @@ use docset::{DocSet, SkipResult}; use postings::SegmentPostings; use query::Scorer; use postings::Postings; -use fastfield::FastFieldReader; use fieldnorm::FieldNormReader; pub struct TermScorer {