Total field norm fixed.

This commit is contained in:
Paul Masurel
2018-03-21 19:21:21 +09:00
parent 1c9450174e
commit 3ecfc36e53
8 changed files with 70 additions and 64 deletions

View File

@@ -1,7 +1,6 @@
use schema::{Cardinality, Document, Field, Schema};
use fastfield::FastFieldSerializer;
use std::io;
use DocId;
use schema::FieldType;
use common;
use common::VInt;
@@ -57,15 +56,6 @@ impl FastFieldsWriter {
}
}
/// Returns a `FastFieldsWriter with a `u64` `IntFastFieldWriter` for each
/// of the field given in argument.
pub(crate) fn new(fields: Vec<Field>) -> FastFieldsWriter {
FastFieldsWriter {
single_value_writers: fields.into_iter().map(IntFastFieldWriter::new).collect(),
multi_values_writers: vec![],
}
}
/// Get the `FastFieldWriter` associated to a field.
pub fn get_field_writer(&mut self, field: Field) -> Option<&mut IntFastFieldWriter> {
// TODO optimize

View File

@@ -28,7 +28,7 @@ impl FieldNormsSerializer {
Ok(())
}
pub fn close(mut self) -> io::Result<()> {
pub fn close(self) -> io::Result<()> {
self.composite_write.close()?;
Ok(())
}

View File

@@ -12,8 +12,9 @@ pub struct FieldNormsWriter {
}
impl FieldNormsWriter {
pub fn for_schema(schema: &Schema) -> FieldNormsWriter {
let fields = schema
pub fn fields_with_fieldnorm(schema: &Schema) -> Vec<Field> {
schema
.fields()
.iter()
.enumerate()
@@ -21,7 +22,11 @@ impl FieldNormsWriter {
field_entry.is_indexed()
})
.map(|(field, _)| Field(field as u32))
.collect::<Vec<_>>();
.collect::<Vec<Field>>()
}
pub fn for_schema(schema: &Schema) -> FieldNormsWriter {
let fields = FieldNormsWriter::fields_with_fieldnorm(schema);
let max_field = fields
.iter()
.map(|field| field.0)

View File

@@ -18,6 +18,36 @@ use std::cmp::{max, min};
use termdict::TermDictionary;
use termdict::TermStreamer;
use fieldnorm::FieldNormsSerializer;
use fieldnorm::FieldNormsWriter;
use fieldnorm::FieldNormReader;
fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
let mut total_tokens = 0u64;
let mut count: [usize; 256] = [0; 256];
for reader in readers {
if reader.delete_bitset().has_deletes() {
// if there are deletes, then we use an approximation
// using the fieldnorm
if let Some(fieldnorms_reader) = reader.get_fieldnorms_reader(field) {
for doc in 0..reader.max_doc() {
if !reader.is_deleted(doc) {
let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc);
count[fieldnorm_id as usize] += 1;
}
}
}
} else {
total_tokens += reader.inverted_index(field).total_num_tokens();
}
}
total_tokens + count
.iter()
.cloned()
.enumerate()
.map(|(fieldnorm_ord, count)| count as u64 * FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8) as u64)
.sum::<u64>()
}
pub struct IndexMerger {
schema: Schema,
@@ -47,14 +77,6 @@ fn compute_min_max_val(
}
}
fn extract_fast_field_reader(
segment_reader: &SegmentReader,
field: Field,
) -> Option<FastFieldReader<u64>> {
segment_reader.fast_field_reader(field).ok()
}
struct DeltaComputer {
buffer: Vec<u32>,
}
@@ -97,20 +119,26 @@ impl IndexMerger {
})
}
fn write_fieldnorms(&self, fast_field_serializer: &mut FieldNormsSerializer) -> Result<()> {
unimplemented!("Not implemented yet");
// let fieldnorm_fastfields: Vec<Field> = self.schema
// .fields()
// .iter()
// .enumerate()
// .filter(|&(_, field_entry)| field_entry.is_indexed())
// .map(|(field_id, _)| Field(field_id as u32))
// .collect();
// self.generic_write_fast_field(
// fieldnorm_fastfields,
// &extract_fieldnorm_reader,
// fast_field_serializer,
// )
fn write_fieldnorms(&self, fieldnorms_serializer: &mut FieldNormsSerializer) -> Result<()> {
let fields = FieldNormsWriter::fields_with_fieldnorm(&self.schema);
let mut fieldnorms_data = Vec::with_capacity(self.max_doc as usize);
for field in fields {
fieldnorms_data.clear();
for reader in &self.readers {
let fieldnorms_reader_opt = reader.get_fieldnorms_reader(field);
for doc_id in 0..reader.max_doc() {
if !reader.is_deleted(doc_id) {
let fieldnorm_id = fieldnorms_reader_opt
.as_ref()
.map(|reader| reader.fieldnorm_id(doc_id))
.unwrap_or(0u8);
fieldnorms_data.push(fieldnorm_id);
}
}
}
fieldnorms_serializer.serialize_field(field, &fieldnorms_data[..])?;
}
Ok(())
}
fn write_fast_fields(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
@@ -121,28 +149,15 @@ impl IndexMerger {
.filter(|&(_, field_entry)| field_entry.is_int_fast())
.map(|(field_id, _)| Field(field_id as u32))
.collect();
self.generic_write_fast_field(
fast_fields,
&extract_fast_field_reader,
fast_field_serializer,
)
}
// used both to merge field norms and regular u64 fast fields.
fn generic_write_fast_field(
&self,
fields: Vec<Field>,
field_reader_extractor: &Fn(&SegmentReader, Field) -> Option<FastFieldReader<u64>>,
fast_field_serializer: &mut FastFieldSerializer,
) -> Result<()> {
for field in fields {
for field in fast_fields {
let mut u64_readers = vec![];
let mut min_val = u64::max_value();
let mut max_val = u64::min_value();
for reader in &self.readers {
match field_reader_extractor(reader, field) {
Some(u64_reader) => {
match reader.fast_field_reader::<u64>(field) {
Ok(u64_reader) => {
if let Some((seg_min_val, seg_max_val)) = compute_min_max_val(
&u64_reader,
reader.max_doc(),
@@ -158,9 +173,10 @@ impl IndexMerger {
));
}
}
None => {
Err(_) => {
let fieldname = self.schema.get_field_name(field);
let error_msg =
format!("Failed to find a u64_reader for field {:?}", field);
format!("Failed to find a fast field reader for field {:?}", fieldname);
error!("{}", error_msg);
bail!(ErrorKind::SchemaError(error_msg));
}
@@ -232,12 +248,10 @@ impl IndexMerger {
merged_doc_id_map.push(segment_local_map);
}
let total_num_tokens = self.readers
.iter()
.map(|reader| reader.inverted_index(indexed_field))
.map(|inverted_index| inverted_index.total_num_tokens())
.sum();
// The total number of tokens will only be exact when there has been no deletes.
//
// Otherwise, we approximate by removing deleted documents proportionally.
let total_num_tokens: u64 = compute_total_num_tokens(&self.readers, indexed_field);
// Create the total list of doc ids
// by stacking the doc ids from the different segment.

View File

@@ -7,10 +7,8 @@ use schema::Term;
use core::Segment;
use core::SerializableSegment;
use fastfield::FastFieldsWriter;
use schema::Field;
use schema::FieldType;
use indexer::segment_serializer::SegmentSerializer;
use std::collections::HashMap;
use datastruct::stacker::Heap;
use indexer::index_writer::MARGIN_IN_BYTES;
use super::operation::AddOperation;

View File

@@ -261,6 +261,7 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'
fn add_num_tokens(&mut self, num_tokens: u32) {
self.total_num_tokens += num_tokens as u64;
}
fn total_num_tokens(&self) -> u64 {
self.total_num_tokens
}

View File

@@ -18,7 +18,6 @@ mod tests {
use schema::*;
use fieldnorm::FieldNormReader;
use schema::IndexRecordOption;
use fastfield::FastFieldReader;
fn abs_diff(left: f32, right: f32) -> f32 {
(right - left).abs()

View File

@@ -4,7 +4,6 @@ use docset::{DocSet, SkipResult};
use postings::SegmentPostings;
use query::Scorer;
use postings::Postings;
use fastfield::FastFieldReader;
use fieldnorm::FieldNormReader;
pub struct TermScorer {