use dynamic fast field codec for offset index

This commit is contained in:
Pascal Seitz
2021-06-15 13:34:42 +02:00
parent b5cc60f80b
commit cd169dee23
7 changed files with 58 additions and 22 deletions

View File

@@ -50,7 +50,7 @@ pub trait FastFieldCodecSerializer {
}
/// FastFieldDataAccess is the trait to access fast field data during serialization and estimation.
pub trait FastFieldDataAccess: Clone {
pub trait FastFieldDataAccess {
/// Return the value associated to the given document.
///
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance reasons.
@@ -74,6 +74,12 @@ impl<'a> FastFieldDataAccess for &'a [u64] {
}
}
/// Lets a borrowed `Vec<u64>` be used wherever a `FastFieldDataAccess` is expected.
impl<'a> FastFieldDataAccess for &'a Vec<u64> {
    /// Returns the value stored at position `doc` of the underlying vector.
    ///
    /// Panics if `doc` is not a valid index into the vector.
    fn get(&self, doc: u32) -> u64 {
        let position = doc as usize;
        self.as_slice()[position]
    }
}
impl FastFieldDataAccess for Vec<u64> {
fn get(&self, doc: u32) -> u64 {
self[doc as usize]

View File

@@ -46,7 +46,7 @@ impl Drop for VecWriter {
fn drop(&mut self) {
if !self.is_flushed {
panic!(
"You forgot to flush {:?} before its writter got Drop. Do not rely on drop.",
"You forgot to flush {:?} before its writter got Drop. Do not rely on drop. This also occurs when the indexer crashed, so you may want to check the logs for the root cause.",
self.path
)
}

View File

@@ -1,6 +1,8 @@
use std::ops::Range;
use crate::fastfield::{BitpackedFastFieldReader, FastFieldReader, FastValue, MultiValueLength};
use crate::fastfield::{
BitpackedFastFieldReader, DynamicFastFieldReader, FastFieldReader, FastValue, MultiValueLength,
};
use crate::DocId;
/// Reader for a multivalued `u64` fast field.
@@ -13,13 +15,13 @@ use crate::DocId;
///
#[derive(Clone)]
pub struct MultiValuedFastFieldReader<Item: FastValue> {
idx_reader: BitpackedFastFieldReader<u64>,
idx_reader: DynamicFastFieldReader<u64>,
vals_reader: BitpackedFastFieldReader<Item>,
}
impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
pub(crate) fn open(
idx_reader: BitpackedFastFieldReader<u64>,
idx_reader: DynamicFastFieldReader<u64>,
vals_reader: BitpackedFastFieldReader<Item>,
) -> MultiValuedFastFieldReader<Item> {
MultiValuedFastFieldReader {

View File

@@ -159,7 +159,11 @@ impl<Item: FastValue, C: FastFieldCodecReader> FastFieldReaderCodecWrapper<Item,
pub fn open(file: FileSlice) -> crate::Result<Self> {
let mut bytes = file.read_bytes()?;
let id = u8::deserialize(&mut bytes)?;
assert_eq!(BitpackedFastFieldSerializer::ID, id);
assert_eq!(
BitpackedFastFieldSerializer::ID,
id,
"Tried to open fast field as bitpacked encoded (id=1), but got serializer with different id"
);
Self::open_from_bytes(bytes)
}
/// Opens a fast field given the bytes.

View File

@@ -112,9 +112,8 @@ impl FastFieldReaders {
&self,
field: Field,
) -> crate::Result<MultiValuedFastFieldReader<TFastValue>> {
let fast_field_slice_idx = self.fast_field_data(field, 0)?;
let idx_reader = self.typed_fast_field_reader(field)?;
let fast_field_slice_vals = self.fast_field_data(field, 1)?;
let idx_reader = BitpackedFastFieldReader::open(fast_field_slice_idx)?;
let vals_reader: BitpackedFastFieldReader<TFastValue> =
BitpackedFastFieldReader::open(fast_field_slice_vals)?;
Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader))

View File

@@ -140,6 +140,7 @@ impl CompositeFastFieldSerializer {
panic!("unknown fastfield serializer {}", name)
}
};
field_write.flush()?;
Ok(())
}

View File

@@ -514,19 +514,21 @@ impl IndexMerger {
// Important: reader_and_field_accessors needs
// to have the same order as self.readers since SegmentReaderWithOrdinal
// is used to index the reader_and_field_accessors vec.
fn write_1_n_fast_field_idx_generic(
fn write_1_n_fast_field_idx_generic<T: MultiValueLength>(
field: Field,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
reader_and_field_accessors: &[(&SegmentReader, impl MultiValueLength)],
reader_and_field_accessors: &[(&SegmentReader, T)],
) -> crate::Result<()> {
let mut total_num_vals = 0u64;
// In the first pass, we compute the total number of vals.
//
// This is required by the bitpacker, as it needs to know
// which bit length should be used for bitpacking.
let mut idx_num_vals = 0;
for (reader, u64s_reader) in reader_and_field_accessors.iter() {
if let Some(delete_bitset) = reader.delete_bitset() {
idx_num_vals += reader.max_doc() as u64 - delete_bitset.len() as u64;
for doc in 0u32..reader.max_doc() {
if delete_bitset.is_alive(doc) {
let num_vals = u64s_reader.get_len(doc) as u64;
@@ -534,38 +536,60 @@ impl IndexMerger {
}
}
} else {
idx_num_vals += reader.max_doc() as u64;
total_num_vals += u64s_reader.get_total_len();
}
}
let stats = FastFieldStats {
max_value: total_num_vals,
num_vals: idx_num_vals,
min_value: 0,
};
// We can now create our `idx` serializer, and in a second pass,
// can effectively push the different indexes.
if let Some(doc_id_mapping) = doc_id_mapping {
let mut serialize_idx =
fast_field_serializer.new_u64_fast_field_with_idx(field, 0, total_num_vals, 0)?;
// copying into a temp vec is not ideal, but the fast field codec api requires random
// access, which is used in the estimation. It's possible to 1. calculate random
// access on the fly or 2. change the codec api to make random access optional, but
// they both also have major drawbacks.
let mut offsets = vec![];
let mut offset = 0;
for (doc_id, reader) in doc_id_mapping {
let reader = &reader_and_field_accessors[reader.ordinal as usize].1;
serialize_idx.add_val(offset)?;
offsets.push(offset);
offset += reader.get_len(*doc_id) as u64;
}
serialize_idx.add_val(offset as u64)?;
offsets.push(offset);
serialize_idx.close_field()?;
fast_field_serializer.create_auto_detect_u64_fast_field(
field,
stats,
&offsets,
offsets.iter().cloned(),
offsets.iter().cloned(),
)?;
} else {
let mut serialize_idx =
fast_field_serializer.new_u64_fast_field_with_idx(field, 0, total_num_vals, 0)?;
let mut idx = 0;
let mut offsets = vec![];
let mut offset = 0;
for (segment_reader, u64s_reader) in reader_and_field_accessors.iter() {
for doc in segment_reader.doc_ids_alive() {
serialize_idx.add_val(idx)?;
idx += u64s_reader.get_len(doc) as u64;
offsets.push(offset);
offset += u64s_reader.get_len(doc) as u64;
}
}
serialize_idx.add_val(idx)?;
serialize_idx.close_field()?;
offsets.push(offset);
fast_field_serializer.create_auto_detect_u64_fast_field(
field,
stats,
&offsets,
offsets.iter().cloned(),
offsets.iter().cloned(),
)?;
}
Ok(())
}
fn write_multi_value_fast_field_idx(