mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-08 18:12:55 +00:00
use dynamic fast field codec for offset index
This commit is contained in:
@@ -50,7 +50,7 @@ pub trait FastFieldCodecSerializer {
|
||||
}
|
||||
|
||||
/// FastFieldDataAccess is the trait to access fast field data during serialization and estimation.
|
||||
pub trait FastFieldDataAccess: Clone {
|
||||
pub trait FastFieldDataAccess {
|
||||
/// Return the value associated to the given document.
|
||||
///
|
||||
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance reasons.
|
||||
@@ -74,6 +74,12 @@ impl<'a> FastFieldDataAccess for &'a [u64] {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> FastFieldDataAccess for &'a Vec<u64> {
|
||||
fn get(&self, doc: u32) -> u64 {
|
||||
self[doc as usize]
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldDataAccess for Vec<u64> {
|
||||
fn get(&self, doc: u32) -> u64 {
|
||||
self[doc as usize]
|
||||
|
||||
@@ -46,7 +46,7 @@ impl Drop for VecWriter {
|
||||
fn drop(&mut self) {
|
||||
if !self.is_flushed {
|
||||
panic!(
|
||||
"You forgot to flush {:?} before its writter got Drop. Do not rely on drop.",
|
||||
"You forgot to flush {:?} before its writter got Drop. Do not rely on drop. This also occurs when the indexer crashed, so you may want to check the logs for the root cause.",
|
||||
self.path
|
||||
)
|
||||
}
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
use std::ops::Range;
|
||||
|
||||
use crate::fastfield::{BitpackedFastFieldReader, FastFieldReader, FastValue, MultiValueLength};
|
||||
use crate::fastfield::{
|
||||
BitpackedFastFieldReader, DynamicFastFieldReader, FastFieldReader, FastValue, MultiValueLength,
|
||||
};
|
||||
use crate::DocId;
|
||||
|
||||
/// Reader for a multivalued `u64` fast field.
|
||||
@@ -13,13 +15,13 @@ use crate::DocId;
|
||||
///
|
||||
#[derive(Clone)]
|
||||
pub struct MultiValuedFastFieldReader<Item: FastValue> {
|
||||
idx_reader: BitpackedFastFieldReader<u64>,
|
||||
idx_reader: DynamicFastFieldReader<u64>,
|
||||
vals_reader: BitpackedFastFieldReader<Item>,
|
||||
}
|
||||
|
||||
impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
|
||||
pub(crate) fn open(
|
||||
idx_reader: BitpackedFastFieldReader<u64>,
|
||||
idx_reader: DynamicFastFieldReader<u64>,
|
||||
vals_reader: BitpackedFastFieldReader<Item>,
|
||||
) -> MultiValuedFastFieldReader<Item> {
|
||||
MultiValuedFastFieldReader {
|
||||
|
||||
@@ -159,7 +159,11 @@ impl<Item: FastValue, C: FastFieldCodecReader> FastFieldReaderCodecWrapper<Item,
|
||||
pub fn open(file: FileSlice) -> crate::Result<Self> {
|
||||
let mut bytes = file.read_bytes()?;
|
||||
let id = u8::deserialize(&mut bytes)?;
|
||||
assert_eq!(BitpackedFastFieldSerializer::ID, id);
|
||||
assert_eq!(
|
||||
BitpackedFastFieldSerializer::ID,
|
||||
id,
|
||||
"Tried to open fast field as bitpacked encoded (id=1), but got serializer with different id"
|
||||
);
|
||||
Self::open_from_bytes(bytes)
|
||||
}
|
||||
/// Opens a fast field given the bytes.
|
||||
|
||||
@@ -112,9 +112,8 @@ impl FastFieldReaders {
|
||||
&self,
|
||||
field: Field,
|
||||
) -> crate::Result<MultiValuedFastFieldReader<TFastValue>> {
|
||||
let fast_field_slice_idx = self.fast_field_data(field, 0)?;
|
||||
let idx_reader = self.typed_fast_field_reader(field)?;
|
||||
let fast_field_slice_vals = self.fast_field_data(field, 1)?;
|
||||
let idx_reader = BitpackedFastFieldReader::open(fast_field_slice_idx)?;
|
||||
let vals_reader: BitpackedFastFieldReader<TFastValue> =
|
||||
BitpackedFastFieldReader::open(fast_field_slice_vals)?;
|
||||
Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader))
|
||||
|
||||
@@ -140,6 +140,7 @@ impl CompositeFastFieldSerializer {
|
||||
panic!("unknown fastfield serializer {}", name)
|
||||
}
|
||||
};
|
||||
field_write.flush()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -514,19 +514,21 @@ impl IndexMerger {
|
||||
// Important: reader_and_field_accessor needs
|
||||
// to have the same order as self.readers since ReaderWithOrdinal
|
||||
// is used to index the reader_and_field_accessors vec.
|
||||
fn write_1_n_fast_field_idx_generic(
|
||||
fn write_1_n_fast_field_idx_generic<T: MultiValueLength>(
|
||||
field: Field,
|
||||
fast_field_serializer: &mut CompositeFastFieldSerializer,
|
||||
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
|
||||
reader_and_field_accessors: &[(&SegmentReader, impl MultiValueLength)],
|
||||
reader_and_field_accessors: &[(&SegmentReader, T)],
|
||||
) -> crate::Result<()> {
|
||||
let mut total_num_vals = 0u64;
|
||||
// In the first pass, we compute the total number of vals.
|
||||
//
|
||||
// This is required by the bitpacker, as it needs to know
|
||||
// what bit length should be used for bitpacking.
|
||||
let mut idx_num_vals = 0;
|
||||
for (reader, u64s_reader) in reader_and_field_accessors.iter() {
|
||||
if let Some(delete_bitset) = reader.delete_bitset() {
|
||||
idx_num_vals += reader.max_doc() as u64 - delete_bitset.len() as u64;
|
||||
for doc in 0u32..reader.max_doc() {
|
||||
if delete_bitset.is_alive(doc) {
|
||||
let num_vals = u64s_reader.get_len(doc) as u64;
|
||||
@@ -534,38 +536,60 @@ impl IndexMerger {
|
||||
}
|
||||
}
|
||||
} else {
|
||||
idx_num_vals += reader.max_doc() as u64;
|
||||
total_num_vals += u64s_reader.get_total_len();
|
||||
}
|
||||
}
|
||||
|
||||
let stats = FastFieldStats {
|
||||
max_value: total_num_vals,
|
||||
num_vals: idx_num_vals,
|
||||
min_value: 0,
|
||||
};
|
||||
// We can now create our `idx` serializer, and in a second pass,
|
||||
// can effectively push the different indexes.
|
||||
if let Some(doc_id_mapping) = doc_id_mapping {
|
||||
let mut serialize_idx =
|
||||
fast_field_serializer.new_u64_fast_field_with_idx(field, 0, total_num_vals, 0)?;
|
||||
// copying into a temp vec is not ideal, but the fast field codec api requires random
|
||||
// access, which is used in the estimation. It's possible to 1. calculate random
|
||||
// access on the fly or 2. change the codec api to make random access optional, but
|
||||
// both also have major drawbacks.
|
||||
|
||||
let mut offsets = vec![];
|
||||
let mut offset = 0;
|
||||
for (doc_id, reader) in doc_id_mapping {
|
||||
let reader = &reader_and_field_accessors[reader.ordinal as usize].1;
|
||||
serialize_idx.add_val(offset)?;
|
||||
offsets.push(offset);
|
||||
offset += reader.get_len(*doc_id) as u64;
|
||||
}
|
||||
serialize_idx.add_val(offset as u64)?;
|
||||
offsets.push(offset);
|
||||
|
||||
serialize_idx.close_field()?;
|
||||
fast_field_serializer.create_auto_detect_u64_fast_field(
|
||||
field,
|
||||
stats,
|
||||
&offsets,
|
||||
offsets.iter().cloned(),
|
||||
offsets.iter().cloned(),
|
||||
)?;
|
||||
} else {
|
||||
let mut serialize_idx =
|
||||
fast_field_serializer.new_u64_fast_field_with_idx(field, 0, total_num_vals, 0)?;
|
||||
let mut idx = 0;
|
||||
let mut offsets = vec![];
|
||||
let mut offset = 0;
|
||||
for (segment_reader, u64s_reader) in reader_and_field_accessors.iter() {
|
||||
for doc in segment_reader.doc_ids_alive() {
|
||||
serialize_idx.add_val(idx)?;
|
||||
idx += u64s_reader.get_len(doc) as u64;
|
||||
offsets.push(offset);
|
||||
offset += u64s_reader.get_len(doc) as u64;
|
||||
}
|
||||
}
|
||||
serialize_idx.add_val(idx)?;
|
||||
serialize_idx.close_field()?;
|
||||
offsets.push(offset);
|
||||
|
||||
fast_field_serializer.create_auto_detect_u64_fast_field(
|
||||
field,
|
||||
stats,
|
||||
&offsets,
|
||||
offsets.iter().cloned(),
|
||||
offsets.iter().cloned(),
|
||||
)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
fn write_multi_value_fast_field_idx(
|
||||
|
||||
Reference in New Issue
Block a user