diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs index 3bf9e487e..a79ebc371 100644 --- a/fastfield_codecs/src/lib.rs +++ b/fastfield_codecs/src/lib.rs @@ -50,7 +50,7 @@ pub trait FastFieldCodecSerializer { } /// FastFieldDataAccess is the trait to access fast field data during serialization and estimation. -pub trait FastFieldDataAccess: Clone { +pub trait FastFieldDataAccess { /// Return the value associated to the given document. /// /// Whenever possible use the Iterator passed to the fastfield creation instead, for performance reasons. @@ -74,6 +74,12 @@ impl<'a> FastFieldDataAccess for &'a [u64] { } } +impl<'a> FastFieldDataAccess for &'a Vec { + fn get(&self, doc: u32) -> u64 { + self[doc as usize] + } +} + impl FastFieldDataAccess for Vec { fn get(&self, doc: u32) -> u64 { self[doc as usize] diff --git a/src/directory/ram_directory.rs b/src/directory/ram_directory.rs index 79e0152b6..3a3f38e06 100644 --- a/src/directory/ram_directory.rs +++ b/src/directory/ram_directory.rs @@ -46,7 +46,7 @@ impl Drop for VecWriter { fn drop(&mut self) { if !self.is_flushed { panic!( - "You forgot to flush {:?} before its writter got Drop. Do not rely on drop.", + "You forgot to flush {:?} before its writter got Drop. Do not rely on drop. This also occurs when the indexer crashed, so you may want to check the logs for the root cause.", self.path ) } diff --git a/src/fastfield/multivalued/reader.rs b/src/fastfield/multivalued/reader.rs index 8e917203d..99fb5d13c 100644 --- a/src/fastfield/multivalued/reader.rs +++ b/src/fastfield/multivalued/reader.rs @@ -1,6 +1,8 @@ use std::ops::Range; -use crate::fastfield::{BitpackedFastFieldReader, FastFieldReader, FastValue, MultiValueLength}; +use crate::fastfield::{ + BitpackedFastFieldReader, DynamicFastFieldReader, FastFieldReader, FastValue, MultiValueLength, +}; use crate::DocId; /// Reader for a multivalued `u64` fast field. 
@@ -13,13 +15,13 @@ use crate::DocId; /// #[derive(Clone)] pub struct MultiValuedFastFieldReader { - idx_reader: BitpackedFastFieldReader, + idx_reader: DynamicFastFieldReader, vals_reader: BitpackedFastFieldReader, } impl MultiValuedFastFieldReader { pub(crate) fn open( - idx_reader: BitpackedFastFieldReader, + idx_reader: DynamicFastFieldReader, vals_reader: BitpackedFastFieldReader, ) -> MultiValuedFastFieldReader { MultiValuedFastFieldReader { diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index 70942d190..9d798787f 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -159,7 +159,11 @@ impl FastFieldReaderCodecWrapper crate::Result { let mut bytes = file.read_bytes()?; let id = u8::deserialize(&mut bytes)?; - assert_eq!(BitpackedFastFieldSerializer::ID, id); + assert_eq!( + BitpackedFastFieldSerializer::ID, + id, + "Tried to open fast field as bitpacked encoded (id=1), but got serializer with different id" + ); Self::open_from_bytes(bytes) } /// Opens a fast field given the bytes. 
diff --git a/src/fastfield/readers.rs b/src/fastfield/readers.rs index af8bdf949..24c0d0a6e 100644 --- a/src/fastfield/readers.rs +++ b/src/fastfield/readers.rs @@ -112,9 +112,8 @@ impl FastFieldReaders { &self, field: Field, ) -> crate::Result> { - let fast_field_slice_idx = self.fast_field_data(field, 0)?; + let idx_reader = self.typed_fast_field_reader(field)?; let fast_field_slice_vals = self.fast_field_data(field, 1)?; - let idx_reader = BitpackedFastFieldReader::open(fast_field_slice_idx)?; let vals_reader: BitpackedFastFieldReader = BitpackedFastFieldReader::open(fast_field_slice_vals)?; Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader)) diff --git a/src/fastfield/serializer/mod.rs b/src/fastfield/serializer/mod.rs index 956c23eea..d297c65ff 100644 --- a/src/fastfield/serializer/mod.rs +++ b/src/fastfield/serializer/mod.rs @@ -140,6 +140,7 @@ impl CompositeFastFieldSerializer { panic!("unknown fastfield serializer {}", name) } }; + field_write.flush()?; Ok(()) } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index e6fcfa573..e6b683f5f 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -514,19 +514,21 @@ impl IndexMerger { // Important: reader_and_field_accessor needs // to have the same order as self.readers since ReaderWithOrdinal // is used to index the reader_and_field_accessors vec. - fn write_1_n_fast_field_idx_generic( + fn write_1_n_fast_field_idx_generic( field: Field, fast_field_serializer: &mut CompositeFastFieldSerializer, doc_id_mapping: &Option>, - reader_and_field_accessors: &[(&SegmentReader, impl MultiValueLength)], + reader_and_field_accessors: &[(&SegmentReader, T)], ) -> crate::Result<()> { let mut total_num_vals = 0u64; // In the first pass, we compute the total number of vals. // // This is required by the bitpacker, as it needs to know // what should be the bit length use for bitpacking. 
+ let mut idx_num_vals = 0; for (reader, u64s_reader) in reader_and_field_accessors.iter() { if let Some(delete_bitset) = reader.delete_bitset() { + idx_num_vals += reader.max_doc() as u64 - delete_bitset.len() as u64; for doc in 0u32..reader.max_doc() { if delete_bitset.is_alive(doc) { let num_vals = u64s_reader.get_len(doc) as u64; @@ -534,38 +536,60 @@ impl IndexMerger { } } } else { + idx_num_vals += reader.max_doc() as u64; total_num_vals += u64s_reader.get_total_len(); } } + let stats = FastFieldStats { + max_value: total_num_vals, + num_vals: idx_num_vals, + min_value: 0, + }; // We can now create our `idx` serializer, and in a second pass, // can effectively push the different indexes. if let Some(doc_id_mapping) = doc_id_mapping { - let mut serialize_idx = - fast_field_serializer.new_u64_fast_field_with_idx(field, 0, total_num_vals, 0)?; + // copying into a temp vec is not ideal, but the fast field codec api requires random + // access, which is used in the estimation. It's possible to 1. calculate random + // access on the fly or 2. change the codec api to make random access optional, but + // both also have major drawbacks. 
+ let mut offsets = vec![]; let mut offset = 0; for (doc_id, reader) in doc_id_mapping { let reader = &reader_and_field_accessors[reader.ordinal as usize].1; - serialize_idx.add_val(offset)?; + offsets.push(offset); offset += reader.get_len(*doc_id) as u64; } - serialize_idx.add_val(offset as u64)?; + offsets.push(offset); - serialize_idx.close_field()?; + fast_field_serializer.create_auto_detect_u64_fast_field( + field, + stats, + &offsets, + offsets.iter().cloned(), + offsets.iter().cloned(), + )?; } else { - let mut serialize_idx = - fast_field_serializer.new_u64_fast_field_with_idx(field, 0, total_num_vals, 0)?; - let mut idx = 0; + let mut offsets = vec![]; + let mut offset = 0; for (segment_reader, u64s_reader) in reader_and_field_accessors.iter() { for doc in segment_reader.doc_ids_alive() { - serialize_idx.add_val(idx)?; - idx += u64s_reader.get_len(doc) as u64; + offsets.push(offset); + offset += u64s_reader.get_len(doc) as u64; } } - serialize_idx.add_val(idx)?; - serialize_idx.close_field()?; + offsets.push(offset); + + fast_field_serializer.create_auto_detect_u64_fast_field( + field, + stats, + &offsets, + offsets.iter().cloned(), + offsets.iter().cloned(), + )?; } + Ok(()) } fn write_multi_value_fast_field_idx(