diff --git a/fastfield_codecs/src/column.rs b/fastfield_codecs/src/column.rs index a7a35092f..492c34dc7 100644 --- a/fastfield_codecs/src/column.rs +++ b/fastfield_codecs/src/column.rs @@ -240,6 +240,7 @@ where // and we do not have any specialized implementation anyway. } +/// Wraps anything that returns an iterator into a `Column`. pub struct IterColumn(T); impl From for IterColumn diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs index 552ae7057..ef8bd6e0a 100644 --- a/fastfield_codecs/src/lib.rs +++ b/fastfield_codecs/src/lib.rs @@ -41,7 +41,7 @@ mod serialize; use self::bitpacked::BitpackedCodec; use self::blockwise_linear::BlockwiseLinearCodec; -pub use self::column::{monotonic_map_column, Column, VecColumn}; +pub use self::column::{monotonic_map_column, Column, IterColumn, VecColumn}; use self::linear::LinearCodec; pub use self::monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn}; pub use self::monotonic_mapping_u128::MonotonicallyMappableToU128; diff --git a/src/fastfield/bytes/reader.rs b/src/fastfield/bytes/reader.rs index c4a4f2ffc..3b217b926 100644 --- a/src/fastfield/bytes/reader.rs +++ b/src/fastfield/bytes/reader.rs @@ -1,10 +1,9 @@ -use std::ops::Range; use std::sync::Arc; use fastfield_codecs::Column; use crate::directory::{FileSlice, OwnedBytes}; -use crate::fastfield::MultiValueLength; +use crate::fastfield::MultiValueIndex; use crate::DocId; /// Reader for byte array fast fields @@ -19,7 +18,7 @@ use crate::DocId; /// and the start index for the next document, and keeping the bytes in between. #[derive(Clone)] pub struct BytesFastFieldReader { - idx_reader: Arc>, + idx_reader: MultiValueIndex, values: OwnedBytes, } @@ -29,24 +28,26 @@ impl BytesFastFieldReader { values_file: FileSlice, ) -> crate::Result { let values = values_file.read_bytes()?; - Ok(BytesFastFieldReader { idx_reader, values }) + Ok(BytesFastFieldReader { + idx_reader: MultiValueIndex::new(idx_reader), + values, + }) } - fn range(&self, doc: DocId) -> Range { - let start = self.idx_reader.get_val(doc) as u32; - let end = self.idx_reader.get_val(doc + 1) as u32; - start..end + /// returns the multivalue index + pub fn get_index_reader(&self) -> &MultiValueIndex { + &self.idx_reader } /// Returns the bytes associated with the given `doc` pub fn get_bytes(&self, doc: DocId) -> &[u8] { - let range = self.range(doc); + let range = self.idx_reader.range(doc); &self.values.as_slice()[range.start as usize..range.end as usize] } /// Returns the length of the bytes associated with the given `doc` pub fn num_bytes(&self, doc: DocId) -> u64 { - let range = self.range(doc); + let range = self.idx_reader.range(doc); (range.end - range.start) as u64 } @@ -55,15 +56,3 @@ impl BytesFastFieldReader { self.values.len() as u64 } } - -impl MultiValueLength for BytesFastFieldReader { - fn get_range(&self, doc_id: DocId) -> std::ops::Range { - self.range(doc_id) - } - fn get_len(&self, doc_id: DocId) -> u64 { - self.num_bytes(doc_id) - } - fn get_total_len(&self) -> u64 { - self.total_num_bytes() - } -} diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 5bd90f6ce..a29ee238c 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -27,8 +27,8 @@ pub use self::error::{FastFieldNotAvailableError, Result}; pub use self::facet_reader::FacetReader; pub(crate) use self::multivalued::{get_fastfield_codecs_for_multivalue, MultivalueStartIndex}; pub use self::multivalued::{ - MultiValueU128FastFieldWriter, MultiValuedFastFieldReader, MultiValuedFastFieldWriter, - MultiValuedU128FastFieldReader, + MultiValueIndex, MultiValueU128FastFieldWriter, MultiValuedFastFieldReader, + MultiValuedFastFieldWriter, MultiValuedU128FastFieldReader, }; pub use self::readers::FastFieldReaders; pub(crate) use self::readers::{type_and_cardinality, FastType}; @@ -36,7 +36,7 @@ pub use self::serializer::{Column, CompositeFastFieldSerializer}; use self::writer::unexpected_value; pub use self::writer::{FastFieldsWriter, IntFastFieldWriter}; use crate::schema::{Type, Value}; -use crate::{DateTime, DocId}; +use crate::DateTime; mod alive_bitset; mod bytes; @@ -47,17 +47,6 @@ mod readers; mod serializer; mod writer; -/// Trait for `BytesFastFieldReader` and `MultiValuedFastFieldReader` to return the length of data -/// for a doc_id -pub trait MultiValueLength { - /// returns the positions for a docid - fn get_range(&self, doc_id: DocId) -> std::ops::Range; - /// returns the num of values associated with a doc_id - fn get_len(&self, doc_id: DocId) -> u64; - /// returns the sum of num values for all doc_ids - fn get_total_len(&self) -> u64; -} - /// Trait for types that are allowed for fast fields: /// (u64, i64 and f64, bool, DateTime). pub trait FastValue: diff --git a/src/fastfield/multivalued/index.rs b/src/fastfield/multivalued/index.rs new file mode 100644 index 000000000..6f77da34a --- /dev/null +++ b/src/fastfield/multivalued/index.rs @@ -0,0 +1,110 @@ +use std::ops::Range; +use std::sync::Arc; + +use fastfield_codecs::Column; + +use crate::DocId; + +#[derive(Clone)] +/// Index to resolve value range for given doc_id. +/// Starts at 0. +pub struct MultiValueIndex { + idx: Arc>, +} + +impl MultiValueIndex { + pub(crate) fn new(idx: Arc>) -> Self { + Self { idx } + } + + /// Returns `[start, end)`, such that the values associated with + /// the given document are `start..end`. + #[inline] + pub(crate) fn range(&self, doc: DocId) -> Range { + let start = self.idx.get_val(doc) as u32; + let end = self.idx.get_val(doc + 1) as u32; + start..end + } + + /// returns the num of values associated with a doc_id + pub(crate) fn num_vals_for_doc(&self, doc: DocId) -> u32 { + let range = self.range(doc); + range.end - range.start + } + + /// Returns the overall number of values in this field. + #[inline] + pub fn total_num_vals(&self) -> u64 { + self.idx.max_value() + } + + /// Returns the number of documents in the index. + #[inline] + pub fn num_docs(&self) -> u32 { + self.idx.num_vals() - 1 + } + + /// Converts a list of positions of values in a 1:n index to the corresponding list of DocIds. + /// + /// Since there is no index for value pos -> docid, but docid -> value pos range, we scan the + /// index. + /// + /// Correctness: positions needs to be sorted. idx_reader needs to contain monotonically + /// increasing positions. + /// + /// TODO: Instead of a linear scan we can employ a exponential search into binary search to + /// match a docid to its value position. + pub(crate) fn positions_to_docids(&self, docid_start: u32, positions: &[u32]) -> Vec { + let mut docs = vec![]; + let mut cur_doc = docid_start; + let mut last_doc = None; + + for pos in positions { + loop { + let end = self.idx.get_val(cur_doc + 1) as u32; + if end > *pos { + // avoid duplicates + if Some(cur_doc) == last_doc { + break; + } + docs.push(cur_doc); + last_doc = Some(cur_doc); + break; + } + cur_doc += 1; + } + } + + docs + } +} + +#[cfg(test)] +mod tests { + + use std::sync::Arc; + + use fastfield_codecs::IterColumn; + + use crate::fastfield::MultiValueIndex; + + #[test] + fn test_positions_to_docid() { + let offsets = vec![0, 10, 12, 15, 22, 23]; // docid values are [0..10, 10..12, 12..15, etc.] + let column = IterColumn::from(offsets.into_iter()); + let index = MultiValueIndex::new(Arc::new(column)); + assert_eq!(index.num_docs(), 5); + { + let positions = vec![10u32, 11, 15, 20, 21, 22]; + + assert_eq!(index.positions_to_docids(0, &positions), vec![1, 3, 4]); + assert_eq!(index.positions_to_docids(1, &positions), vec![1, 3, 4]); + assert_eq!(index.positions_to_docids(0, &[9]), vec![0]); + assert_eq!(index.positions_to_docids(1, &[10]), vec![1]); + assert_eq!(index.positions_to_docids(1, &[11]), vec![1]); + assert_eq!(index.positions_to_docids(2, &[12]), vec![2]); + assert_eq!(index.positions_to_docids(2, &[12, 14]), vec![2]); + assert_eq!(index.positions_to_docids(2, &[12, 14, 15]), vec![2, 3]); + } + } +} diff --git a/src/fastfield/multivalued/mod.rs b/src/fastfield/multivalued/mod.rs index 0437ef491..e4e6ede70 100644 --- a/src/fastfield/multivalued/mod.rs +++ b/src/fastfield/multivalued/mod.rs @@ -1,7 +1,9 @@ +mod index; mod reader; mod writer; use fastfield_codecs::FastFieldCodecType; +pub use index::MultiValueIndex; pub use self::reader::{MultiValuedFastFieldReader, MultiValuedU128FastFieldReader}; pub(crate) use self::writer::MultivalueStartIndex; diff --git a/src/fastfield/multivalued/reader.rs b/src/fastfield/multivalued/reader.rs index 2894bee38..7d2eb1cf5 100644 --- a/src/fastfield/multivalued/reader.rs +++ b/src/fastfield/multivalued/reader.rs @@ -3,7 +3,8 @@ use std::sync::Arc; use fastfield_codecs::{Column, MonotonicallyMappableToU128}; -use crate::fastfield::{FastValue, MultiValueLength}; +use super::MultiValueIndex; +use crate::fastfield::FastValue; use crate::DocId; /// Reader for a multivalued `u64` fast field. @@ -15,7 +16,7 @@ use crate::DocId; /// The `idx_reader` associated, for each document, the index of its first value. #[derive(Clone)] pub struct MultiValuedFastFieldReader { - idx_reader: Arc>, + idx_reader: MultiValueIndex, vals_reader: Arc>, } @@ -25,20 +26,11 @@ impl MultiValuedFastFieldReader { vals_reader: Arc>, ) -> MultiValuedFastFieldReader { MultiValuedFastFieldReader { - idx_reader, + idx_reader: MultiValueIndex::new(idx_reader), vals_reader, } } - /// Returns `[start, end)`, such that the values associated with - /// the given document are `start..end`. - #[inline] - fn range(&self, doc: DocId) -> Range { - let start = self.idx_reader.get_val(doc) as u32; - let end = self.idx_reader.get_val(doc + 1) as u32; - start..end - } - /// Returns the array of values associated with the given `doc`. #[inline] fn get_vals_for_range(&self, range: Range, vals: &mut Vec) { @@ -51,10 +43,15 @@ impl MultiValuedFastFieldReader { /// Returns the array of values associated with the given `doc`. #[inline] pub fn get_vals(&self, doc: DocId, vals: &mut Vec) { - let range = self.range(doc); + let range = self.idx_reader.range(doc); self.get_vals_for_range(range, vals); } + /// returns the multivalue index + pub fn get_index_reader(&self) -> &MultiValueIndex { + &self.idx_reader + } + /// Returns the minimum value for this fast field. /// /// The min value does not take in account of possible @@ -75,28 +72,14 @@ impl MultiValuedFastFieldReader { /// Returns the number of values associated with the document `DocId`. #[inline] - pub fn num_vals(&self, doc: DocId) -> usize { - let range = self.range(doc); - (range.end - range.start) as usize + pub fn num_vals(&self, doc: DocId) -> u32 { + self.idx_reader.num_vals_for_doc(doc) } - /// Returns the overall number of values in this field . + /// Returns the overall number of values in this field. #[inline] pub fn total_num_vals(&self) -> u64 { - self.idx_reader.max_value() - } -} - -impl MultiValueLength for MultiValuedFastFieldReader { - fn get_range(&self, doc_id: DocId) -> Range { - self.range(doc_id) - } - fn get_len(&self, doc_id: DocId) -> u64 { - self.num_vals(doc_id) as u64 - } - - fn get_total_len(&self) -> u64 { - self.total_num_vals() as u64 + self.idx_reader.total_num_vals() } } @@ -109,7 +92,7 @@ impl MultiValueLength for MultiValuedFastFieldReader { /// The `idx_reader` associated, for each document, the index of its first value. #[derive(Clone)] pub struct MultiValuedU128FastFieldReader { - idx_reader: Arc>, + idx_reader: MultiValueIndex, vals_reader: Arc>, } @@ -119,24 +102,15 @@ impl MultiValuedU128FastFieldReader { vals_reader: Arc>, ) -> MultiValuedU128FastFieldReader { Self { - idx_reader, + idx_reader: MultiValueIndex::new(idx_reader), vals_reader, } } - /// Returns `[start, end)`, such that the values associated - /// to the given document are `start..end`. - #[inline] - fn range(&self, doc: DocId) -> Range { - let start = self.idx_reader.get_val(doc) as u32; - let end = self.idx_reader.get_val(doc + 1) as u32; - start..end - } - /// Returns the array of values associated to the given `doc`. #[inline] pub fn get_first_val(&self, doc: DocId) -> Option { - let range = self.range(doc); + let range = self.idx_reader.range(doc); if range.is_empty() { return None; } @@ -152,10 +126,15 @@ impl MultiValuedU128FastFieldReader { .get_range(range.start as u64, &mut vals[..]); } + /// Returns the index reader + pub fn get_index_reader(&self) -> &MultiValueIndex { + &self.idx_reader + } + /// Returns the array of values associated to the given `doc`. #[inline] pub fn get_vals(&self, doc: DocId, vals: &mut Vec) { - let range = self.range(doc); + let range = self.idx_reader.range(doc); self.get_vals_for_range(range, vals); } @@ -165,11 +144,11 @@ impl MultiValuedU128FastFieldReader { value_range: RangeInclusive, doc_id_range: Range, ) -> Vec { - let mut positions = Vec::new(); // TODO replace + let mut positions = Vec::new(); self.vals_reader .get_positions_for_value_range(value_range, doc_id_range, &mut positions); - positions_to_docids(&positions, self.idx_reader.as_ref()) + self.idx_reader.positions_to_docids(0, &positions) } /// Iterates over all elements in the fast field @@ -197,85 +176,23 @@ impl MultiValuedU128FastFieldReader { /// Returns the number of values associated with the document `DocId`. #[inline] - pub fn num_vals(&self, doc: DocId) -> usize { - let range = self.range(doc); - (range.end - range.start) as usize + pub fn num_vals(&self, doc: DocId) -> u32 { + self.idx_reader.num_vals_for_doc(doc) } /// Returns the overall number of values in this field. #[inline] pub fn total_num_vals(&self) -> u64 { - self.idx_reader.max_value() + self.idx_reader.total_num_vals() } } -impl MultiValueLength for MultiValuedU128FastFieldReader { - fn get_range(&self, doc_id: DocId) -> std::ops::Range { - self.range(doc_id) - } - fn get_len(&self, doc_id: DocId) -> u64 { - self.num_vals(doc_id) as u64 - } - fn get_total_len(&self) -> u64 { - self.total_num_vals() as u64 - } -} - -/// Converts a list of positions of values in a 1:n index to the corresponding list of DocIds. -/// -/// Since there is no index for value pos -> docid, but docid -> value pos range, we scan the index. -/// -/// Correctness: positions needs to be sorted. idx_reader needs to contain monotonically increasing -/// positions. -/// -/// TODO: Instead of a linear scan we can employ a expotential search into binary search to match a -/// docid to its value position. -fn positions_to_docids(positions: &[u32], idx_reader: &C) -> Vec { - let mut docs = vec![]; - let mut cur_doc = 0u32; - let mut last_doc = None; - - for pos in positions { - loop { - let end = idx_reader.get_val(cur_doc + 1) as u32; - if end > *pos { - // avoid duplicates - if Some(cur_doc) == last_doc { - break; - } - docs.push(cur_doc); - last_doc = Some(cur_doc); - break; - } - cur_doc += 1; - } - } - - docs -} - #[cfg(test)] mod tests { - use fastfield_codecs::VecColumn; - use crate::core::Index; - use crate::fastfield::multivalued::reader::positions_to_docids; use crate::schema::{Cardinality, Facet, FacetOptions, NumericOptions, Schema}; - #[test] - fn test_positions_to_docid() { - let positions = vec![10u32, 11, 15, 20, 21, 22]; - - let offsets = vec![0, 10, 12, 15, 22, 23]; - { - let column = VecColumn::from(&offsets); - - let docids = positions_to_docids(&positions, &column); - assert_eq!(docids, vec![1, 3, 4]); - } - } - #[test] fn test_multifastfield_reader() -> crate::Result<()> { let mut schema_builder = Schema::builder(); diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 26ec45106..d47a00b5e 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -13,7 +13,7 @@ use crate::docset::{DocSet, TERMINATED}; use crate::error::DataCorruption; use crate::fastfield::{ get_fastfield_codecs_for_multivalue, AliveBitSet, Column, CompositeFastFieldSerializer, - MultiValueLength, MultiValuedFastFieldReader, MultiValuedU128FastFieldReader, + MultiValueIndex, MultiValuedFastFieldReader, MultiValuedU128FastFieldReader, }; use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter}; use crate::indexer::doc_id_mapping::{expect_field_id_for_sort_field, SegmentDocIdMapping}; @@ -348,7 +348,12 @@ impl IndexMerger { field, fast_field_serializer, doc_id_mapping, - &segment_and_ff_readers, + &segment_and_ff_readers + .iter() + .map(|(segment_reader, u64s_reader)| { + (*segment_reader, u64s_reader.get_index_reader()) + }) + .collect::>(), )?; let fast_field_readers = segment_and_ff_readers @@ -529,11 +534,11 @@ impl IndexMerger { // Creating the index file to point into the data, generic over `BytesFastFieldReader` and // `MultiValuedFastFieldReader` // - fn write_1_n_fast_field_idx_generic( + fn write_1_n_fast_field_idx_generic( field: Field, fast_field_serializer: &mut CompositeFastFieldSerializer, doc_id_mapping: &SegmentDocIdMapping, - segment_and_ff_readers: &[(&SegmentReader, T)], + segment_and_ff_readers: &[(&SegmentReader, &MultiValueIndex)], ) -> crate::Result<()> { let column = RemappedDocIdMultiValueIndexColumn::new(segment_and_ff_readers, doc_id_mapping); @@ -567,7 +572,12 @@ impl IndexMerger { field, fast_field_serializer, doc_id_mapping, - &segment_and_ff_readers, + &segment_and_ff_readers + .iter() + .map(|(segment_reader, u64s_reader)| { + (*segment_reader, u64s_reader.get_index_reader()) + }) + .collect::>(), ) } @@ -697,7 +707,12 @@ impl IndexMerger { field, fast_field_serializer, doc_id_mapping, - &segment_and_ff_readers, + &segment_and_ff_readers + .iter() + .map(|(segment_reader, u64s_reader)| { + (*segment_reader, u64s_reader.get_index_reader()) + }) + .collect::>(), )?; let mut serialize_vals = fast_field_serializer.new_bytes_fast_field(field); diff --git a/src/indexer/sorted_doc_id_multivalue_column.rs b/src/indexer/sorted_doc_id_multivalue_column.rs index 1886a69b1..54f3935a5 100644 --- a/src/indexer/sorted_doc_id_multivalue_column.rs +++ b/src/indexer/sorted_doc_id_multivalue_column.rs @@ -3,7 +3,7 @@ use std::cmp; use fastfield_codecs::Column; use super::flat_map_with_buffer::FlatMapWithBufferIter; -use crate::fastfield::{MultiValueLength, MultiValuedFastFieldReader}; +use crate::fastfield::{MultiValueIndex, MultiValuedFastFieldReader}; use crate::indexer::doc_id_mapping::SegmentDocIdMapping; use crate::schema::Field; use crate::{DocAddress, SegmentReader}; @@ -94,17 +94,17 @@ impl<'a> Column for RemappedDocIdMultiValueColumn<'a> { } } -pub(crate) struct RemappedDocIdMultiValueIndexColumn<'a, T: MultiValueLength> { +pub(crate) struct RemappedDocIdMultiValueIndexColumn<'a> { doc_id_mapping: &'a SegmentDocIdMapping, - multi_value_length_readers: Vec<&'a T>, + multi_value_length_readers: Vec<&'a MultiValueIndex>, min_value: u64, max_value: u64, num_vals: u32, } -impl<'a, T: MultiValueLength> RemappedDocIdMultiValueIndexColumn<'a, T> { +impl<'a> RemappedDocIdMultiValueIndexColumn<'a> { pub(crate) fn new( - segment_and_ff_readers: &'a [(&'a SegmentReader, T)], + segment_and_ff_readers: &'a [(&'a SegmentReader, &'a MultiValueIndex)], doc_id_mapping: &'a SegmentDocIdMapping, ) -> Self { // We go through a complete first pass to compute the minimum and the @@ -115,12 +115,12 @@ impl<'a, T: MultiValueLength> RemappedDocIdMultiValueIndexColumn<'a, T> { let mut multi_value_length_readers = Vec::with_capacity(segment_and_ff_readers.len()); for segment_and_ff_reader in segment_and_ff_readers { let segment_reader = segment_and_ff_reader.0; - let multi_value_length_reader = &segment_and_ff_reader.1; + let multi_value_length_reader = segment_and_ff_reader.1; if !segment_reader.has_deletes() { - max_value += multi_value_length_reader.get_total_len(); + max_value += multi_value_length_reader.total_num_vals(); } else { for doc in segment_reader.doc_ids_alive() { - max_value += multi_value_length_reader.get_len(doc); + max_value += multi_value_length_reader.num_vals_for_doc(doc) as u64; } } num_vals += segment_reader.num_docs(); @@ -136,7 +136,7 @@ impl<'a, T: MultiValueLength> RemappedDocIdMultiValueIndexColumn<'a, T> { } } -impl<'a, T: MultiValueLength + Send + Sync> Column for RemappedDocIdMultiValueIndexColumn<'a, T> { +impl<'a> Column for RemappedDocIdMultiValueIndexColumn<'a> { fn get_val(&self, _pos: u32) -> u64 { unimplemented!() } @@ -148,8 +148,8 @@ impl<'a, T: MultiValueLength + Send + Sync> Column for RemappedDocIdMultiValueIn move |old_doc_addr| { let ff_reader = &self.multi_value_length_readers[old_doc_addr.segment_ord as usize]; - offset += ff_reader.get_len(old_doc_addr.doc_id); - offset + offset += ff_reader.num_vals_for_doc(old_doc_addr.doc_id); + offset as u64 }, )), )