move multivalue index to own file

start_doc parameter in positions to docids
This commit is contained in:
Pascal Seitz
2022-10-31 17:55:59 +08:00
parent 4e46f4f8c4
commit 83325d8f3f
9 changed files with 188 additions and 165 deletions

View File

@@ -240,6 +240,7 @@ where
// and we do not have any specialized implementation anyway.
}
/// Wraps anything that returns an iterator into a `Column`.
pub struct IterColumn<T>(T);
impl<T> From<T> for IterColumn<T>

View File

@@ -41,7 +41,7 @@ mod serialize;
use self::bitpacked::BitpackedCodec;
use self::blockwise_linear::BlockwiseLinearCodec;
pub use self::column::{monotonic_map_column, Column, VecColumn};
pub use self::column::{monotonic_map_column, Column, IterColumn, VecColumn};
use self::linear::LinearCodec;
pub use self::monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn};
pub use self::monotonic_mapping_u128::MonotonicallyMappableToU128;

View File

@@ -1,10 +1,9 @@
use std::ops::Range;
use std::sync::Arc;
use fastfield_codecs::Column;
use crate::directory::{FileSlice, OwnedBytes};
use crate::fastfield::MultiValueLength;
use crate::fastfield::MultiValueIndex;
use crate::DocId;
/// Reader for byte array fast fields
@@ -19,7 +18,7 @@ use crate::DocId;
/// and the start index for the next document, and keeping the bytes in between.
#[derive(Clone)]
pub struct BytesFastFieldReader {
idx_reader: Arc<dyn Column<u64>>,
idx_reader: MultiValueIndex,
values: OwnedBytes,
}
@@ -29,24 +28,26 @@ impl BytesFastFieldReader {
values_file: FileSlice,
) -> crate::Result<BytesFastFieldReader> {
let values = values_file.read_bytes()?;
Ok(BytesFastFieldReader { idx_reader, values })
Ok(BytesFastFieldReader {
idx_reader: MultiValueIndex::new(idx_reader),
values,
})
}
fn range(&self, doc: DocId) -> Range<u32> {
let start = self.idx_reader.get_val(doc) as u32;
let end = self.idx_reader.get_val(doc + 1) as u32;
start..end
/// returns the multivalue index
pub fn get_index_reader(&self) -> &MultiValueIndex {
&self.idx_reader
}
/// Returns the bytes associated with the given `doc`
pub fn get_bytes(&self, doc: DocId) -> &[u8] {
let range = self.range(doc);
let range = self.idx_reader.range(doc);
&self.values.as_slice()[range.start as usize..range.end as usize]
}
/// Returns the length of the bytes associated with the given `doc`
pub fn num_bytes(&self, doc: DocId) -> u64 {
let range = self.range(doc);
let range = self.idx_reader.range(doc);
(range.end - range.start) as u64
}
@@ -55,15 +56,3 @@ impl BytesFastFieldReader {
self.values.len() as u64
}
}
impl MultiValueLength for BytesFastFieldReader {
fn get_range(&self, doc_id: DocId) -> std::ops::Range<u32> {
self.range(doc_id)
}
fn get_len(&self, doc_id: DocId) -> u64 {
self.num_bytes(doc_id)
}
fn get_total_len(&self) -> u64 {
self.total_num_bytes()
}
}

View File

@@ -27,8 +27,8 @@ pub use self::error::{FastFieldNotAvailableError, Result};
pub use self::facet_reader::FacetReader;
pub(crate) use self::multivalued::{get_fastfield_codecs_for_multivalue, MultivalueStartIndex};
pub use self::multivalued::{
MultiValueU128FastFieldWriter, MultiValuedFastFieldReader, MultiValuedFastFieldWriter,
MultiValuedU128FastFieldReader,
MultiValueIndex, MultiValueU128FastFieldWriter, MultiValuedFastFieldReader,
MultiValuedFastFieldWriter, MultiValuedU128FastFieldReader,
};
pub use self::readers::FastFieldReaders;
pub(crate) use self::readers::{type_and_cardinality, FastType};
@@ -36,7 +36,7 @@ pub use self::serializer::{Column, CompositeFastFieldSerializer};
use self::writer::unexpected_value;
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
use crate::schema::{Type, Value};
use crate::{DateTime, DocId};
use crate::DateTime;
mod alive_bitset;
mod bytes;
@@ -47,17 +47,6 @@ mod readers;
mod serializer;
mod writer;
/// Trait for `BytesFastFieldReader` and `MultiValuedFastFieldReader` to return the length of data
/// for a doc_id
pub trait MultiValueLength {
/// returns the positions for a docid
fn get_range(&self, doc_id: DocId) -> std::ops::Range<u32>;
/// returns the num of values associated with a doc_id
fn get_len(&self, doc_id: DocId) -> u64;
/// returns the sum of num values for all doc_ids
fn get_total_len(&self) -> u64;
}
/// Trait for types that are allowed for fast fields:
/// (u64, i64 and f64, bool, DateTime).
pub trait FastValue:

View File

@@ -0,0 +1,110 @@
use std::ops::Range;
use std::sync::Arc;
use fastfield_codecs::Column;
use crate::DocId;
#[derive(Clone)]
/// Index to resolve value range for given doc_id.
/// Starts at 0.
pub struct MultiValueIndex {
idx: Arc<dyn Column<u64>>,
}
impl MultiValueIndex {
pub(crate) fn new(idx: Arc<dyn Column<u64>>) -> Self {
Self { idx }
}
/// Returns `[start, end)`, such that the values associated with
/// the given document are `start..end`.
#[inline]
pub(crate) fn range(&self, doc: DocId) -> Range<u32> {
let start = self.idx.get_val(doc) as u32;
let end = self.idx.get_val(doc + 1) as u32;
start..end
}
/// returns the num of values associated with a doc_id
pub(crate) fn num_vals_for_doc(&self, doc: DocId) -> u32 {
let range = self.range(doc);
range.end - range.start
}
/// Returns the overall number of values in this field.
#[inline]
pub fn total_num_vals(&self) -> u64 {
self.idx.max_value()
}
/// Returns the number of documents in the index.
#[inline]
pub fn num_docs(&self) -> u32 {
self.idx.num_vals() - 1
}
/// Converts a list of positions of values in a 1:n index to the corresponding list of DocIds.
///
/// Since there is no index for value pos -> docid, but docid -> value pos range, we scan the
/// index.
///
/// Correctness: positions needs to be sorted. idx_reader needs to contain monotonically
/// increasing positions.
///
/// TODO: Instead of a linear scan we can employ a exponential search into binary search to
/// match a docid to its value position.
pub(crate) fn positions_to_docids(&self, docid_start: u32, positions: &[u32]) -> Vec<DocId> {
let mut docs = vec![];
let mut cur_doc = docid_start;
let mut last_doc = None;
for pos in positions {
loop {
let end = self.idx.get_val(cur_doc + 1) as u32;
if end > *pos {
// avoid duplicates
if Some(cur_doc) == last_doc {
break;
}
docs.push(cur_doc);
last_doc = Some(cur_doc);
break;
}
cur_doc += 1;
}
}
docs
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use fastfield_codecs::IterColumn;
use crate::fastfield::MultiValueIndex;
#[test]
fn test_positions_to_docid() {
let offsets = vec![0, 10, 12, 15, 22, 23]; // docid values are [0..10, 10..12, 12..15, etc.]
let column = IterColumn::from(offsets.into_iter());
let index = MultiValueIndex::new(Arc::new(column));
assert_eq!(index.num_docs(), 5);
{
let positions = vec![10u32, 11, 15, 20, 21, 22];
assert_eq!(index.positions_to_docids(0, &positions), vec![1, 3, 4]);
assert_eq!(index.positions_to_docids(1, &positions), vec![1, 3, 4]);
assert_eq!(index.positions_to_docids(0, &[9]), vec![0]);
assert_eq!(index.positions_to_docids(1, &[10]), vec![1]);
assert_eq!(index.positions_to_docids(1, &[11]), vec![1]);
assert_eq!(index.positions_to_docids(2, &[12]), vec![2]);
assert_eq!(index.positions_to_docids(2, &[12, 14]), vec![2]);
assert_eq!(index.positions_to_docids(2, &[12, 14, 15]), vec![2, 3]);
}
}
}

View File

@@ -1,7 +1,9 @@
mod index;
mod reader;
mod writer;
use fastfield_codecs::FastFieldCodecType;
pub use index::MultiValueIndex;
pub use self::reader::{MultiValuedFastFieldReader, MultiValuedU128FastFieldReader};
pub(crate) use self::writer::MultivalueStartIndex;

View File

@@ -3,7 +3,8 @@ use std::sync::Arc;
use fastfield_codecs::{Column, MonotonicallyMappableToU128};
use crate::fastfield::{FastValue, MultiValueLength};
use super::MultiValueIndex;
use crate::fastfield::FastValue;
use crate::DocId;
/// Reader for a multivalued `u64` fast field.
@@ -15,7 +16,7 @@ use crate::DocId;
/// The `idx_reader` associated, for each document, the index of its first value.
#[derive(Clone)]
pub struct MultiValuedFastFieldReader<Item: FastValue> {
idx_reader: Arc<dyn Column<u64>>,
idx_reader: MultiValueIndex,
vals_reader: Arc<dyn Column<Item>>,
}
@@ -25,20 +26,11 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
vals_reader: Arc<dyn Column<Item>>,
) -> MultiValuedFastFieldReader<Item> {
MultiValuedFastFieldReader {
idx_reader,
idx_reader: MultiValueIndex::new(idx_reader),
vals_reader,
}
}
/// Returns `[start, end)`, such that the values associated with
/// the given document are `start..end`.
#[inline]
fn range(&self, doc: DocId) -> Range<u32> {
let start = self.idx_reader.get_val(doc) as u32;
let end = self.idx_reader.get_val(doc + 1) as u32;
start..end
}
/// Returns the array of values associated with the given `doc`.
#[inline]
fn get_vals_for_range(&self, range: Range<u32>, vals: &mut Vec<Item>) {
@@ -51,10 +43,15 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
/// Returns the array of values associated with the given `doc`.
#[inline]
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
let range = self.range(doc);
let range = self.idx_reader.range(doc);
self.get_vals_for_range(range, vals);
}
/// returns the multivalue index
pub fn get_index_reader(&self) -> &MultiValueIndex {
&self.idx_reader
}
/// Returns the minimum value for this fast field.
///
/// The min value does not take in account of possible
@@ -75,28 +72,14 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
/// Returns the number of values associated with the document `DocId`.
#[inline]
pub fn num_vals(&self, doc: DocId) -> usize {
let range = self.range(doc);
(range.end - range.start) as usize
pub fn num_vals(&self, doc: DocId) -> u32 {
self.idx_reader.num_vals_for_doc(doc)
}
/// Returns the overall number of values in this field .
/// Returns the overall number of values in this field.
#[inline]
pub fn total_num_vals(&self) -> u64 {
self.idx_reader.max_value()
}
}
impl<Item: FastValue> MultiValueLength for MultiValuedFastFieldReader<Item> {
fn get_range(&self, doc_id: DocId) -> Range<u32> {
self.range(doc_id)
}
fn get_len(&self, doc_id: DocId) -> u64 {
self.num_vals(doc_id) as u64
}
fn get_total_len(&self) -> u64 {
self.total_num_vals() as u64
self.idx_reader.total_num_vals()
}
}
@@ -109,7 +92,7 @@ impl<Item: FastValue> MultiValueLength for MultiValuedFastFieldReader<Item> {
/// The `idx_reader` associated, for each document, the index of its first value.
#[derive(Clone)]
pub struct MultiValuedU128FastFieldReader<T: MonotonicallyMappableToU128> {
idx_reader: Arc<dyn Column<u64>>,
idx_reader: MultiValueIndex,
vals_reader: Arc<dyn Column<T>>,
}
@@ -119,24 +102,15 @@ impl<T: MonotonicallyMappableToU128> MultiValuedU128FastFieldReader<T> {
vals_reader: Arc<dyn Column<T>>,
) -> MultiValuedU128FastFieldReader<T> {
Self {
idx_reader,
idx_reader: MultiValueIndex::new(idx_reader),
vals_reader,
}
}
/// Returns `[start, end)`, such that the values associated
/// to the given document are `start..end`.
#[inline]
fn range(&self, doc: DocId) -> Range<u32> {
let start = self.idx_reader.get_val(doc) as u32;
let end = self.idx_reader.get_val(doc + 1) as u32;
start..end
}
/// Returns the array of values associated to the given `doc`.
#[inline]
pub fn get_first_val(&self, doc: DocId) -> Option<T> {
let range = self.range(doc);
let range = self.idx_reader.range(doc);
if range.is_empty() {
return None;
}
@@ -152,10 +126,15 @@ impl<T: MonotonicallyMappableToU128> MultiValuedU128FastFieldReader<T> {
.get_range(range.start as u64, &mut vals[..]);
}
/// Returns the index reader
pub fn get_index_reader(&self) -> &MultiValueIndex {
&self.idx_reader
}
/// Returns the array of values associated to the given `doc`.
#[inline]
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<T>) {
let range = self.range(doc);
let range = self.idx_reader.range(doc);
self.get_vals_for_range(range, vals);
}
@@ -165,11 +144,11 @@ impl<T: MonotonicallyMappableToU128> MultiValuedU128FastFieldReader<T> {
value_range: RangeInclusive<T>,
doc_id_range: Range<u32>,
) -> Vec<DocId> {
let mut positions = Vec::new(); // TODO replace
let mut positions = Vec::new();
self.vals_reader
.get_positions_for_value_range(value_range, doc_id_range, &mut positions);
positions_to_docids(&positions, self.idx_reader.as_ref())
self.idx_reader.positions_to_docids(0, &positions)
}
/// Iterates over all elements in the fast field
@@ -197,85 +176,23 @@ impl<T: MonotonicallyMappableToU128> MultiValuedU128FastFieldReader<T> {
/// Returns the number of values associated with the document `DocId`.
#[inline]
pub fn num_vals(&self, doc: DocId) -> usize {
let range = self.range(doc);
(range.end - range.start) as usize
pub fn num_vals(&self, doc: DocId) -> u32 {
self.idx_reader.num_vals_for_doc(doc)
}
/// Returns the overall number of values in this field.
#[inline]
pub fn total_num_vals(&self) -> u64 {
self.idx_reader.max_value()
self.idx_reader.total_num_vals()
}
}
impl<T: MonotonicallyMappableToU128> MultiValueLength for MultiValuedU128FastFieldReader<T> {
fn get_range(&self, doc_id: DocId) -> std::ops::Range<u32> {
self.range(doc_id)
}
fn get_len(&self, doc_id: DocId) -> u64 {
self.num_vals(doc_id) as u64
}
fn get_total_len(&self) -> u64 {
self.total_num_vals() as u64
}
}
/// Converts a list of positions of values in a 1:n index to the corresponding list of DocIds.
///
/// Since there is no index for value pos -> docid, but docid -> value pos range, we scan the index.
///
/// Correctness: positions needs to be sorted. idx_reader needs to contain monotonically increasing
/// positions.
///
/// TODO: Instead of a linear scan we can employ a expotential search into binary search to match a
/// docid to its value position.
fn positions_to_docids<C: Column + ?Sized>(positions: &[u32], idx_reader: &C) -> Vec<DocId> {
let mut docs = vec![];
let mut cur_doc = 0u32;
let mut last_doc = None;
for pos in positions {
loop {
let end = idx_reader.get_val(cur_doc + 1) as u32;
if end > *pos {
// avoid duplicates
if Some(cur_doc) == last_doc {
break;
}
docs.push(cur_doc);
last_doc = Some(cur_doc);
break;
}
cur_doc += 1;
}
}
docs
}
#[cfg(test)]
mod tests {
use fastfield_codecs::VecColumn;
use crate::core::Index;
use crate::fastfield::multivalued::reader::positions_to_docids;
use crate::schema::{Cardinality, Facet, FacetOptions, NumericOptions, Schema};
#[test]
fn test_positions_to_docid() {
let positions = vec![10u32, 11, 15, 20, 21, 22];
let offsets = vec![0, 10, 12, 15, 22, 23];
{
let column = VecColumn::from(&offsets);
let docids = positions_to_docids(&positions, &column);
assert_eq!(docids, vec![1, 3, 4]);
}
}
#[test]
fn test_multifastfield_reader() -> crate::Result<()> {
let mut schema_builder = Schema::builder();

View File

@@ -13,7 +13,7 @@ use crate::docset::{DocSet, TERMINATED};
use crate::error::DataCorruption;
use crate::fastfield::{
get_fastfield_codecs_for_multivalue, AliveBitSet, Column, CompositeFastFieldSerializer,
MultiValueLength, MultiValuedFastFieldReader, MultiValuedU128FastFieldReader,
MultiValueIndex, MultiValuedFastFieldReader, MultiValuedU128FastFieldReader,
};
use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter};
use crate::indexer::doc_id_mapping::{expect_field_id_for_sort_field, SegmentDocIdMapping};
@@ -348,7 +348,12 @@ impl IndexMerger {
field,
fast_field_serializer,
doc_id_mapping,
&segment_and_ff_readers,
&segment_and_ff_readers
.iter()
.map(|(segment_reader, u64s_reader)| {
(*segment_reader, u64s_reader.get_index_reader())
})
.collect::<Vec<_>>(),
)?;
let fast_field_readers = segment_and_ff_readers
@@ -529,11 +534,11 @@ impl IndexMerger {
// Creating the index file to point into the data, generic over `BytesFastFieldReader` and
// `MultiValuedFastFieldReader`
//
fn write_1_n_fast_field_idx_generic<T: MultiValueLength + Send + Sync>(
fn write_1_n_fast_field_idx_generic(
field: Field,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &SegmentDocIdMapping,
segment_and_ff_readers: &[(&SegmentReader, T)],
segment_and_ff_readers: &[(&SegmentReader, &MultiValueIndex)],
) -> crate::Result<()> {
let column =
RemappedDocIdMultiValueIndexColumn::new(segment_and_ff_readers, doc_id_mapping);
@@ -567,7 +572,12 @@ impl IndexMerger {
field,
fast_field_serializer,
doc_id_mapping,
&segment_and_ff_readers,
&segment_and_ff_readers
.iter()
.map(|(segment_reader, u64s_reader)| {
(*segment_reader, u64s_reader.get_index_reader())
})
.collect::<Vec<_>>(),
)
}
@@ -697,7 +707,12 @@ impl IndexMerger {
field,
fast_field_serializer,
doc_id_mapping,
&segment_and_ff_readers,
&segment_and_ff_readers
.iter()
.map(|(segment_reader, u64s_reader)| {
(*segment_reader, u64s_reader.get_index_reader())
})
.collect::<Vec<_>>(),
)?;
let mut serialize_vals = fast_field_serializer.new_bytes_fast_field(field);

View File

@@ -3,7 +3,7 @@ use std::cmp;
use fastfield_codecs::Column;
use super::flat_map_with_buffer::FlatMapWithBufferIter;
use crate::fastfield::{MultiValueLength, MultiValuedFastFieldReader};
use crate::fastfield::{MultiValueIndex, MultiValuedFastFieldReader};
use crate::indexer::doc_id_mapping::SegmentDocIdMapping;
use crate::schema::Field;
use crate::{DocAddress, SegmentReader};
@@ -94,17 +94,17 @@ impl<'a> Column for RemappedDocIdMultiValueColumn<'a> {
}
}
pub(crate) struct RemappedDocIdMultiValueIndexColumn<'a, T: MultiValueLength> {
pub(crate) struct RemappedDocIdMultiValueIndexColumn<'a> {
doc_id_mapping: &'a SegmentDocIdMapping,
multi_value_length_readers: Vec<&'a T>,
multi_value_length_readers: Vec<&'a MultiValueIndex>,
min_value: u64,
max_value: u64,
num_vals: u32,
}
impl<'a, T: MultiValueLength> RemappedDocIdMultiValueIndexColumn<'a, T> {
impl<'a> RemappedDocIdMultiValueIndexColumn<'a> {
pub(crate) fn new(
segment_and_ff_readers: &'a [(&'a SegmentReader, T)],
segment_and_ff_readers: &'a [(&'a SegmentReader, &'a MultiValueIndex)],
doc_id_mapping: &'a SegmentDocIdMapping,
) -> Self {
// We go through a complete first pass to compute the minimum and the
@@ -115,12 +115,12 @@ impl<'a, T: MultiValueLength> RemappedDocIdMultiValueIndexColumn<'a, T> {
let mut multi_value_length_readers = Vec::with_capacity(segment_and_ff_readers.len());
for segment_and_ff_reader in segment_and_ff_readers {
let segment_reader = segment_and_ff_reader.0;
let multi_value_length_reader = &segment_and_ff_reader.1;
let multi_value_length_reader = segment_and_ff_reader.1;
if !segment_reader.has_deletes() {
max_value += multi_value_length_reader.get_total_len();
max_value += multi_value_length_reader.total_num_vals();
} else {
for doc in segment_reader.doc_ids_alive() {
max_value += multi_value_length_reader.get_len(doc);
max_value += multi_value_length_reader.num_vals_for_doc(doc) as u64;
}
}
num_vals += segment_reader.num_docs();
@@ -136,7 +136,7 @@ impl<'a, T: MultiValueLength> RemappedDocIdMultiValueIndexColumn<'a, T> {
}
}
impl<'a, T: MultiValueLength + Send + Sync> Column for RemappedDocIdMultiValueIndexColumn<'a, T> {
impl<'a> Column for RemappedDocIdMultiValueIndexColumn<'a> {
fn get_val(&self, _pos: u32) -> u64 {
unimplemented!()
}
@@ -148,8 +148,8 @@ impl<'a, T: MultiValueLength + Send + Sync> Column for RemappedDocIdMultiValueIn
move |old_doc_addr| {
let ff_reader =
&self.multi_value_length_readers[old_doc_addr.segment_ord as usize];
offset += ff_reader.get_len(old_doc_addr.doc_id);
offset
offset += ff_reader.num_vals_for_doc(old_doc_addr.doc_id);
offset as u64
},
)),
)