mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-03 09:00:42 +00:00
Merge pull request #1656 from quickwit-oss/multival_offset_index
move multivalue index to own file
This commit is contained in:
@@ -240,6 +240,7 @@ where
|
||||
// and we do not have any specialized implementation anyway.
|
||||
}
|
||||
|
||||
/// Wraps an iterator into a `Column`.
|
||||
pub struct IterColumn<T>(T);
|
||||
|
||||
impl<T> From<T> for IterColumn<T>
|
||||
|
||||
@@ -41,7 +41,7 @@ mod serialize;
|
||||
|
||||
use self::bitpacked::BitpackedCodec;
|
||||
use self::blockwise_linear::BlockwiseLinearCodec;
|
||||
pub use self::column::{monotonic_map_column, Column, VecColumn};
|
||||
pub use self::column::{monotonic_map_column, Column, IterColumn, VecColumn};
|
||||
use self::linear::LinearCodec;
|
||||
pub use self::monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn};
|
||||
pub use self::monotonic_mapping_u128::MonotonicallyMappableToU128;
|
||||
|
||||
@@ -1,10 +1,9 @@
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
|
||||
use fastfield_codecs::Column;
|
||||
|
||||
use crate::directory::{FileSlice, OwnedBytes};
|
||||
use crate::fastfield::MultiValueLength;
|
||||
use crate::fastfield::MultiValueIndex;
|
||||
use crate::DocId;
|
||||
|
||||
/// Reader for byte array fast fields
|
||||
@@ -19,7 +18,7 @@ use crate::DocId;
|
||||
/// and the start index for the next document, and keeping the bytes in between.
|
||||
#[derive(Clone)]
|
||||
pub struct BytesFastFieldReader {
|
||||
idx_reader: Arc<dyn Column<u64>>,
|
||||
idx_reader: MultiValueIndex,
|
||||
values: OwnedBytes,
|
||||
}
|
||||
|
||||
@@ -29,24 +28,26 @@ impl BytesFastFieldReader {
|
||||
values_file: FileSlice,
|
||||
) -> crate::Result<BytesFastFieldReader> {
|
||||
let values = values_file.read_bytes()?;
|
||||
Ok(BytesFastFieldReader { idx_reader, values })
|
||||
Ok(BytesFastFieldReader {
|
||||
idx_reader: MultiValueIndex::new(idx_reader),
|
||||
values,
|
||||
})
|
||||
}
|
||||
|
||||
fn range(&self, doc: DocId) -> Range<u32> {
|
||||
let start = self.idx_reader.get_val(doc) as u32;
|
||||
let end = self.idx_reader.get_val(doc + 1) as u32;
|
||||
start..end
|
||||
/// returns the multivalue index
|
||||
pub fn get_index_reader(&self) -> &MultiValueIndex {
|
||||
&self.idx_reader
|
||||
}
|
||||
|
||||
/// Returns the bytes associated with the given `doc`
|
||||
pub fn get_bytes(&self, doc: DocId) -> &[u8] {
|
||||
let range = self.range(doc);
|
||||
let range = self.idx_reader.range(doc);
|
||||
&self.values.as_slice()[range.start as usize..range.end as usize]
|
||||
}
|
||||
|
||||
/// Returns the length of the bytes associated with the given `doc`
|
||||
pub fn num_bytes(&self, doc: DocId) -> u64 {
|
||||
let range = self.range(doc);
|
||||
let range = self.idx_reader.range(doc);
|
||||
(range.end - range.start) as u64
|
||||
}
|
||||
|
||||
@@ -55,15 +56,3 @@ impl BytesFastFieldReader {
|
||||
self.values.len() as u64
|
||||
}
|
||||
}
|
||||
|
||||
impl MultiValueLength for BytesFastFieldReader {
|
||||
fn get_range(&self, doc_id: DocId) -> std::ops::Range<u32> {
|
||||
self.range(doc_id)
|
||||
}
|
||||
fn get_len(&self, doc_id: DocId) -> u64 {
|
||||
self.num_bytes(doc_id)
|
||||
}
|
||||
fn get_total_len(&self) -> u64 {
|
||||
self.total_num_bytes()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,8 +27,8 @@ pub use self::error::{FastFieldNotAvailableError, Result};
|
||||
pub use self::facet_reader::FacetReader;
|
||||
pub(crate) use self::multivalued::{get_fastfield_codecs_for_multivalue, MultivalueStartIndex};
|
||||
pub use self::multivalued::{
|
||||
MultiValueU128FastFieldWriter, MultiValuedFastFieldReader, MultiValuedFastFieldWriter,
|
||||
MultiValuedU128FastFieldReader,
|
||||
MultiValueIndex, MultiValueU128FastFieldWriter, MultiValuedFastFieldReader,
|
||||
MultiValuedFastFieldWriter, MultiValuedU128FastFieldReader,
|
||||
};
|
||||
pub use self::readers::FastFieldReaders;
|
||||
pub(crate) use self::readers::{type_and_cardinality, FastType};
|
||||
@@ -36,7 +36,7 @@ pub use self::serializer::{Column, CompositeFastFieldSerializer};
|
||||
use self::writer::unexpected_value;
|
||||
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
|
||||
use crate::schema::{Type, Value};
|
||||
use crate::{DateTime, DocId};
|
||||
use crate::DateTime;
|
||||
|
||||
mod alive_bitset;
|
||||
mod bytes;
|
||||
@@ -47,17 +47,6 @@ mod readers;
|
||||
mod serializer;
|
||||
mod writer;
|
||||
|
||||
/// Trait for `BytesFastFieldReader` and `MultiValuedFastFieldReader` to return the length of data
|
||||
/// for a doc_id
|
||||
pub trait MultiValueLength {
|
||||
/// returns the positions for a docid
|
||||
fn get_range(&self, doc_id: DocId) -> std::ops::Range<u32>;
|
||||
/// returns the num of values associated with a doc_id
|
||||
fn get_len(&self, doc_id: DocId) -> u64;
|
||||
/// returns the sum of num values for all doc_ids
|
||||
fn get_total_len(&self) -> u64;
|
||||
}
|
||||
|
||||
/// Trait for types that are allowed for fast fields:
|
||||
/// (u64, i64 and f64, bool, DateTime).
|
||||
pub trait FastValue:
|
||||
|
||||
110
src/fastfield/multivalued/index.rs
Normal file
110
src/fastfield/multivalued/index.rs
Normal file
@@ -0,0 +1,110 @@
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
|
||||
use fastfield_codecs::Column;
|
||||
|
||||
use crate::DocId;
|
||||
|
||||
#[derive(Clone)]
|
||||
/// Index to resolve value range for given doc_id.
|
||||
/// Starts at 0.
|
||||
pub struct MultiValueIndex {
|
||||
idx: Arc<dyn Column<u64>>,
|
||||
}
|
||||
|
||||
impl MultiValueIndex {
|
||||
pub(crate) fn new(idx: Arc<dyn Column<u64>>) -> Self {
|
||||
Self { idx }
|
||||
}
|
||||
|
||||
/// Returns `[start, end)`, such that the values associated with
|
||||
/// the given document are `start..end`.
|
||||
#[inline]
|
||||
pub(crate) fn range(&self, doc: DocId) -> Range<u32> {
|
||||
let start = self.idx.get_val(doc) as u32;
|
||||
let end = self.idx.get_val(doc + 1) as u32;
|
||||
start..end
|
||||
}
|
||||
|
||||
/// returns the num of values associated with a doc_id
|
||||
pub(crate) fn num_vals_for_doc(&self, doc: DocId) -> u32 {
|
||||
let range = self.range(doc);
|
||||
range.end - range.start
|
||||
}
|
||||
|
||||
/// Returns the overall number of values in this field.
|
||||
#[inline]
|
||||
pub fn total_num_vals(&self) -> u64 {
|
||||
self.idx.max_value()
|
||||
}
|
||||
|
||||
/// Returns the number of documents in the index.
|
||||
#[inline]
|
||||
pub fn num_docs(&self) -> u32 {
|
||||
self.idx.num_vals() - 1
|
||||
}
|
||||
|
||||
/// Converts a list of positions of values in a 1:n index to the corresponding list of DocIds.
|
||||
///
|
||||
/// Since there is no index for value pos -> docid, but docid -> value pos range, we scan the
|
||||
/// index.
|
||||
///
|
||||
/// Correctness: positions needs to be sorted. idx_reader needs to contain monotonically
|
||||
/// increasing positions.
|
||||
///
|
||||
/// TODO: Instead of a linear scan we can employ a exponential search into binary search to
|
||||
/// match a docid to its value position.
|
||||
pub(crate) fn positions_to_docids(&self, docid_start: u32, positions: &[u32]) -> Vec<DocId> {
|
||||
let mut docs = vec![];
|
||||
let mut cur_doc = docid_start;
|
||||
let mut last_doc = None;
|
||||
|
||||
for pos in positions {
|
||||
loop {
|
||||
let end = self.idx.get_val(cur_doc + 1) as u32;
|
||||
if end > *pos {
|
||||
// avoid duplicates
|
||||
if Some(cur_doc) == last_doc {
|
||||
break;
|
||||
}
|
||||
docs.push(cur_doc);
|
||||
last_doc = Some(cur_doc);
|
||||
break;
|
||||
}
|
||||
cur_doc += 1;
|
||||
}
|
||||
}
|
||||
|
||||
docs
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use fastfield_codecs::IterColumn;
|
||||
|
||||
use crate::fastfield::MultiValueIndex;
|
||||
|
||||
#[test]
|
||||
fn test_positions_to_docid() {
|
||||
let offsets = vec![0, 10, 12, 15, 22, 23]; // docid values are [0..10, 10..12, 12..15, etc.]
|
||||
let column = IterColumn::from(offsets.into_iter());
|
||||
let index = MultiValueIndex::new(Arc::new(column));
|
||||
assert_eq!(index.num_docs(), 5);
|
||||
{
|
||||
let positions = vec![10u32, 11, 15, 20, 21, 22];
|
||||
|
||||
assert_eq!(index.positions_to_docids(0, &positions), vec![1, 3, 4]);
|
||||
assert_eq!(index.positions_to_docids(1, &positions), vec![1, 3, 4]);
|
||||
assert_eq!(index.positions_to_docids(0, &[9]), vec![0]);
|
||||
assert_eq!(index.positions_to_docids(1, &[10]), vec![1]);
|
||||
assert_eq!(index.positions_to_docids(1, &[11]), vec![1]);
|
||||
assert_eq!(index.positions_to_docids(2, &[12]), vec![2]);
|
||||
assert_eq!(index.positions_to_docids(2, &[12, 14]), vec![2]);
|
||||
assert_eq!(index.positions_to_docids(2, &[12, 14, 15]), vec![2, 3]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,7 +1,9 @@
|
||||
mod index;
|
||||
mod reader;
|
||||
mod writer;
|
||||
|
||||
use fastfield_codecs::FastFieldCodecType;
|
||||
pub use index::MultiValueIndex;
|
||||
|
||||
pub use self::reader::{MultiValuedFastFieldReader, MultiValuedU128FastFieldReader};
|
||||
pub(crate) use self::writer::MultivalueStartIndex;
|
||||
|
||||
@@ -3,7 +3,8 @@ use std::sync::Arc;
|
||||
|
||||
use fastfield_codecs::{Column, MonotonicallyMappableToU128};
|
||||
|
||||
use crate::fastfield::{FastValue, MultiValueLength};
|
||||
use super::MultiValueIndex;
|
||||
use crate::fastfield::FastValue;
|
||||
use crate::DocId;
|
||||
|
||||
/// Reader for a multivalued `u64` fast field.
|
||||
@@ -15,7 +16,7 @@ use crate::DocId;
|
||||
/// The `idx_reader` associated, for each document, the index of its first value.
|
||||
#[derive(Clone)]
|
||||
pub struct MultiValuedFastFieldReader<Item: FastValue> {
|
||||
idx_reader: Arc<dyn Column<u64>>,
|
||||
idx_reader: MultiValueIndex,
|
||||
vals_reader: Arc<dyn Column<Item>>,
|
||||
}
|
||||
|
||||
@@ -25,20 +26,11 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
|
||||
vals_reader: Arc<dyn Column<Item>>,
|
||||
) -> MultiValuedFastFieldReader<Item> {
|
||||
MultiValuedFastFieldReader {
|
||||
idx_reader,
|
||||
idx_reader: MultiValueIndex::new(idx_reader),
|
||||
vals_reader,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `[start, end)`, such that the values associated with
|
||||
/// the given document are `start..end`.
|
||||
#[inline]
|
||||
fn range(&self, doc: DocId) -> Range<u32> {
|
||||
let start = self.idx_reader.get_val(doc) as u32;
|
||||
let end = self.idx_reader.get_val(doc + 1) as u32;
|
||||
start..end
|
||||
}
|
||||
|
||||
/// Returns the array of values associated with the given `doc`.
|
||||
#[inline]
|
||||
fn get_vals_for_range(&self, range: Range<u32>, vals: &mut Vec<Item>) {
|
||||
@@ -51,10 +43,15 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
|
||||
/// Returns the array of values associated with the given `doc`.
|
||||
#[inline]
|
||||
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
|
||||
let range = self.range(doc);
|
||||
let range = self.idx_reader.range(doc);
|
||||
self.get_vals_for_range(range, vals);
|
||||
}
|
||||
|
||||
/// returns the multivalue index
|
||||
pub fn get_index_reader(&self) -> &MultiValueIndex {
|
||||
&self.idx_reader
|
||||
}
|
||||
|
||||
/// Returns the minimum value for this fast field.
|
||||
///
|
||||
/// The min value does not take in account of possible
|
||||
@@ -75,28 +72,14 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
|
||||
|
||||
/// Returns the number of values associated with the document `DocId`.
|
||||
#[inline]
|
||||
pub fn num_vals(&self, doc: DocId) -> usize {
|
||||
let range = self.range(doc);
|
||||
(range.end - range.start) as usize
|
||||
pub fn num_vals(&self, doc: DocId) -> u32 {
|
||||
self.idx_reader.num_vals_for_doc(doc)
|
||||
}
|
||||
|
||||
/// Returns the overall number of values in this field .
|
||||
/// Returns the overall number of values in this field.
|
||||
#[inline]
|
||||
pub fn total_num_vals(&self) -> u64 {
|
||||
self.idx_reader.max_value()
|
||||
}
|
||||
}
|
||||
|
||||
impl<Item: FastValue> MultiValueLength for MultiValuedFastFieldReader<Item> {
|
||||
fn get_range(&self, doc_id: DocId) -> Range<u32> {
|
||||
self.range(doc_id)
|
||||
}
|
||||
fn get_len(&self, doc_id: DocId) -> u64 {
|
||||
self.num_vals(doc_id) as u64
|
||||
}
|
||||
|
||||
fn get_total_len(&self) -> u64 {
|
||||
self.total_num_vals() as u64
|
||||
self.idx_reader.total_num_vals()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -109,7 +92,7 @@ impl<Item: FastValue> MultiValueLength for MultiValuedFastFieldReader<Item> {
|
||||
/// The `idx_reader` associated, for each document, the index of its first value.
|
||||
#[derive(Clone)]
|
||||
pub struct MultiValuedU128FastFieldReader<T: MonotonicallyMappableToU128> {
|
||||
idx_reader: Arc<dyn Column<u64>>,
|
||||
idx_reader: MultiValueIndex,
|
||||
vals_reader: Arc<dyn Column<T>>,
|
||||
}
|
||||
|
||||
@@ -119,24 +102,15 @@ impl<T: MonotonicallyMappableToU128> MultiValuedU128FastFieldReader<T> {
|
||||
vals_reader: Arc<dyn Column<T>>,
|
||||
) -> MultiValuedU128FastFieldReader<T> {
|
||||
Self {
|
||||
idx_reader,
|
||||
idx_reader: MultiValueIndex::new(idx_reader),
|
||||
vals_reader,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `[start, end)`, such that the values associated
|
||||
/// to the given document are `start..end`.
|
||||
#[inline]
|
||||
fn range(&self, doc: DocId) -> Range<u32> {
|
||||
let start = self.idx_reader.get_val(doc) as u32;
|
||||
let end = self.idx_reader.get_val(doc + 1) as u32;
|
||||
start..end
|
||||
}
|
||||
|
||||
/// Returns the array of values associated to the given `doc`.
|
||||
#[inline]
|
||||
pub fn get_first_val(&self, doc: DocId) -> Option<T> {
|
||||
let range = self.range(doc);
|
||||
let range = self.idx_reader.range(doc);
|
||||
if range.is_empty() {
|
||||
return None;
|
||||
}
|
||||
@@ -152,10 +126,15 @@ impl<T: MonotonicallyMappableToU128> MultiValuedU128FastFieldReader<T> {
|
||||
.get_range(range.start as u64, &mut vals[..]);
|
||||
}
|
||||
|
||||
/// Returns the index reader
|
||||
pub fn get_index_reader(&self) -> &MultiValueIndex {
|
||||
&self.idx_reader
|
||||
}
|
||||
|
||||
/// Returns the array of values associated to the given `doc`.
|
||||
#[inline]
|
||||
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<T>) {
|
||||
let range = self.range(doc);
|
||||
let range = self.idx_reader.range(doc);
|
||||
self.get_vals_for_range(range, vals);
|
||||
}
|
||||
|
||||
@@ -165,11 +144,11 @@ impl<T: MonotonicallyMappableToU128> MultiValuedU128FastFieldReader<T> {
|
||||
value_range: RangeInclusive<T>,
|
||||
doc_id_range: Range<u32>,
|
||||
) -> Vec<DocId> {
|
||||
let mut positions = Vec::new(); // TODO replace
|
||||
let mut positions = Vec::new();
|
||||
self.vals_reader
|
||||
.get_positions_for_value_range(value_range, doc_id_range, &mut positions);
|
||||
|
||||
positions_to_docids(&positions, self.idx_reader.as_ref())
|
||||
self.idx_reader.positions_to_docids(0, &positions)
|
||||
}
|
||||
|
||||
/// Iterates over all elements in the fast field
|
||||
@@ -197,85 +176,23 @@ impl<T: MonotonicallyMappableToU128> MultiValuedU128FastFieldReader<T> {
|
||||
|
||||
/// Returns the number of values associated with the document `DocId`.
|
||||
#[inline]
|
||||
pub fn num_vals(&self, doc: DocId) -> usize {
|
||||
let range = self.range(doc);
|
||||
(range.end - range.start) as usize
|
||||
pub fn num_vals(&self, doc: DocId) -> u32 {
|
||||
self.idx_reader.num_vals_for_doc(doc)
|
||||
}
|
||||
|
||||
/// Returns the overall number of values in this field.
|
||||
#[inline]
|
||||
pub fn total_num_vals(&self) -> u64 {
|
||||
self.idx_reader.max_value()
|
||||
self.idx_reader.total_num_vals()
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: MonotonicallyMappableToU128> MultiValueLength for MultiValuedU128FastFieldReader<T> {
|
||||
fn get_range(&self, doc_id: DocId) -> std::ops::Range<u32> {
|
||||
self.range(doc_id)
|
||||
}
|
||||
fn get_len(&self, doc_id: DocId) -> u64 {
|
||||
self.num_vals(doc_id) as u64
|
||||
}
|
||||
fn get_total_len(&self) -> u64 {
|
||||
self.total_num_vals() as u64
|
||||
}
|
||||
}
|
||||
|
||||
/// Converts a list of positions of values in a 1:n index to the corresponding list of DocIds.
|
||||
///
|
||||
/// Since there is no index for value pos -> docid, but docid -> value pos range, we scan the index.
|
||||
///
|
||||
/// Correctness: positions needs to be sorted. idx_reader needs to contain monotonically increasing
|
||||
/// positions.
|
||||
///
|
||||
/// TODO: Instead of a linear scan we can employ a expotential search into binary search to match a
|
||||
/// docid to its value position.
|
||||
fn positions_to_docids<C: Column + ?Sized>(positions: &[u32], idx_reader: &C) -> Vec<DocId> {
|
||||
let mut docs = vec![];
|
||||
let mut cur_doc = 0u32;
|
||||
let mut last_doc = None;
|
||||
|
||||
for pos in positions {
|
||||
loop {
|
||||
let end = idx_reader.get_val(cur_doc + 1) as u32;
|
||||
if end > *pos {
|
||||
// avoid duplicates
|
||||
if Some(cur_doc) == last_doc {
|
||||
break;
|
||||
}
|
||||
docs.push(cur_doc);
|
||||
last_doc = Some(cur_doc);
|
||||
break;
|
||||
}
|
||||
cur_doc += 1;
|
||||
}
|
||||
}
|
||||
|
||||
docs
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use fastfield_codecs::VecColumn;
|
||||
|
||||
use crate::core::Index;
|
||||
use crate::fastfield::multivalued::reader::positions_to_docids;
|
||||
use crate::schema::{Cardinality, Facet, FacetOptions, NumericOptions, Schema};
|
||||
|
||||
#[test]
|
||||
fn test_positions_to_docid() {
|
||||
let positions = vec![10u32, 11, 15, 20, 21, 22];
|
||||
|
||||
let offsets = vec![0, 10, 12, 15, 22, 23];
|
||||
{
|
||||
let column = VecColumn::from(&offsets);
|
||||
|
||||
let docids = positions_to_docids(&positions, &column);
|
||||
assert_eq!(docids, vec![1, 3, 4]);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multifastfield_reader() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
@@ -13,7 +13,7 @@ use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::error::DataCorruption;
|
||||
use crate::fastfield::{
|
||||
get_fastfield_codecs_for_multivalue, AliveBitSet, Column, CompositeFastFieldSerializer,
|
||||
MultiValueLength, MultiValuedFastFieldReader, MultiValuedU128FastFieldReader,
|
||||
MultiValueIndex, MultiValuedFastFieldReader, MultiValuedU128FastFieldReader,
|
||||
};
|
||||
use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter};
|
||||
use crate::indexer::doc_id_mapping::{expect_field_id_for_sort_field, SegmentDocIdMapping};
|
||||
@@ -348,7 +348,12 @@ impl IndexMerger {
|
||||
field,
|
||||
fast_field_serializer,
|
||||
doc_id_mapping,
|
||||
&segment_and_ff_readers,
|
||||
&segment_and_ff_readers
|
||||
.iter()
|
||||
.map(|(segment_reader, u64s_reader)| {
|
||||
(*segment_reader, u64s_reader.get_index_reader())
|
||||
})
|
||||
.collect::<Vec<_>>(),
|
||||
)?;
|
||||
|
||||
let fast_field_readers = segment_and_ff_readers
|
||||
@@ -529,11 +534,11 @@ impl IndexMerger {
|
||||
// Creating the index file to point into the data, generic over `BytesFastFieldReader` and
|
||||
// `MultiValuedFastFieldReader`
|
||||
//
|
||||
fn write_1_n_fast_field_idx_generic<T: MultiValueLength + Send + Sync>(
|
||||
fn write_1_n_fast_field_idx_generic(
|
||||
field: Field,
|
||||
fast_field_serializer: &mut CompositeFastFieldSerializer,
|
||||
doc_id_mapping: &SegmentDocIdMapping,
|
||||
segment_and_ff_readers: &[(&SegmentReader, T)],
|
||||
segment_and_ff_readers: &[(&SegmentReader, &MultiValueIndex)],
|
||||
) -> crate::Result<()> {
|
||||
let column =
|
||||
RemappedDocIdMultiValueIndexColumn::new(segment_and_ff_readers, doc_id_mapping);
|
||||
@@ -567,7 +572,12 @@ impl IndexMerger {
|
||||
field,
|
||||
fast_field_serializer,
|
||||
doc_id_mapping,
|
||||
&segment_and_ff_readers,
|
||||
&segment_and_ff_readers
|
||||
.iter()
|
||||
.map(|(segment_reader, u64s_reader)| {
|
||||
(*segment_reader, u64s_reader.get_index_reader())
|
||||
})
|
||||
.collect::<Vec<_>>(),
|
||||
)
|
||||
}
|
||||
|
||||
@@ -697,7 +707,12 @@ impl IndexMerger {
|
||||
field,
|
||||
fast_field_serializer,
|
||||
doc_id_mapping,
|
||||
&segment_and_ff_readers,
|
||||
&segment_and_ff_readers
|
||||
.iter()
|
||||
.map(|(segment_reader, u64s_reader)| {
|
||||
(*segment_reader, u64s_reader.get_index_reader())
|
||||
})
|
||||
.collect::<Vec<_>>(),
|
||||
)?;
|
||||
|
||||
let mut serialize_vals = fast_field_serializer.new_bytes_fast_field(field);
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::cmp;
|
||||
use fastfield_codecs::Column;
|
||||
|
||||
use super::flat_map_with_buffer::FlatMapWithBufferIter;
|
||||
use crate::fastfield::{MultiValueLength, MultiValuedFastFieldReader};
|
||||
use crate::fastfield::{MultiValueIndex, MultiValuedFastFieldReader};
|
||||
use crate::indexer::doc_id_mapping::SegmentDocIdMapping;
|
||||
use crate::schema::Field;
|
||||
use crate::{DocAddress, SegmentReader};
|
||||
@@ -94,17 +94,17 @@ impl<'a> Column for RemappedDocIdMultiValueColumn<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct RemappedDocIdMultiValueIndexColumn<'a, T: MultiValueLength> {
|
||||
pub(crate) struct RemappedDocIdMultiValueIndexColumn<'a> {
|
||||
doc_id_mapping: &'a SegmentDocIdMapping,
|
||||
multi_value_length_readers: Vec<&'a T>,
|
||||
multi_value_length_readers: Vec<&'a MultiValueIndex>,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
num_vals: u32,
|
||||
}
|
||||
|
||||
impl<'a, T: MultiValueLength> RemappedDocIdMultiValueIndexColumn<'a, T> {
|
||||
impl<'a> RemappedDocIdMultiValueIndexColumn<'a> {
|
||||
pub(crate) fn new(
|
||||
segment_and_ff_readers: &'a [(&'a SegmentReader, T)],
|
||||
segment_and_ff_readers: &'a [(&'a SegmentReader, &'a MultiValueIndex)],
|
||||
doc_id_mapping: &'a SegmentDocIdMapping,
|
||||
) -> Self {
|
||||
// We go through a complete first pass to compute the minimum and the
|
||||
@@ -115,12 +115,12 @@ impl<'a, T: MultiValueLength> RemappedDocIdMultiValueIndexColumn<'a, T> {
|
||||
let mut multi_value_length_readers = Vec::with_capacity(segment_and_ff_readers.len());
|
||||
for segment_and_ff_reader in segment_and_ff_readers {
|
||||
let segment_reader = segment_and_ff_reader.0;
|
||||
let multi_value_length_reader = &segment_and_ff_reader.1;
|
||||
let multi_value_length_reader = segment_and_ff_reader.1;
|
||||
if !segment_reader.has_deletes() {
|
||||
max_value += multi_value_length_reader.get_total_len();
|
||||
max_value += multi_value_length_reader.total_num_vals();
|
||||
} else {
|
||||
for doc in segment_reader.doc_ids_alive() {
|
||||
max_value += multi_value_length_reader.get_len(doc);
|
||||
max_value += multi_value_length_reader.num_vals_for_doc(doc) as u64;
|
||||
}
|
||||
}
|
||||
num_vals += segment_reader.num_docs();
|
||||
@@ -136,7 +136,7 @@ impl<'a, T: MultiValueLength> RemappedDocIdMultiValueIndexColumn<'a, T> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T: MultiValueLength + Send + Sync> Column for RemappedDocIdMultiValueIndexColumn<'a, T> {
|
||||
impl<'a> Column for RemappedDocIdMultiValueIndexColumn<'a> {
|
||||
fn get_val(&self, _pos: u32) -> u64 {
|
||||
unimplemented!()
|
||||
}
|
||||
@@ -148,8 +148,8 @@ impl<'a, T: MultiValueLength + Send + Sync> Column for RemappedDocIdMultiValueIn
|
||||
move |old_doc_addr| {
|
||||
let ff_reader =
|
||||
&self.multi_value_length_readers[old_doc_addr.segment_ord as usize];
|
||||
offset += ff_reader.get_len(old_doc_addr.doc_id);
|
||||
offset
|
||||
offset += ff_reader.num_vals_for_doc(old_doc_addr.doc_id);
|
||||
offset as u64
|
||||
},
|
||||
)),
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user