From 88ed3d8b48abba4a466eebf3f56be0870155cd14 Mon Sep 17 00:00:00 2001
From: Paul Masurel
Date: Thu, 2 Feb 2023 12:46:39 +0100
Subject: [PATCH] Switching back to iterable.

---
 columnar/src/column/serialize.rs               | 16 ++--
 .../src/column_index/multivalued_index.rs      |  2 +-
 columnar/src/column_values/column.rs           |  2 +-
 columnar/src/column_values/serialize.rs        | 58 +++++------
 .../u64_based/blockwise_linear.rs              |  2 +-
 columnar/src/column_values/u64_based/mod.rs    | 16 ++--
 columnar/src/column_values/u64_based/tests.rs  | 23 +++--
 .../src/columnar/merge/merge_dict_column.rs    | 50 ++++------
 columnar/src/columnar/merge/merge_mapping.rs   |  2 +-
 columnar/src/columnar/merge/mod.rs             |  7 +-
 columnar/src/columnar/merge/tests.rs           |  6 +-
 columnar/src/columnar/writer/mod.rs            |  2 +-
 columnar/src/iterable.rs                       | 52 ----------
 columnar/src/lib.rs                            |  1 -
 src/indexer/doc_id_mapping.rs                  |  5 +
 src/indexer/merger.rs                          | 95 ++-----------------
 16 files changed, 92 insertions(+), 247 deletions(-)

diff --git a/columnar/src/column/serialize.rs b/columnar/src/column/serialize.rs
index cef3e8490..465d319d3 100644
--- a/columnar/src/column/serialize.rs
+++ b/columnar/src/column/serialize.rs
@@ -11,20 +11,16 @@ use crate::column_index::{serialize_column_index, SerializableColumnIndex};
 use crate::column_values::serialize::serialize_column_values_u128;
 use crate::column_values::u64_based::{serialize_u64_based_column_values, CodecType};
 use crate::column_values::{MonotonicallyMappableToU128, MonotonicallyMappableToU64};
-use crate::iterable::{map_iterable, Iterable};
+use crate::iterable::Iterable;
 
-pub fn serialize_column_mappable_to_u128<T: MonotonicallyMappableToU128, I>(
+pub fn serialize_column_mappable_to_u128<T: MonotonicallyMappableToU128>(
     column_index: SerializableColumnIndex<'_>,
-    iterable: &dyn Fn() -> I,
+    iterable: &dyn Iterable<T>,
     num_vals: u32,
     output: &mut impl Write,
-) -> io::Result<()>
-where
-    I: Iterator<Item = T>,
-{
+) -> io::Result<()> {
     let column_index_num_bytes = serialize_column_index(column_index, output)?;
-    let u128_iterable = map_iterable(iterable, MonotonicallyMappableToU128::to_u128);
-    serialize_column_values_u128(&u128_iterable, num_vals, output)?;
+    serialize_column_values_u128(iterable, num_vals, output)?;
     output.write_all(&column_index_num_bytes.to_le_bytes())?;
     Ok(())
 }
@@ -36,7 +32,7 @@ pub fn serialize_column_mappable_to_u64(
 ) -> io::Result<()> {
     let column_index_num_bytes = serialize_column_index(column_index, output)?;
     serialize_u64_based_column_values(
-        || column_values.boxed_iter(),
+        column_values,
         &[CodecType::Bitpacked, CodecType::BlockwiseLinear],
         output,
     )?;
diff --git a/columnar/src/column_index/multivalued_index.rs b/columnar/src/column_index/multivalued_index.rs
index bfff83960..801fc05fd 100644
--- a/columnar/src/column_index/multivalued_index.rs
+++ b/columnar/src/column_index/multivalued_index.rs
@@ -15,7 +15,7 @@ pub fn serialize_multivalued_index(
     output: &mut impl Write,
 ) -> io::Result<()> {
     crate::column_values::u64_based::serialize_u64_based_column_values(
-        || multivalued_index.boxed_iter(),
+        multivalued_index,
         &[CodecType::Bitpacked, CodecType::Linear],
         output,
     )?;
diff --git a/columnar/src/column_values/column.rs b/columnar/src/column_values/column.rs
index 1a742436d..8fbe43f5f 100644
--- a/columnar/src/column_values/column.rs
+++ b/columnar/src/column_values/column.rs
@@ -80,7 +80,7 @@ pub trait ColumnValues: Send + Sync {
     }
 }
 
-impl<'a, T: Ord> Iterable<T> for &'a [Arc<dyn ColumnValues<T>>] {
+impl<'a, T: PartialOrd> Iterable<T> for &'a [Arc<dyn ColumnValues<T>>] {
     fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_> {
         Box::new(self.iter().flat_map(|column_value| column_value.iter()))
     }
diff --git a/columnar/src/column_values/serialize.rs b/columnar/src/column_values/serialize.rs
index e30eede1c..0b41d475b 100644
--- a/columnar/src/column_values/serialize.rs
+++ b/columnar/src/column_values/serialize.rs
@@ -1,21 +1,12 @@
 use std::fmt::Debug;
 use std::io;
-use std::num::NonZeroU64;
 
 use common::{BinarySerializable, VInt};
-use log::warn;
 
-use super::monotonic_mapping::{
-    StrictlyMonotonicFn, StrictlyMonotonicMappingToInternal,
-    StrictlyMonotonicMappingToInternalGCDBaseval,
-};
-use super::{
-    monotonic_map_column, u64_based, ColumnValues, MonotonicallyMappableToU64,
-    U128FastFieldCodecType,
-};
 use crate::column_values::compact_space::CompactSpaceCompressor;
-use crate::column_values::u64_based::CodecType;
+use crate::column_values::U128FastFieldCodecType;
 use crate::iterable::Iterable;
+use crate::MonotonicallyMappableToU128;
 
 /// The normalized header gives some parameters after applying the following
 /// normalization of the vector:
@@ -53,19 +44,9 @@ impl BinarySerializable for U128Header {
     }
 }
 
-fn normalize_column<C: ColumnValues>(
-    from_column: C,
-    min_value: u64,
-    gcd: Option<NonZeroU64>,
-) -> impl ColumnValues {
-    let gcd = gcd.map(|gcd| gcd.get()).unwrap_or(1);
-    let mapping = StrictlyMonotonicMappingToInternalGCDBaseval::new(gcd, min_value);
-    monotonic_map_column(from_column, mapping)
-}
-
 /// Serializes u128 values with the compact space codec.
-pub fn serialize_column_values_u128<I: Iterator<Item = u128>>(
-    iterable: &dyn Fn() -> I,
+pub fn serialize_column_values_u128<T: MonotonicallyMappableToU128>(
+    iterable: &dyn Iterable<T>,
     num_vals: u32,
     output: &mut impl io::Write,
 ) -> io::Result<()> {
@@ -74,9 +55,18 @@ pub fn serialize_column_values_u128<I: Iterator<Item = u128>>(
         codec_type: U128FastFieldCodecType::CompactSpace,
     };
     header.serialize(output)?;
-    let compressor = CompactSpaceCompressor::train_from(iterable(), num_vals);
-    compressor.compress_into(iterable(), output)?;
-
+    let compressor = CompactSpaceCompressor::train_from(
+        iterable
+            .boxed_iter()
+            .map(MonotonicallyMappableToU128::to_u128),
+        num_vals,
+    );
+    compressor.compress_into(
+        iterable
+            .boxed_iter()
+            .map(MonotonicallyMappableToU128::to_u128),
+        output,
+    )?;
     Ok(())
 }
 
@@ -113,8 +103,8 @@ pub mod tests {
     #[test]
     fn test_fastfield_bool_size_bitwidth_1() {
         let mut buffer = Vec::new();
-        serialize_u64_based_column_values(
-            || [false, true].into_iter(),
+        serialize_u64_based_column_values::<bool>(
+            &&[false, true][..],
             &ALL_U64_CODEC_TYPES,
             &mut buffer,
         )
@@ -127,8 +117,8 @@ pub mod tests {
    #[test]
    fn test_fastfield_bool_bit_size_bitwidth_0() {
        let mut buffer = Vec::new();
-        serialize_u64_based_column_values(
-            || [false, true].into_iter(),
+        serialize_u64_based_column_values::<bool>(
+            &&[false, true][..],
             &ALL_U64_CODEC_TYPES,
             &mut buffer,
         )
@@ -141,12 +131,8 @@ pub mod tests {
     fn test_fastfield_gcd() {
         let mut buffer = Vec::new();
         let vals: Vec<u64> = (0..80).map(|val| (val % 7) * 1_000u64).collect();
-        serialize_u64_based_column_values(
-            || vals.iter().cloned(),
-            &[CodecType::Bitpacked],
-            &mut buffer,
-        )
-        .unwrap();
+        serialize_u64_based_column_values(&&vals[..], &[CodecType::Bitpacked], &mut buffer)
+            .unwrap();
         // Values are stored over 3 bits.
         assert_eq!(buffer.len(), 6 + (3 * 80 / 8));
     }
diff --git a/columnar/src/column_values/u64_based/blockwise_linear.rs b/columnar/src/column_values/u64_based/blockwise_linear.rs
index 810ebc9cb..4945e3418 100644
--- a/columnar/src/column_values/u64_based/blockwise_linear.rs
+++ b/columnar/src/column_values/u64_based/blockwise_linear.rs
@@ -125,7 +125,7 @@ impl ColumnCodecEstimator for BlockwiseLinearEstimator {
             *buffer_val = gcd_divider.divide(*buffer_val - stats.min_value);
         }
 
-        let mut line = Line::train(&VecColumn::from(&buffer));
+        let line = Line::train(&VecColumn::from(&buffer));
 
         assert!(!buffer.is_empty());
diff --git a/columnar/src/column_values/u64_based/mod.rs b/columnar/src/column_values/u64_based/mod.rs
index 8d58ea6f4..909bffa27 100644
--- a/columnar/src/column_values/u64_based/mod.rs
+++ b/columnar/src/column_values/u64_based/mod.rs
@@ -115,22 +115,18 @@ impl CodecType {
     }
 }
 
-pub fn serialize_u64_based_column_values<T: MonotonicallyMappableToU64, F, I>(
-    vals: F,
+pub fn serialize_u64_based_column_values<'a, T: MonotonicallyMappableToU64>(
+    vals: &dyn Iterable<T>,
     codec_types: &[CodecType],
     wrt: &mut dyn Write,
-) -> io::Result<()>
-where
-    I: Iterator<Item = T>,
-    F: Fn() -> I,
-{
+) -> io::Result<()> {
     let mut stats_collector = StatsCollector::default();
     let mut estimators: Vec<(CodecType, Box<dyn ColumnCodecEstimator>)> =
         Vec::with_capacity(codec_types.len());
     for &codec_type in codec_types {
         estimators.push((codec_type, codec_type.estimator()));
     }
-    for val in vals() {
+    for val in vals.boxed_iter() {
         let val_u64 = val.to_u64();
         stats_collector.collect(val_u64);
         for (_, estimator) in &mut estimators {
@@ -154,7 +150,7 @@ where
     best_codec.to_code().serialize(wrt)?;
     best_codec_estimator.serialize(
         &stats,
-        &mut vals().map(MonotonicallyMappableToU64::to_u64),
+        &mut vals.boxed_iter().map(MonotonicallyMappableToU64::to_u64),
         wrt,
     )?;
     Ok(())
@@ -178,7 +174,7 @@ pub fn serialize_and_load_u64_based_column_values<T: MonotonicallyMappableToU64>(
     codec_types: &[CodecType],
 ) -> Arc<dyn ColumnValues<T>> {
     let mut buffer = Vec::new();
-    serialize_u64_based_column_values(|| vals.boxed_iter(), codec_types, &mut buffer).unwrap();
+    serialize_u64_based_column_values(vals, codec_types, &mut buffer).unwrap();
     load_u64_based_column_values::<T>(OwnedBytes::new(buffer)).unwrap()
 }
diff --git a/columnar/src/column_values/u64_based/tests.rs b/columnar/src/column_values/u64_based/tests.rs
index b82cdf349..b9bea754b 100644
--- a/columnar/src/column_values/u64_based/tests.rs
+++ b/columnar/src/column_values/u64_based/tests.rs
@@ -7,7 +7,7 @@ fn test_serialize_and_load_simple() {
     let mut buffer = Vec::new();
     let vals = &[1u64, 2u64, 5u64];
     serialize_u64_based_column_values(
-        || vals.iter().cloned(),
+        &&vals[..],
         &[CodecType::Bitpacked, CodecType::BlockwiseLinear],
         &mut buffer,
     )
@@ -67,9 +67,7 @@ pub(crate) fn create_and_validate(
         );
         assert_eq!(expected_positions, positions);
     }
-    dbg!(estimation);
-    dbg!(actual_compression);
-    if actual_compression > 20 {
+    if actual_compression > 1000 {
         assert!(relative_difference(estimation, actual_compression) < 0.10f32);
     }
     Some((
@@ -101,12 +99,21 @@ proptest! {
         create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
     }
 
+    #[test]
     fn test_proptest_small_blockwise_linear(data in proptest::collection::vec(num_strategy(), 1..10)) {
         create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
     }
 }
 
+#[test]
+fn test_small_blockwise_linear_example() {
+    create_and_validate::<BlockwiseLinearCodec>(
+        &[9223372036854775808, 9223370937344622593],
+        "proptest multilinearinterpol",
+    );
+}
+
 proptest! {
     #![proptest_config(ProptestConfig::with_cases(10))]
@@ -245,7 +252,7 @@ fn test_fastfield_gcd_i64_with_codec(codec_type: CodecType, num_vals: usize) ->
     let mut vals: Vec<i64> = (-4..=(num_vals as i64) - 5).map(|val| val * 1000).collect();
     let mut buffer: Vec<u8> = Vec::new();
     crate::column_values::serialize_u64_based_column_values(
-        || vals.iter().cloned(),
+        &&vals[..],
         &[codec_type],
         &mut buffer,
     )?;
@@ -262,7 +269,7 @@ fn test_fastfield_gcd_i64_with_codec(codec_type: CodecType, num_vals: usize) ->
     vals.pop();
     vals.push(1001i64);
     crate::column_values::serialize_u64_based_column_values(
-        || vals.iter().cloned(),
+        &&vals[..],
         &[codec_type],
         &mut buffer_without_gcd,
     )?;
@@ -288,7 +295,7 @@ fn test_fastfield_gcd_u64_with_codec(codec_type: CodecType, num_vals: usize) ->
     let mut vals: Vec<u64> = (1..=num_vals).map(|i| i as u64 * 1000u64).collect();
     let mut buffer: Vec<u8> = Vec::new();
     crate::column_values::serialize_u64_based_column_values(
-        || vals.iter().cloned(),
+        &&vals[..],
         &[codec_type],
         &mut buffer,
     )?;
@@ -305,7 +312,7 @@ fn test_fastfield_gcd_u64_with_codec(codec_type: CodecType, num_vals: usize) ->
     vals.pop();
     vals.push(1001u64);
     crate::column_values::serialize_u64_based_column_values(
-        || vals.iter().cloned(),
+        &&vals[..],
         &[codec_type],
         &mut buffer_without_gcd,
     )?;
diff --git a/columnar/src/columnar/merge/merge_dict_column.rs b/columnar/src/columnar/merge/merge_dict_column.rs
index 4da10542e..9e6d32451 100644
--- a/columnar/src/columnar/merge/merge_dict_column.rs
+++ b/columnar/src/columnar/merge/merge_dict_column.rs
@@ -1,12 +1,12 @@
 use std::io::{self, Write};
 
 use common::CountingWriter;
-use itertools::Itertools;
 use sstable::{SSTable, TermOrdinal};
 
 use super::term_merger::TermMerger;
-use crate::column_index::{serialize_column_index, SerializableColumnIndex};
-use crate::column_values::{serialize_u64_based_column_values, CodecType};
+use crate::column::serialize_column_mappable_to_u64;
+use crate::column_index::SerializableColumnIndex;
+use crate::iterable::Iterable;
 use crate::BytesColumn;
 
 // Serialize [Dictionary, Column, dictionary num bytes U32::LE]
@@ -21,45 +21,38 @@ pub fn merge_bytes_or_str_column(
     let term_ord_mapping = serialize_merged_dict(bytes_columns, &mut output)?;
     let dictionary_num_bytes: u32 = output.written_bytes() as u32;
     let output = output.finish();
-
-    serialize_bytes_or_str_column(column_index, bytes_columns, &term_ord_mapping, output)?;
-
+    let remapped_term_ordinals_values = RemappedTermOrdinalsValues {
+        bytes_columns,
+        term_ord_mapping: &term_ord_mapping,
+    };
+    serialize_column_mappable_to_u64(column_index, &remapped_term_ordinals_values, output)?;
+    // serialize_bytes_or_str_column(column_index, bytes_columns, &term_ord_mapping, output)?;
     output.write_all(&dictionary_num_bytes.to_le_bytes())?;
     Ok(())
 }
 
-fn serialize_bytes_or_str_column(
-    column_index: SerializableColumnIndex<'_>,
-    bytes_columns: &[BytesColumn],
-    term_ord_mapping: &TermOrdinalMapping,
-    output: &mut impl Write,
-) -> io::Result<()> {
-    let column_index_num_bytes = serialize_column_index(column_index, output)?;
+struct RemappedTermOrdinalsValues<'a> {
+    bytes_columns: &'a [BytesColumn],
+    term_ord_mapping: &'a TermOrdinalMapping,
+}
 
-    let column_values = move || {
-        let iter = bytes_columns
+impl<'a> Iterable for RemappedTermOrdinalsValues<'a> {
+    fn boxed_iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
+        let iter = self
+            .bytes_columns
             .iter()
             .enumerate()
             .flat_map(|(segment_ord, byte_column)| {
-                let segment_ord = term_ord_mapping.get_segment(segment_ord);
+                let segment_ord = self.term_ord_mapping.get_segment(segment_ord);
                 byte_column
                     .ords()
                     .values
                     .iter()
                     .map(move |term_ord| segment_ord[term_ord as usize])
             });
-        iter
-    };
-
-    serialize_u64_based_column_values(
-        column_values,
-        &[CodecType::Bitpacked, CodecType::BlockwiseLinear],
-        output,
-    )?;
-
-    output.write_all(&column_index_num_bytes.to_le_bytes())?;
-
-    Ok(())
+        // TODO see if we can better decompose the mapping / and the stacking
+        Box::new(iter)
+    }
 }
 
 fn serialize_merged_dict(
@@ -89,7 +82,6 @@ fn serialize_merged_dict(
         current_term_ord += 1;
     }
     sstable_builder.finish()?;
-
     Ok(term_ord_mapping)
 }
diff --git a/columnar/src/columnar/merge/merge_mapping.rs b/columnar/src/columnar/merge/merge_mapping.rs
index 48938266a..b9d2d6ab8 100644
--- a/columnar/src/columnar/merge/merge_mapping.rs
+++ b/columnar/src/columnar/merge/merge_mapping.rs
@@ -9,7 +9,7 @@ pub struct StackMergeOrder {
 }
 
 impl StackMergeOrder {
-    pub fn from_columnars(columnars: &[&ColumnarReader]) -> StackMergeOrder {
+    pub fn stack(columnars: &[&ColumnarReader]) -> StackMergeOrder {
         let mut cumulated_row_ids: Vec<RowId> = Vec::with_capacity(columnars.len());
         let mut cumulated_row_id = 0;
         for columnar in columnars {
diff --git a/columnar/src/columnar/merge/mod.rs b/columnar/src/columnar/merge/mod.rs
index c3d599aa1..da82182fe 100644
--- a/columnar/src/columnar/merge/mod.rs
+++ b/columnar/src/columnar/merge/mod.rs
@@ -13,6 +13,7 @@ pub use merge_mapping::{MergeRowOrder, StackMergeOrder};
 
 use super::writer::ColumnarSerializer;
 use crate::column::{serialize_column_mappable_to_u128, serialize_column_mappable_to_u64};
+use crate::column_index::stack_column_index;
 use crate::columnar::column_type::ColumnTypeCategory;
 use crate::columnar::merge::merge_dict_column::merge_bytes_or_str_column;
 use crate::columnar::writer::CompatibleNumericalTypes;
@@ -98,11 +99,7 @@ pub fn merge_column(
             crate::column_index::stack_column_index(&column_indexes[..], merge_row_order);
         serialize_column_mappable_to_u128(
             merged_column_index,
-            &|| {
-                column_values
-                    .iter()
-                    .flat_map(|column_value| column_value.iter())
-            },
+            &&column_values[..],
             num_values,
             wrt,
         )?;
diff --git a/columnar/src/columnar/merge/tests.rs b/columnar/src/columnar/merge/tests.rs
index 851617b29..48da5f567 100644
--- a/columnar/src/columnar/merge/tests.rs
+++ b/columnar/src/columnar/merge/tests.rs
@@ -142,7 +142,7 @@ fn test_merge_columnar_numbers() {
     )]);
     let mut buffer = Vec::new();
     let columnars = &[&columnar1, &columnar2];
-    let stack_merge_order = StackMergeOrder::from_columnars(columnars);
+    let stack_merge_order = StackMergeOrder::stack(columnars);
     crate::columnar::merge_columnar(
         columnars,
         MergeRowOrder::Stack(stack_merge_order),
@@ -167,7 +167,7 @@ fn test_merge_columnar_texts() {
     let columnar2 = make_text_columnar_multiple_columns(&[("texts", &[&[], &["b"]])]);
     let mut buffer = Vec::new();
     let columnars = &[&columnar1, &columnar2];
-    let stack_merge_order = StackMergeOrder::from_columnars(columnars);
+    let stack_merge_order = StackMergeOrder::stack(columnars);
     crate::columnar::merge_columnar(
         columnars,
         MergeRowOrder::Stack(stack_merge_order),
@@ -211,7 +211,7 @@ fn test_merge_columnar_byte() {
     let columnar2 = make_byte_columnar_multiple_columns(&[("bytes", &[&[], &[b"a"]])]);
     let mut buffer = Vec::new();
     let columnars = &[&columnar1, &columnar2];
-    let stack_merge_order = StackMergeOrder::from_columnars(columnars);
+    let stack_merge_order = StackMergeOrder::stack(columnars);
     crate::columnar::merge_columnar(
         columnars,
         MergeRowOrder::Stack(stack_merge_order),
diff --git a/columnar/src/columnar/writer/mod.rs b/columnar/src/columnar/writer/mod.rs
index ac150b84c..4e9a9f5c0 100644
--- a/columnar/src/columnar/writer/mod.rs
+++ b/columnar/src/columnar/writer/mod.rs
@@ -587,7 +587,7 @@ where
         };
         crate::column::serialize_column_mappable_to_u128(
             serializable_column_index,
-            &|| values.iter().copied(),
+            &&values[..],
             values.len() as u32,
             &mut wrt,
         )?;
diff --git a/columnar/src/iterable.rs b/columnar/src/iterable.rs
index fdc1ce1f4..ec9c88665 100644
--- a/columnar/src/iterable.rs
+++ b/columnar/src/iterable.rs
@@ -1,61 +1,9 @@
-use std::iter::Map;
-use std::marker::PhantomData;
 use std::ops::Range;
 
 pub trait Iterable<T = u64> {
     fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_>;
 }
 
-struct Mapped<Original, Transform, U> {
-    original_iterable: Original,
-    transform: Transform,
-    input_type: PhantomData<U>,
-}
-
-impl<Original, Transform, U, V> Iterable<V> for Mapped<Original, Transform, U>
-where
-    Original: Iterable<U>,
-    Transform: Fn(U) -> V,
-{
-    fn boxed_iter(&self) -> Box<dyn Iterator<Item = V> + '_> {
-        Box::new(self.original_iterable.boxed_iter().map(&self.transform))
-    }
-}
-
-impl<T> Iterable<T> for &dyn Iterable<T> {
-    fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_> {
-        (*self).boxed_iter()
-    }
-}
-
-impl<T, F> Iterable<T> for F
-where F: Fn() -> Box<dyn Iterator<Item = T>>
-{
-    fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_> {
-        self()
-    }
-}
-
-// impl<T, I, F> Iterable<T> for F
-// where
-//     I: Iterator<Item = T>,
-//     F: Fn() -> I,
-//{
-//    fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_> {
-//        Box::new(self())
-//}
-
-pub fn map_iterable<U, V, I, F>(
-    original_iterable: impl Fn() -> I,
-    transform: F,
-) -> impl Fn() -> std::iter::Map<I, F>
-where
-    F: Fn(U) -> V + Clone,
-    I: Iterator<Item = U>,
-{
-    move || original_iterable().map(transform.clone())
-}
-
 impl<'a, T: Copy> Iterable<T> for &'a [T] {
     fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_> {
         Box::new(self.iter().copied())
diff --git a/columnar/src/lib.rs b/columnar/src/lib.rs
index 05482a3da..b6380fd05 100644
--- a/columnar/src/lib.rs
+++ b/columnar/src/lib.rs
@@ -26,7 +26,6 @@ pub use columnar::{
     merge_columnar, ColumnType, ColumnarReader, ColumnarWriter, HasAssociatedColumnType,
     MergeRowOrder, StackMergeOrder,
 };
-pub(crate) use iterable::{map_iterable, Iterable};
 use sstable::VoidSSTable;
 
 pub use value::{NumericalType, NumericalValue};
diff --git a/src/indexer/doc_id_mapping.rs b/src/indexer/doc_id_mapping.rs
index 86e8caf5e..e1d64b036 100644
--- a/src/indexer/doc_id_mapping.rs
+++ b/src/indexer/doc_id_mapping.rs
@@ -37,6 +37,11 @@ impl SegmentDocIdMapping {
     /// This flags means the segments are simply stacked in the order of their ordinal.
     /// e.g. [(0, 1), .. (n, 1), (0, 2)..., (m, 2)]
     ///
+    /// The different segments may contain deletes, which are expressed by skipping a
+    /// `DocId`. [(0, 1), (0, 3)] <--- here doc_id=0 and doc_id=1 have been deleted
+    ///
+    /// Being trivial is equivalent to having the `new_doc_id_to_old_doc_addr` array sorted.
+    ///
     /// This allows for some optimization.
     pub(crate) fn is_trivial(&self) -> bool {
         self.is_trivial
diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs
index b2725b464..8b478c074 100644
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -86,29 +86,6 @@ pub struct IndexMerger {
     max_doc: u32,
 }
 
-struct TermOrdinalMapping {
-    per_segment_new_term_ordinals: Vec<Vec<TermOrdinal>>,
-}
-
-impl TermOrdinalMapping {
-    fn new(max_term_ords: Vec<TermOrdinal>) -> TermOrdinalMapping {
-        TermOrdinalMapping {
-            per_segment_new_term_ordinals: max_term_ords
-                .into_iter()
-                .map(|max_term_ord| vec![TermOrdinal::default(); max_term_ord as usize])
-                .collect(),
-        }
-    }
-
-    fn register_from_to(&mut self, segment_ord: usize, from_ord: TermOrdinal, to_ord: TermOrdinal) {
-        self.per_segment_new_term_ordinals[segment_ord][from_ord as usize] = to_ord;
-    }
-
-    fn get_segment(&self, segment_ord: usize) -> &[TermOrdinal] {
-        &(self.per_segment_new_term_ordinals[segment_ord])[..]
-    }
-}
-
 struct DeltaComputer {
     buffer: Vec<u32>,
 }
@@ -257,59 +234,8 @@ impl IndexMerger {
         if !doc_id_mapping.is_trivial() {
             todo!()
         }
-        let merge_row_order = MergeRowOrder::Stack(StackMergeOrder::from_columnars(&columnars[..]));
+        let merge_row_order = MergeRowOrder::Stack(StackMergeOrder::stack(&columnars[..]));
         columnar::merge_columnar(&columnars[..], merge_row_order, fast_field_wrt)?;
-        // for (field, field_entry) in self.schema.fields() {
-        //     let field_type = field_entry.field_type();
-        //     match field_type {
-        //         FieldType::Facet(_) | FieldType::Str(_) if field_type.is_fast() => {
-        //             let term_ordinal_mapping = term_ord_mappings.remove(&field).expect(
-        //                 "Logic Error in Tantivy (Please report). Facet field should have required \
-        //                  a`term_ordinal_mapping`.",
-        //             );
-        //             self.write_term_id_fast_field(
-        //                 field,
-        //                 &term_ordinal_mapping,
-        //                 fast_field_serializer,
-        //                 doc_id_mapping,
-        //             )?;
-        //         }
-        //         FieldType::U64(ref options)
-        //         | FieldType::I64(ref options)
-        //         | FieldType::F64(ref options)
-        //         | FieldType::Bool(ref options) => {
-        //             todo!()
-        //         }
-        //         FieldType::Date(ref options) => {
-        //             if options.is_fast() {
-        //                 todo!();
-        //             }
-        //             Some(Cardinality::SingleValue) => {
-        //                 self.write_single_fast_field(field, fast_field_serializer, doc_id_mapping)?;
-        //             }
-        //             Some(Cardinality::MultiValues) => {
-        //                 self.write_multi_fast_field(field, fast_field_serializer, doc_id_mapping)?;
-        //             }
-        //             None => {}
-        //         },
-        //         FieldType::Bytes(byte_options) => {
-        //             if byte_options.is_fast() {
-        //                 self.write_bytes_fast_field(field, fast_field_serializer, doc_id_mapping)?;
-        //             }
-        //         }
-        //         FieldType::IpAddr(options) => {
-        //             if options.is_fast() {
-        //                 todo!();
-        //             }
-        //         },
-        //
-        //         FieldType::JsonObject(_) | FieldType::Facet(_) | FieldType::Str(_) => {
-        //             We don't handle json fast field for the moment
-        //             They can be implemented using what is done
-        //             for facets in the future
-        //         }
-        //     }
-        // }
         Ok(())
     }
@@ -374,7 +300,7 @@ impl IndexMerger {
     /// doc_id.
     /// ReaderWithOrdinal will include the ordinal position of the
     /// reader in self.readers.
-    pub(crate) fn generate_doc_id_mapping(
+    pub(crate) fn generate_doc_id_mapping_with_sort_by_field(
         &self,
         sort_by_field: &IndexSortByField,
     ) -> crate::Result<SegmentDocIdMapping> {
@@ -454,7 +380,7 @@ impl IndexMerger {
         serializer: &mut InvertedIndexSerializer,
         fieldnorm_reader: Option<FieldNormReader>,
         doc_id_mapping: &SegmentDocIdMapping,
-    ) -> crate::Result<Option<TermOrdinalMapping>> {
+    ) -> crate::Result<()> {
         debug_time!("write-postings-for-field");
         let mut positions_buffer: Vec<u8> = Vec::with_capacity(1_000);
         let mut delta_computer = DeltaComputer::new();
@@ -566,12 +492,6 @@ impl IndexMerger {
             let to_term_ord = field_serializer.new_term(term_bytes, total_doc_freq)?;
 
-            if let Some(ref mut term_ord_mapping) = term_ord_mapping_opt {
-                for (segment_ord, from_term_ord) in merged_terms.matching_segments() {
-                    term_ord_mapping.register_from_to(segment_ord, from_term_ord, to_term_ord);
-                }
-            }
-
             // We can now serialize this postings, by pushing each document to the
             // postings serializer.
             for (segment_ord, mut segment_postings) in
@@ -622,7 +542,7 @@ impl IndexMerger {
             field_serializer.close_term()?;
         }
         field_serializer.close()?;
-        Ok(term_ord_mapping_opt)
+        Ok(())
     }
 
     fn write_postings(
@@ -630,8 +550,7 @@ impl IndexMerger {
         &self,
         serializer: &mut InvertedIndexSerializer,
         fieldnorm_readers: FieldNormReaders,
         doc_id_mapping: &SegmentDocIdMapping,
-    ) -> crate::Result<HashMap<Field, TermOrdinalMapping>> {
-        let mut term_ordinal_mappings = HashMap::new();
+    ) -> crate::Result<()> {
         for (field, field_entry) in self.schema.fields() {
             let fieldnorm_reader = fieldnorm_readers.get_field(field)?;
             if field_entry.is_indexed() {
@@ -646,7 +565,7 @@ impl IndexMerger {
                 }
             }
         }
-        Ok(term_ordinal_mappings)
+        Ok(())
     }
 
     fn write_storable_fields(
@@ -731,7 +650,7 @@ impl IndexMerger {
             if self.is_disjunct_and_sorted_on_sort_property(sort_by_field)? {
                 self.get_doc_id_from_concatenated_data()?
             } else {
-                self.generate_doc_id_mapping(sort_by_field)?
+                self.generate_doc_id_mapping_with_sort_by_field(sort_by_field)?
             }
         } else {
             self.get_doc_id_from_concatenated_data()?
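
Note: the heart of this patch is replacing closure-based `Fn() -> Iterator` arguments with the re-iterable `&dyn Iterable<T>` trait object, so the two-pass codecs (one pass to collect stats, one to encode) can ask for a fresh iterator twice without monomorphizing on a closure type. Below is a minimal, self-contained sketch of that pattern, assuming only std: the `Iterable` trait and the slice impl mirror columnar/src/iterable.rs from this patch, while `serialize_two_pass` is a hypothetical stand-in for the real serializers, not an API introduced here.

    pub trait Iterable<T = u64> {
        fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_>;
    }

    // Any slice of Copy values is re-iterable for free.
    impl<'a, T: Copy> Iterable<T> for &'a [T] {
        fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_> {
            Box::new(self.iter().copied())
        }
    }

    // Hypothetical two-pass consumer: pass 1 collects a statistic,
    // pass 2 re-iterates over the same values to produce the encoded output.
    fn serialize_two_pass(vals: &dyn Iterable<u64>) -> (u64, Vec<u64>) {
        let max = vals.boxed_iter().max().unwrap_or(0);
        let encoded: Vec<u64> = vals.boxed_iter().map(|v| max - v).collect();
        (max, encoded)
    }

    fn main() {
        let vals: Vec<u64> = vec![3, 1, 2];
        // `&&vals[..]`, as in the updated tests: `&vals[..]` is a `&[u64]`,
        // which implements `Iterable<u64>`; taking one more reference lets it
        // coerce to `&dyn Iterable<u64>`.
        let (max, encoded) = serialize_two_pass(&&vals[..]);
        assert_eq!((max, encoded), (3, vec![0, 2, 1]));
    }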