From 791350091c3d2661f217704bee0294d980e79830 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 20 Oct 2022 19:44:19 +0800 Subject: [PATCH] switch num_vals() to u32 fixes #1630 --- fastfield_codecs/benches/bench.rs | 2 +- fastfield_codecs/src/bitpacked.rs | 2 +- fastfield_codecs/src/blockwise_linear.rs | 8 +++---- fastfield_codecs/src/column.rs | 18 +++++++-------- .../src/compact_space/build_compact_space.rs | 2 +- fastfield_codecs/src/compact_space/mod.rs | 22 +++++++++---------- fastfield_codecs/src/lib.rs | 2 +- fastfield_codecs/src/line.rs | 12 +++++----- fastfield_codecs/src/linear.rs | 2 +- fastfield_codecs/src/main.rs | 4 ++-- fastfield_codecs/src/serialize.rs | 10 ++++----- src/fastfield/multivalued/writer.rs | 8 +++---- src/fastfield/serializer/mod.rs | 2 +- src/fastfield/writer.rs | 10 ++++----- src/indexer/merger.rs | 4 ++-- src/indexer/sorted_doc_id_column.rs | 6 ++--- .../sorted_doc_id_multivalue_column.rs | 12 +++++----- 17 files changed, 63 insertions(+), 63 deletions(-) diff --git a/fastfield_codecs/benches/bench.rs b/fastfield_codecs/benches/bench.rs index 5546d2af7..f2fd6bdde 100644 --- a/fastfield_codecs/benches/bench.rs +++ b/fastfield_codecs/benches/bench.rs @@ -101,7 +101,7 @@ mod tests { fn get_u128_column_from_data(data: &[u128]) -> Arc> { let mut out = vec![]; let iter_gen = || data.iter().cloned(); - serialize_u128(iter_gen, data.len() as u64, &mut out).unwrap(); + serialize_u128(iter_gen, data.len() as u32, &mut out).unwrap(); let out = OwnedBytes::new(out); open_u128::(out).unwrap() } diff --git a/fastfield_codecs/src/bitpacked.rs b/fastfield_codecs/src/bitpacked.rs index f91993a66..25416d947 100644 --- a/fastfield_codecs/src/bitpacked.rs +++ b/fastfield_codecs/src/bitpacked.rs @@ -30,7 +30,7 @@ impl Column for BitpackedReader { self.normalized_header.max_value } #[inline] - fn num_vals(&self) -> u64 { + fn num_vals(&self) -> u32 { self.normalized_header.num_vals } } diff --git a/fastfield_codecs/src/blockwise_linear.rs b/fastfield_codecs/src/blockwise_linear.rs index 360f5c3ad..c589d304e 100644 --- a/fastfield_codecs/src/blockwise_linear.rs +++ b/fastfield_codecs/src/blockwise_linear.rs @@ -36,7 +36,7 @@ impl BinarySerializable for Block { } } -fn compute_num_blocks(num_vals: u64) -> usize { +fn compute_num_blocks(num_vals: u32) -> usize { (num_vals as usize + CHUNK_SIZE - 1) / CHUNK_SIZE } @@ -72,7 +72,7 @@ impl FastFieldCodec for BlockwiseLinearCodec { // Estimate first_chunk and extrapolate fn estimate(column: &dyn crate::Column) -> Option { - if column.num_vals() < 10 * CHUNK_SIZE as u64 { + if column.num_vals() < 10 * CHUNK_SIZE as u32 { return None; } let mut first_chunk: Vec = column.iter().take(CHUNK_SIZE as usize).collect(); @@ -95,7 +95,7 @@ impl FastFieldCodec for BlockwiseLinearCodec { }; let num_bits = estimated_bit_width as u64 * column.num_vals() as u64 // function metadata per block - + metadata_per_block as u64 * (column.num_vals() / CHUNK_SIZE as u64); + + metadata_per_block as u64 * (column.num_vals() as u64 / CHUNK_SIZE as u64); let num_bits_uncompressed = 64 * column.num_vals(); Some(num_bits as f32 / num_bits_uncompressed as f32) } @@ -180,7 +180,7 @@ impl Column for BlockwiseLinearReader { self.normalized_header.max_value } - fn num_vals(&self) -> u64 { + fn num_vals(&self) -> u32 { self.normalized_header.num_vals } } diff --git a/fastfield_codecs/src/column.rs b/fastfield_codecs/src/column.rs index 96621996e..864601a91 100644 --- a/fastfield_codecs/src/column.rs +++ b/fastfield_codecs/src/column.rs @@ -35,7 +35,7 @@ pub trait Column: Send + Sync { #[inline] fn get_between_vals(&self, range: RangeInclusive) -> Vec { let mut vals = Vec::new(); - for idx in 0..self.num_vals() { + for idx in 0..self.num_vals() as u64 { let val = self.get_val(idx); if range.contains(&val) { vals.push(idx); @@ -61,11 +61,11 @@ pub trait Column: Send + Sync { fn max_value(&self) -> T; /// The number of values in the column. - fn num_vals(&self) -> u64; + fn num_vals(&self) -> u32; /// Returns a iterator over the data fn iter<'a>(&'a self) -> Box + 'a> { - Box::new((0..self.num_vals()).map(|idx| self.get_val(idx))) + Box::new((0..self.num_vals() as u64).map(|idx| self.get_val(idx))) } } @@ -89,7 +89,7 @@ impl<'a, C: Column, T: Copy + PartialOrd> Column for &'a C { (*self).max_value() } - fn num_vals(&self) -> u64 { + fn num_vals(&self) -> u32 { (*self).num_vals() } @@ -119,8 +119,8 @@ impl<'a, T: Copy + PartialOrd + Send + Sync> Column for VecColumn<'a, T> { self.max_value } - fn num_vals(&self) -> u64 { - self.values.len() as u64 + fn num_vals(&self) -> u32 { + self.values.len() as u32 } fn get_range(&self, start: u64, output: &mut [T]) { @@ -203,7 +203,7 @@ where self.monotonic_mapping.mapping(from_max_value) } - fn num_vals(&self) -> u64 { + fn num_vals(&self) -> u32 { self.from_column.num_vals() } @@ -253,8 +253,8 @@ where self.0.clone().last().unwrap() } - fn num_vals(&self) -> u64 { - self.0.len() as u64 + fn num_vals(&self) -> u32 { + self.0.len() as u32 } fn iter(&self) -> Box + '_> { diff --git a/fastfield_codecs/src/compact_space/build_compact_space.rs b/fastfield_codecs/src/compact_space/build_compact_space.rs index 02bbe97f8..90e14d3ef 100644 --- a/fastfield_codecs/src/compact_space/build_compact_space.rs +++ b/fastfield_codecs/src/compact_space/build_compact_space.rs @@ -57,7 +57,7 @@ fn num_bits(val: u128) -> u8 { /// metadata. pub fn get_compact_space( values_deduped_sorted: &BTreeSet, - total_num_values: u64, + total_num_values: u32, cost_per_blank: usize, ) -> CompactSpace { let mut compact_space_builder = CompactSpaceBuilder::new(); diff --git a/fastfield_codecs/src/compact_space/mod.rs b/fastfield_codecs/src/compact_space/mod.rs index dd6dfbdbb..b47bb5744 100644 --- a/fastfield_codecs/src/compact_space/mod.rs +++ b/fastfield_codecs/src/compact_space/mod.rs @@ -165,13 +165,13 @@ pub struct IPCodecParams { bit_unpacker: BitUnpacker, min_value: u128, max_value: u128, - num_vals: u64, + num_vals: u32, num_bits: u8, } impl CompactSpaceCompressor { /// Taking the vals as Vec may cost a lot of memory. It is used to sort the vals. - pub fn train_from(iter: impl Iterator, num_vals: u64) -> Self { + pub fn train_from(iter: impl Iterator, num_vals: u32) -> Self { let mut values_sorted = BTreeSet::new(); values_sorted.extend(iter); let total_num_values = num_vals; @@ -200,7 +200,7 @@ impl CompactSpaceCompressor { bit_unpacker: BitUnpacker::new(num_bits), min_value, max_value, - num_vals: total_num_values as u64, + num_vals: total_num_values, num_bits, }, } @@ -267,7 +267,7 @@ impl BinarySerializable for IPCodecParams { let _header_flags = u64::deserialize(reader)?; let min_value = VIntU128::deserialize(reader)?.0; let max_value = VIntU128::deserialize(reader)?.0; - let num_vals = VIntU128::deserialize(reader)?.0 as u64; + let num_vals = VIntU128::deserialize(reader)?.0 as u32; let num_bits = u8::deserialize(reader)?; let compact_space = CompactSpace::deserialize(reader)?; @@ -296,7 +296,7 @@ impl Column for CompactSpaceDecompressor { self.max_value() } - fn num_vals(&self) -> u64 { + fn num_vals(&self) -> u32 { self.params.num_vals } @@ -378,7 +378,7 @@ impl CompactSpaceDecompressor { let mut positions = Vec::new(); let step_size = 4; - let cutoff = self.params.num_vals - self.params.num_vals % step_size; + let cutoff = self.params.num_vals as u64 - self.params.num_vals as u64 % step_size; let mut push_if_in_range = |idx, val| { if range.contains(&val) { @@ -403,7 +403,7 @@ impl CompactSpaceDecompressor { } // handle rest - for idx in cutoff..self.params.num_vals { + for idx in cutoff..self.params.num_vals as u64 { push_if_in_range(idx, get_val(idx)); } @@ -452,7 +452,7 @@ mod tests { ] .into_iter() .collect(); - let compact_space = get_compact_space(ips, ips.len() as u64, 11); + let compact_space = get_compact_space(ips, ips.len() as u32, 11); let amplitude = compact_space.amplitude_compact_space(); assert_eq!(amplitude, 17); assert_eq!(1, compact_space.u128_to_compact(2).unwrap()); @@ -483,7 +483,7 @@ mod tests { #[test] fn compact_space_amplitude_test() { let ips = &[100000u128, 1000000].into_iter().collect(); - let compact_space = get_compact_space(ips, ips.len() as u64, 1); + let compact_space = get_compact_space(ips, ips.len() as u32, 1); let amplitude = compact_space.amplitude_compact_space(); assert_eq!(amplitude, 2); } @@ -515,7 +515,7 @@ mod tests { let mut out = Vec::new(); serialize_u128( || u128_vals.iter().cloned(), - u128_vals.len() as u64, + u128_vals.len() as u32, &mut out, ) .unwrap(); @@ -608,7 +608,7 @@ mod tests { 5_000_000_000, ]; let mut out = Vec::new(); - serialize_u128(|| vals.iter().cloned(), vals.len() as u64, &mut out).unwrap(); + serialize_u128(|| vals.iter().cloned(), vals.len() as u32, &mut out).unwrap(); let decomp = open_u128::(OwnedBytes::new(out)).unwrap(); assert_eq!(decomp.get_between_vals(199..=200), vec![0]); diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs index c8f631fc8..25ca123e2 100644 --- a/fastfield_codecs/src/lib.rs +++ b/fastfield_codecs/src/lib.rs @@ -199,7 +199,7 @@ mod tests { let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0); let reader = crate::open::(OwnedBytes::new(out)).unwrap(); - assert_eq!(reader.num_vals(), data.len() as u64); + assert_eq!(reader.num_vals(), data.len() as u32); for (doc, orig_val) in data.iter().copied().enumerate() { let val = reader.get_val(doc as u64); assert_eq!( diff --git a/fastfield_codecs/src/line.rs b/fastfield_codecs/src/line.rs index 4119da9ee..4613faf04 100644 --- a/fastfield_codecs/src/line.rs +++ b/fastfield_codecs/src/line.rs @@ -1,5 +1,5 @@ use std::io; -use std::num::NonZeroU64; +use std::num::NonZeroU32; use common::{BinarySerializable, VInt}; @@ -29,7 +29,7 @@ pub struct Line { /// compute_slope(y0, y1) /// = compute_slope(y0 + X % 2^64, y1 + X % 2^64) /// ` -fn compute_slope(y0: u64, y1: u64, num_vals: NonZeroU64) -> u64 { +fn compute_slope(y0: u64, y1: u64, num_vals: NonZeroU32) -> u64 { let dy = y1.wrapping_sub(y0); let sign = dy <= (1 << 63); let abs_dy = if sign { @@ -43,7 +43,7 @@ fn compute_slope(y0: u64, y1: u64, num_vals: NonZeroU64) -> u64 { return 0u64; } - let abs_slope = (abs_dy << 32) / num_vals.get(); + let abs_slope = (abs_dy << 32) / num_vals.get() as u64; if sign { abs_slope } else { @@ -75,7 +75,7 @@ impl Line { Self::train_from( first_val, last_val, - num_vals, + num_vals as u32, sample_positions_and_values.iter().cloned(), ) } @@ -84,11 +84,11 @@ impl Line { fn train_from( first_val: u64, last_val: u64, - num_vals: u64, + num_vals: u32, positions_and_values: impl Iterator, ) -> Self { // TODO replace with let else - let idx_last_val = if let Some(idx_last_val) = NonZeroU64::new(num_vals - 1) { + let idx_last_val = if let Some(idx_last_val) = NonZeroU32::new(num_vals - 1) { idx_last_val } else { return Line::default(); diff --git a/fastfield_codecs/src/linear.rs b/fastfield_codecs/src/linear.rs index 6ed2d84b6..ad2a0ca74 100644 --- a/fastfield_codecs/src/linear.rs +++ b/fastfield_codecs/src/linear.rs @@ -37,7 +37,7 @@ impl Column for LinearReader { } #[inline] - fn num_vals(&self) -> u64 { + fn num_vals(&self) -> u32 { self.header.num_vals } } diff --git a/fastfield_codecs/src/main.rs b/fastfield_codecs/src/main.rs index 8e81c41f5..988f41b1f 100644 --- a/fastfield_codecs/src/main.rs +++ b/fastfield_codecs/src/main.rs @@ -90,7 +90,7 @@ fn bench_ip() { { let mut data = vec![]; for dataset in dataset.chunks(500_000) { - serialize_u128(|| dataset.iter().cloned(), dataset.len() as u64, &mut data).unwrap(); + serialize_u128(|| dataset.iter().cloned(), dataset.len() as u32, &mut data).unwrap(); } let compression = data.len() as f64 / (dataset.len() * 16) as f64; println!("Compression 50_000 chunks {:.4}", compression); @@ -103,7 +103,7 @@ fn bench_ip() { let mut data = vec![]; { print_time!("creation"); - serialize_u128(|| dataset.iter().cloned(), dataset.len() as u64, &mut data).unwrap(); + serialize_u128(|| dataset.iter().cloned(), dataset.len() as u32, &mut data).unwrap(); } let compression = data.len() as f64 / (dataset.len() * 16) as f64; diff --git a/fastfield_codecs/src/serialize.rs b/fastfield_codecs/src/serialize.rs index 4bb2b295b..1f3041403 100644 --- a/fastfield_codecs/src/serialize.rs +++ b/fastfield_codecs/src/serialize.rs @@ -46,14 +46,14 @@ use crate::{ #[derive(Debug, Copy, Clone)] pub struct NormalizedHeader { /// The number of values in the underlying column. - pub num_vals: u64, + pub num_vals: u32, /// The max value of the underlying column. pub max_value: u64, } #[derive(Debug, Copy, Clone)] pub(crate) struct Header { - pub num_vals: u64, + pub num_vals: u32, pub min_value: u64, pub max_value: u64, pub gcd: Option, @@ -110,7 +110,7 @@ pub fn normalize_column( impl BinarySerializable for Header { fn serialize(&self, writer: &mut W) -> io::Result<()> { - VInt(self.num_vals).serialize(writer)?; + VInt(self.num_vals as u64).serialize(writer)?; VInt(self.min_value).serialize(writer)?; VInt(self.max_value - self.min_value).serialize(writer)?; if let Some(gcd) = self.gcd { @@ -123,7 +123,7 @@ impl BinarySerializable for Header { } fn deserialize(reader: &mut R) -> io::Result { - let num_vals = VInt::deserialize(reader)?.0; + let num_vals = VInt::deserialize(reader)?.0 as u32; let min_value = VInt::deserialize(reader)?.0; let amplitude = VInt::deserialize(reader)?.0; let max_value = min_value + amplitude; @@ -164,7 +164,7 @@ pub fn estimate( /// Serializes u128 values with the compact space codec. pub fn serialize_u128 I, I: Iterator>( iter_gen: F, - num_vals: u64, + num_vals: u32, output: &mut impl io::Write, ) -> io::Result<()> { // TODO write header, to later support more codecs diff --git a/src/fastfield/multivalued/writer.rs b/src/fastfield/multivalued/writer.rs index 84cc0aa3b..446f21004 100644 --- a/src/fastfield/multivalued/writer.rs +++ b/src/fastfield/multivalued/writer.rs @@ -219,7 +219,7 @@ pub(crate) struct MultivalueStartIndex<'a, C: Column> { impl<'a, C: Column> MultivalueStartIndex<'a, C> { pub fn new(column: &'a C, doc_id_map: &'a DocIdMapping) -> Self { - assert_eq!(column.num_vals(), doc_id_map.num_old_doc_ids() as u64 + 1); + assert_eq!(column.num_vals(), doc_id_map.num_old_doc_ids() as u32 + 1); let (min, max) = tantivy_bitpacker::minmax(iter_remapped_multivalue_index(doc_id_map, column)) .unwrap_or((0u64, 0u64)); @@ -244,8 +244,8 @@ impl<'a, C: Column> Column for MultivalueStartIndex<'a, C> { self.max } - fn num_vals(&self) -> u64 { - (self.doc_id_map.num_new_doc_ids() + 1) as u64 + fn num_vals(&self) -> u32 { + (self.doc_id_map.num_new_doc_ids() + 1) as u32 } fn iter(&self) -> Box + '_> { @@ -369,7 +369,7 @@ impl MultiValueU128FastFieldWriter { serializer.create_u128_fast_field_with_idx( self.field, iter_gen, - self.vals.len() as u64, + self.vals.len() as u32, 1, )?; } diff --git a/src/fastfield/serializer/mod.rs b/src/fastfield/serializer/mod.rs index e0fb6e64b..060646361 100644 --- a/src/fastfield/serializer/mod.rs +++ b/src/fastfield/serializer/mod.rs @@ -90,7 +90,7 @@ impl CompositeFastFieldSerializer { &mut self, field: Field, iter_gen: F, - num_vals: u64, + num_vals: u32, idx: usize, ) -> io::Result<()> { let field_write = self.composite_write.for_field_with_idx(field, idx); diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 8478008e9..f12027ad1 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -363,7 +363,7 @@ impl U128FastFieldWriter { serializer.create_u128_fast_field_with_idx( self.field, iter_gen, - self.val_count as u64, + self.val_count as u32, 0, )?; } else { @@ -371,7 +371,7 @@ impl U128FastFieldWriter { serializer.create_u128_fast_field_with_idx( self.field, iter_gen, - self.val_count as u64, + self.val_count as u32, 0, )?; } @@ -511,7 +511,7 @@ impl IntFastFieldWriter { vals: &self.vals, min_value: min, max_value: max, - num_vals: self.val_count as u64, + num_vals: self.val_count as u32, }; serializer.create_auto_detect_u64_fast_field(self.field, fastfield_accessor)?; @@ -526,7 +526,7 @@ struct WriterFastFieldAccessProvider<'map, 'bitp> { vals: &'bitp BlockedBitpacker, min_value: u64, max_value: u64, - num_vals: u64, + num_vals: u32, } impl<'map, 'bitp> Column for WriterFastFieldAccessProvider<'map, 'bitp> { @@ -562,7 +562,7 @@ impl<'map, 'bitp> Column for WriterFastFieldAccessProvider<'map, 'bitp> { self.max_value } - fn num_vals(&self) -> u64 { + fn num_vals(&self) -> u32 { self.num_vals } } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index b1963e674..bfe535c87 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -368,7 +368,7 @@ impl IndexMerger { fast_field_serializer.create_u128_fast_field_with_idx( field, iter_gen, - doc_id_mapping.len() as u64, + doc_id_mapping.len() as u32, 1, )?; @@ -403,7 +403,7 @@ impl IndexMerger { fast_field_serializer.create_u128_fast_field_with_idx( field, iter_gen, - doc_id_mapping.len() as u64, + doc_id_mapping.len() as u32, 0, )?; Ok(()) diff --git a/src/indexer/sorted_doc_id_column.rs b/src/indexer/sorted_doc_id_column.rs index c02fe1f16..3d5b36059 100644 --- a/src/indexer/sorted_doc_id_column.rs +++ b/src/indexer/sorted_doc_id_column.rs @@ -12,7 +12,7 @@ pub(crate) struct RemappedDocIdColumn<'a> { fast_field_readers: Vec>>, min_value: u64, max_value: u64, - num_vals: u64, + num_vals: u32, } fn compute_min_max_val( @@ -73,7 +73,7 @@ impl<'a> RemappedDocIdColumn<'a> { fast_field_readers, min_value, max_value, - num_vals: doc_id_mapping.len() as u64, + num_vals: doc_id_mapping.len() as u32, } } } @@ -102,7 +102,7 @@ impl<'a> Column for RemappedDocIdColumn<'a> { self.max_value } - fn num_vals(&self) -> u64 { + fn num_vals(&self) -> u32 { self.num_vals } } diff --git a/src/indexer/sorted_doc_id_multivalue_column.rs b/src/indexer/sorted_doc_id_multivalue_column.rs index 30e1beaba..650043f60 100644 --- a/src/indexer/sorted_doc_id_multivalue_column.rs +++ b/src/indexer/sorted_doc_id_multivalue_column.rs @@ -13,7 +13,7 @@ pub(crate) struct RemappedDocIdMultiValueColumn<'a> { fast_field_readers: Vec>, min_value: u64, max_value: u64, - num_vals: u64, + num_vals: u32, } impl<'a> RemappedDocIdMultiValueColumn<'a> { @@ -61,7 +61,7 @@ impl<'a> RemappedDocIdMultiValueColumn<'a> { fast_field_readers, min_value, max_value, - num_vals: num_vals as u64, + num_vals: num_vals as u32, } } } @@ -89,7 +89,7 @@ impl<'a> Column for RemappedDocIdMultiValueColumn<'a> { self.max_value } - fn num_vals(&self) -> u64 { + fn num_vals(&self) -> u32 { self.num_vals } } @@ -99,7 +99,7 @@ pub(crate) struct RemappedDocIdMultiValueIndexColumn<'a, T: MultiValueLength> { multi_value_length_readers: Vec<&'a T>, min_value: u64, max_value: u64, - num_vals: u64, + num_vals: u32, } impl<'a, T: MultiValueLength> RemappedDocIdMultiValueIndexColumn<'a, T> { @@ -123,7 +123,7 @@ impl<'a, T: MultiValueLength> RemappedDocIdMultiValueIndexColumn<'a, T> { max_value += multi_value_length_reader.get_len(doc); } } - num_vals += segment_reader.num_docs() as u64; + num_vals += segment_reader.num_docs(); multi_value_length_readers.push(multi_value_length_reader); } Self { @@ -162,7 +162,7 @@ impl<'a, T: MultiValueLength + Send + Sync> Column for RemappedDocIdMultiValueIn self.max_value } - fn num_vals(&self) -> u64 { + fn num_vals(&self) -> u32 { self.num_vals } }