Compare commits

...

2 Commits

Author        SHA1        Message                                           Date
Paul Masurel  7b06db062b  Adding a special method to Arc<dyn ColumnValues>  2023-02-14 23:14:14 +09:00
Paul Masurel  097fd6138d  Fix clippy comments (#1872)                       2023-02-14 23:12:45 +09:00
36 changed files with 156 additions and 180 deletions

View File

@@ -84,7 +84,7 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
// Load rows
self.values
- .get_row_ids_for_value_range(value_range.clone(), rowid_range, docids);
+ .get_row_ids_for_value_range(value_range, rowid_range, docids);
// Convert rows to docids
self.idx
.select_batch_in_place(docids, selected_docid_range.start);
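Note: the hunk above matches clippy's redundant_clone lint: `value_range` is not used again after the call, so the `RangeInclusive` can be moved instead of cloned. A minimal sketch of the pattern, with hypothetical names:

    fn consume(range: std::ops::RangeInclusive<u32>) -> u32 {
        *range.start()
    }

    fn caller() {
        let value_range = 1u32..=10;
        // flagged by clippy: `value_range.clone()` here would be redundant,
        // since the original is never used again
        let first = consume(value_range); // move it instead of cloning
        assert_eq!(first, 1);
    }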

View File

@@ -93,11 +93,7 @@ fn iter_num_values<'a>(
match column_index {
ColumnIndex::Full => 1,
ColumnIndex::Optional(optional_index) => {
- if optional_index.contains(row_addr.row_id) {
- 1u32
- } else {
- 0u32
- }
+ u32::from(optional_index.contains(row_addr.row_id))
}
ColumnIndex::Multivalued(multivalued_index) => {
multivalued_index.range(row_addr.row_id).len() as u32
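Note: the `u32::from(..)` rewrite above is the fix suggested by clippy's bool_to_int_with_if lint: `From<bool>` is implemented for the integer types and maps `false`/`true` to `0`/`1`, so the if/else is redundant. A minimal sketch:

    fn num_values(present: bool) -> u32 {
        // u32::from(true) == 1, u32::from(false) == 0
        u32::from(present)
    }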

View File

@@ -48,7 +48,7 @@ impl ColumnIndex {
ColumnIndex::Full => true,
ColumnIndex::Optional(optional_index) => optional_index.contains(doc_id),
ColumnIndex::Multivalued(multivalued_index) => {
- multivalued_index.range(doc_id).len() > 0
+ !multivalued_index.range(doc_id).is_empty()
}
}
}
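Note: clippy's len_zero lint prefers `is_empty()` over comparing `len()` against zero; for the range returned by `range(doc_id)` both methods exist, and `is_empty()` states the intent directly. A minimal sketch:

    fn has_values(range: std::ops::Range<u32>) -> bool {
        // equivalent to `range.len() > 0`, but lint-clean
        !range.is_empty()
    }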

View File

@@ -220,7 +220,7 @@ impl Set<RowId> for OptionalIndex {
block_doc_idx_start + in_block_rank as u32
}
- fn select_cursor<'b>(&'b self) -> OptionalIndexSelectCursor<'b> {
+ fn select_cursor(&self) -> OptionalIndexSelectCursor<'_> {
OptionalIndexSelectCursor {
current_block_cursor: BlockSelectCursor::Sparse(
SparseBlockCodec::open(b"").select_cursor(),
@@ -255,7 +255,7 @@ impl OptionalIndex {
self.num_non_null_rows
}
- pub fn iter_rows<'a>(&'a self) -> impl Iterator<Item = RowId> + 'a {
+ pub fn iter_rows(&self) -> impl Iterator<Item = RowId> + '_ {
// TODO optimize
let mut select_batch = self.select_cursor();
(0..self.num_non_null_rows).map(move |rank| select_batch.select(rank))
@@ -268,7 +268,7 @@ impl OptionalIndex {
}
#[inline]
- fn block<'a>(&'a self, block_meta: BlockMeta) -> Block<'a> {
+ fn block(&self, block_meta: BlockMeta) -> Block<'_> {
let BlockMeta {
start_byte_offset,
block_variant,
@@ -351,7 +351,7 @@ fn serialize_optional_index_block(block_els: &[u16], out: &mut impl io::Write) -
Ok(())
}
- pub fn serialize_optional_index<'a, W: io::Write>(
+ pub fn serialize_optional_index<W: io::Write>(
non_null_rows: &dyn Iterable<RowId>,
num_rows: RowId,
output: &mut W,
@@ -427,7 +427,7 @@ impl SerializedBlockMeta {
}
#[inline]
- fn to_bytes(&self) -> [u8; SERIALIZED_BLOCK_META_NUM_BYTES] {
+ fn to_bytes(self) -> [u8; SERIALIZED_BLOCK_META_NUM_BYTES] {
assert!(self.num_non_null_rows > 0);
let mut bytes = [0u8; SERIALIZED_BLOCK_META_NUM_BYTES];
bytes[0..2].copy_from_slice(&self.block_id.to_le_bytes());
@@ -501,7 +501,7 @@ pub fn open_optional_index(bytes: OwnedBytes) -> io::Result<OptionalIndex> {
num_non_empty_block_bytes as usize * SERIALIZED_BLOCK_META_NUM_BYTES;
let (block_data, block_metas) = bytes.rsplit(block_metas_num_bytes);
let (block_metas, num_non_null_rows) =
- deserialize_optional_index_block_metadatas(block_metas.as_slice(), num_rows).into();
+ deserialize_optional_index_block_metadatas(block_metas.as_slice(), num_rows);
let optional_index = OptionalIndex {
num_rows,
num_non_null_rows,
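Note: the signature rewrites in this file (and in the codec files below) fix clippy's needless_lifetimes lint: when the only named lifetime ties the return value to `&self` or to a single reference argument, it can be elided with `'_`. The `to_bytes(&self)` to `to_bytes(self)` changes fix trivially_copy_pass_by_ref for small `Copy` types, and the dropped `.into()` fixes useless_conversion. A minimal sketch of the lifetime elision:

    struct Index {
        rows: Vec<u32>,
    }

    impl Index {
        // before: fn iter_rows<'a>(&'a self) -> impl Iterator<Item = u32> + 'a
        fn iter_rows(&self) -> impl Iterator<Item = u32> + '_ {
            self.rows.iter().copied()
        }
    }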

View File

@@ -10,7 +10,7 @@ pub trait SetCodec {
///
/// May panic if the elements are not sorted.
fn serialize(els: impl Iterator<Item = Self::Item>, wrt: impl io::Write) -> io::Result<()>;
- fn open<'a>(data: &'a [u8]) -> Self::Reader<'a>;
+ fn open(data: &[u8]) -> Self::Reader<'_>;
}
/// Stateful object that makes it possible to compute several selects in a row,
@@ -43,5 +43,5 @@ pub trait Set<T> {
fn select(&self, rank: T) -> T;
/// Creates a brand new select cursor.
- fn select_cursor<'b>(&'b self) -> Self::SelectCursor<'b>;
+ fn select_cursor(&self) -> Self::SelectCursor<'_>;
}

View File

@@ -45,7 +45,7 @@ impl SetCodec for DenseBlockCodec {
}
#[inline]
- fn open<'a>(data: &'a [u8]) -> Self::Reader<'a> {
+ fn open(data: &[u8]) -> Self::Reader<'_> {
assert_eq!(data.len(), DENSE_BLOCK_NUM_BYTES as usize);
DenseBlock(data)
}
@@ -94,7 +94,7 @@ impl DenseMiniBlock {
Self { bitvec, rank }
}
- fn to_bytes(&self) -> [u8; MINI_BLOCK_NUM_BYTES] {
+ fn to_bytes(self) -> [u8; MINI_BLOCK_NUM_BYTES] {
let mut bytes = [0u8; MINI_BLOCK_NUM_BYTES];
bytes[..MINI_BLOCK_BITVEC_NUM_BYTES].copy_from_slice(&self.bitvec.to_le_bytes());
bytes[MINI_BLOCK_BITVEC_NUM_BYTES..].copy_from_slice(&self.rank.to_le_bytes());
@@ -166,7 +166,7 @@ impl<'a> Set<u16> for DenseBlock<'a> {
}
#[inline(always)]
- fn select_cursor<'b>(&'b self) -> Self::SelectCursor<'b> {
+ fn select_cursor(&self) -> Self::SelectCursor<'_> {
DenseBlockSelectCursor {
block_id: 0,
dense_block: *self,

View File

@@ -16,7 +16,7 @@ impl SetCodec for SparseBlockCodec {
Ok(())
}
- fn open<'a>(data: &'a [u8]) -> Self::Reader<'a> {
+ fn open(data: &[u8]) -> Self::Reader<'_> {
SparseBlock(data)
}
}
@@ -56,7 +56,7 @@ impl<'a> Set<u16> for SparseBlock<'a> {
}
#[inline(always)]
- fn select_cursor<'b>(&'b self) -> Self::SelectCursor<'b> {
+ fn select_cursor(&self) -> Self::SelectCursor<'_> {
*self
}
}

View File

@@ -110,31 +110,14 @@ impl<T: Copy + PartialOrd + Debug> ColumnValues<T> for Arc<dyn ColumnValues<T>>
fn get_range(&self, start: u64, output: &mut [T]) {
self.as_ref().get_range(start, output)
}
- }
- impl<'a, C: ColumnValues<T> + ?Sized, T: Copy + PartialOrd + Debug> ColumnValues<T> for &'a C {
- fn get_val(&self, idx: u32) -> T {
- (*self).get_val(idx)
- }
- fn min_value(&self) -> T {
- (*self).min_value()
- }
- fn max_value(&self) -> T {
- (*self).max_value()
- }
- fn num_vals(&self) -> u32 {
- (*self).num_vals()
- }
- fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = T> + 'b> {
- (*self).iter()
- }
- fn get_range(&self, start: u64, output: &mut [T]) {
- (*self).get_range(start, output)
+ fn get_row_ids_for_value_range(
+ &self,
+ value_range: RangeInclusive<T>,
+ row_id_range: Range<RowId>,
+ row_id_hits: &mut Vec<RowId>,
+ ) {
+ self.as_ref().get_row_ids_for_value_range(value_range, row_id_range, row_id_hits)
}
}
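Note: this hunk is the substance of commit 7b06db062b: `get_row_ids_for_value_range` is now implemented on `Arc<dyn ColumnValues<T>>` by forwarding through `as_ref()`, so calls on the `Arc` reach the concrete codec's override instead of falling back to the trait's default body; the generic impl for `&'a C` is dropped in the same hunk. A minimal sketch of the forwarding pattern, with simplified hypothetical signatures:

    use std::sync::Arc;

    trait Values {
        fn get(&self, idx: u32) -> u64;
        // default implementation; codecs may override it with something faster
        fn collect_in_range(&self, range: std::ops::Range<u32>, hits: &mut Vec<u32>) {
            hits.extend(range.filter(|&i| self.get(i) > 0));
        }
    }

    impl Values for Arc<dyn Values> {
        fn get(&self, idx: u32) -> u64 {
            self.as_ref().get(idx)
        }
        // Without this forwarding method, calling `collect_in_range` on the
        // Arc would run the default body above rather than the inner
        // codec's specialized override.
        fn collect_in_range(&self, range: std::ops::Range<u32>, hits: &mut Vec<u32>) {
            self.as_ref().collect_in_range(range, hits)
        }
    }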

View File

@@ -1,5 +1,4 @@
#![warn(missing_docs)]
#![cfg_attr(all(feature = "unstable", test), feature(test))]
//! # `fastfield_codecs`
//!
@@ -26,10 +25,10 @@ mod stats;
pub(crate) mod u64_based;
mod column;
- pub mod serialize;
+ pub(crate) mod serialize;
pub use serialize::serialize_column_values_u128;
- pub use stats::Stats;
+ pub use stats::ColumnStats;
pub use u64_based::{
load_u64_based_column_values, serialize_and_load_u64_based_column_values,
serialize_u64_based_column_values, CodecType, ALL_U64_CODEC_TYPES,
@@ -137,7 +136,6 @@ mod bench {
use test::{self, Bencher};
use super::*;
- use crate::column_values::serialize::NormalizedHeader;
use crate::column_values::u64_based::*;
fn get_data() -> Vec<u64> {
@@ -154,7 +152,7 @@ mod bench {
data
}
- fn compute_stats(vals: impl Iterator<Item = u64>) -> Stats {
+ fn compute_stats(vals: impl Iterator<Item = u64>) -> ColumnStats {
let mut stats_collector = StatsCollector::default();
for val in vals {
stats_collector.collect(val);
@@ -166,7 +164,7 @@ mod bench {
fn value_iter() -> impl Iterator<Item = u64> {
0..20_000
}
- fn get_reader_for_bench<Codec: ColumnCodec>(data: &[u64]) -> Codec::Reader {
+ fn get_reader_for_bench<Codec: ColumnCodec>(data: &[u64]) -> Codec::ColumnValues {
let mut bytes = Vec::new();
let stats = compute_stats(data.iter().cloned());
let mut codec_serializer = Codec::estimator();

View File

@@ -123,6 +123,7 @@ pub(crate) struct StrictlyMonotonicMappingToInternalGCDBaseval {
min_value: u64,
}
impl StrictlyMonotonicMappingToInternalGCDBaseval {
+ /// Creates a linear mapping `x -> gcd*x + min_value`.
pub(crate) fn new(gcd: u64, min_value: u64) -> Self {
let gcd_divider = DividerU64::divide_by(gcd);
Self {
@@ -151,7 +152,9 @@ impl<External: MonotonicallyMappableToU64> StrictlyMonotonicFn<External, u64>
pub(crate) struct StrictlyMonotonicMappingToInternalBaseval {
min_value: u64,
}
impl StrictlyMonotonicMappingToInternalBaseval {
+ /// Creates a linear mapping `x -> x + min_value`.
#[inline(always)]
pub(crate) fn new(min_value: u64) -> Self {
Self { min_value }

View File

@@ -8,19 +8,6 @@ use crate::column_values::U128FastFieldCodecType;
use crate::iterable::Iterable;
use crate::MonotonicallyMappableToU128;
- /// The normalized header gives some parameters after applying the following
- /// normalization of the vector:
- /// `val -> (val - min_value) / gcd`
- ///
- /// By design, after normalization, `min_value = 0` and `gcd = 1`.
- #[derive(Debug, Copy, Clone)]
- pub struct NormalizedHeader {
- /// The number of values in the underlying column.
- pub num_vals: u32,
- /// The max value of the underlying column.
- pub max_value: u64,
- }
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub(crate) struct U128Header {
pub num_vals: u32,

View File

@@ -6,21 +6,28 @@ use common::{BinarySerializable, VInt};
use crate::RowId;
+ /// Column statistics.
#[derive(Debug, Clone, Eq, PartialEq)]
- pub struct Stats {
+ pub struct ColumnStats {
+ /// GCD of the elements `el - min(column)`.
pub gcd: NonZeroU64,
+ /// Minimum value of the column.
pub min_value: u64,
+ /// Maximum value of the column.
pub max_value: u64,
+ /// Number of rows in the column.
pub num_rows: RowId,
}
- impl Stats {
+ impl ColumnStats {
+ /// Amplitude of the values:
+ /// the difference between the maximum and the minimum value.
pub fn amplitude(&self) -> u64 {
self.max_value - self.min_value
}
}
- impl BinarySerializable for Stats {
+ impl BinarySerializable for ColumnStats {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
VInt(self.min_value).serialize(writer)?;
VInt(self.gcd.get()).serialize(writer)?;
@@ -37,7 +44,7 @@ impl BinarySerializable for Stats {
let amplitude = VInt::deserialize(reader)?.0 * gcd.get();
let max_value = min_value + amplitude;
let num_rows = VInt::deserialize(reader)?.0 as RowId;
- Ok(Stats {
+ Ok(ColumnStats {
min_value,
max_value,
num_rows,
@@ -52,21 +59,21 @@ mod tests {
use common::BinarySerializable;
- use crate::column_values::Stats;
+ use crate::column_values::ColumnStats;
#[track_caller]
- fn test_stats_ser_deser_aux(stats: &Stats, num_bytes: usize) {
+ fn test_stats_ser_deser_aux(stats: &ColumnStats, num_bytes: usize) {
let mut buffer: Vec<u8> = Vec::new();
stats.serialize(&mut buffer).unwrap();
assert_eq!(buffer.len(), num_bytes);
- let deser_stats = Stats::deserialize(&mut &buffer[..]).unwrap();
+ let deser_stats = ColumnStats::deserialize(&mut &buffer[..]).unwrap();
assert_eq!(stats, &deser_stats);
}
#[test]
fn test_stats_serialization() {
test_stats_ser_deser_aux(
- &(Stats {
+ &(ColumnStats {
gcd: NonZeroU64::new(3).unwrap(),
min_value: 1,
max_value: 3001,
@@ -75,7 +82,7 @@ mod tests {
5,
);
test_stats_ser_deser_aux(
- &(Stats {
+ &(ColumnStats {
gcd: NonZeroU64::new(1_000).unwrap(),
min_value: 1,
max_value: 3001,
@@ -84,7 +91,7 @@ mod tests {
5,
);
test_stats_ser_deser_aux(
- &(Stats {
+ &(ColumnStats {
gcd: NonZeroU64::new(1).unwrap(),
min_value: 0,
max_value: 0,

View File

@@ -4,7 +4,7 @@ use common::{BinarySerializable, OwnedBytes};
use fastdivide::DividerU64;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
- use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, Stats};
+ use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, ColumnStats};
use crate::{ColumnValues, RowId};
/// Depending on the field type, a different
@@ -13,7 +13,7 @@ use crate::{ColumnValues, RowId};
pub struct BitpackedReader {
data: OwnedBytes,
bit_unpacker: BitUnpacker,
- stats: Stats,
+ stats: ColumnStats,
}
impl ColumnValues for BitpackedReader {
@@ -36,7 +36,7 @@ impl ColumnValues for BitpackedReader {
}
}
- fn num_bits(stats: &Stats) -> u8 {
+ fn num_bits(stats: &ColumnStats) -> u8 {
compute_num_bits(stats.amplitude() / stats.gcd)
}
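Note: `num_bits` works on the normalized amplitude. A worked example consistent with the stats tests further down: for the column [10, 20, 30], min_value = 10 and gcd = 10, so amplitude() = 20 and amplitude / gcd = 2; the normalized values are 0, 1, 2 and fit in 2 bits each.

    // hypothetical check mirroring num_bits() for the column [10, 20, 30]
    fn amplitude_example() {
        let (min_value, max_value, gcd) = (10u64, 30u64, 10u64);
        let amplitude = max_value - min_value; // 20
        assert_eq!(amplitude / gcd, 2); // values normalize to 0, 1, 2
        // compute_num_bits(2) == 2: two bits per value suffice
    }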
@@ -46,14 +46,14 @@ pub struct BitpackedCodecEstimator;
impl ColumnCodecEstimator for BitpackedCodecEstimator {
fn collect(&mut self, _value: u64) {}
- fn estimate(&self, stats: &Stats) -> Option<u64> {
+ fn estimate(&self, stats: &ColumnStats) -> Option<u64> {
let num_bits_per_value = num_bits(stats);
Some(stats.num_bytes() + (stats.num_rows as u64 * (num_bits_per_value as u64) + 7) / 8)
}
fn serialize(
&self,
- stats: &Stats,
+ stats: &ColumnStats,
vals: &mut dyn Iterator<Item = u64>,
wrt: &mut dyn Write,
) -> io::Result<()> {
@@ -72,12 +72,12 @@ impl ColumnCodecEstimator for BitpackedCodecEstimator {
pub struct BitpackedCodec;
impl ColumnCodec for BitpackedCodec {
- type Reader = BitpackedReader;
+ type ColumnValues = BitpackedReader;
type Estimator = BitpackedCodecEstimator;
/// Opens a fast field given a file.
- fn load(mut data: OwnedBytes) -> io::Result<Self::Reader> {
- let stats = Stats::deserialize(&mut data)?;
+ fn load(mut data: OwnedBytes) -> io::Result<Self::ColumnValues> {
+ let stats = ColumnStats::deserialize(&mut data)?;
let num_bits = num_bits(&stats);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(BitpackedReader {

View File

@@ -7,7 +7,7 @@ use fastdivide::DividerU64;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
use crate::column_values::u64_based::line::Line;
- use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, Stats};
+ use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, ColumnStats};
use crate::column_values::{ColumnValues, VecColumn};
use crate::MonotonicallyMappableToU64;
@@ -84,7 +84,7 @@ impl ColumnCodecEstimator for BlockwiseLinearEstimator {
self.block.clear();
}
}
- fn estimate(&self, stats: &Stats) -> Option<u64> {
+ fn estimate(&self, stats: &ColumnStats) -> Option<u64> {
let mut estimate = 4 + stats.num_bytes() + self.meta_num_bytes + self.values_num_bytes;
if stats.gcd.get() > 1 {
let estimate_gain_from_gcd =
@@ -100,7 +100,7 @@ impl ColumnCodecEstimator for BlockwiseLinearEstimator {
fn serialize(
&self,
- stats: &Stats,
+ stats: &ColumnStats,
mut vals: &mut dyn Iterator<Item = u64>,
wrt: &mut dyn Write,
) -> io::Result<()> {
@@ -165,12 +165,12 @@ impl ColumnCodecEstimator for BlockwiseLinearEstimator {
pub struct BlockwiseLinearCodec;
impl ColumnCodec<u64> for BlockwiseLinearCodec {
- type Reader = BlockwiseLinearReader;
+ type ColumnValues = BlockwiseLinearReader;
type Estimator = BlockwiseLinearEstimator;
- fn load(mut bytes: OwnedBytes) -> io::Result<Self::Reader> {
- let stats = Stats::deserialize(&mut bytes)?;
+ fn load(mut bytes: OwnedBytes) -> io::Result<Self::ColumnValues> {
+ let stats = ColumnStats::deserialize(&mut bytes)?;
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
let footer_offset = bytes.len() - 4 - footer_len as usize;
let (data, mut footer) = bytes.split(footer_offset);
@@ -195,7 +195,7 @@ impl ColumnCodec<u64> for BlockwiseLinearCodec {
pub struct BlockwiseLinearReader {
blocks: Arc<[Block]>,
data: OwnedBytes,
- stats: Stats,
+ stats: ColumnStats,
}
impl ColumnValues for BlockwiseLinearReader {

View File

@@ -5,7 +5,7 @@ use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
use super::line::Line;
use super::ColumnValues;
- use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, Stats};
+ use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, ColumnStats};
use crate::column_values::VecColumn;
use crate::RowId;
@@ -18,7 +18,7 @@ const LINE_ESTIMATION_BLOCK_LEN: usize = 512;
pub struct LinearReader {
data: OwnedBytes,
linear_params: LinearParams,
- stats: Stats,
+ stats: ColumnStats,
}
impl ColumnValues for LinearReader {
@@ -106,7 +106,7 @@ impl ColumnCodecEstimator for LinearCodecEstimator {
}
}
- fn estimate(&self, stats: &Stats) -> Option<u64> {
+ fn estimate(&self, stats: &ColumnStats) -> Option<u64> {
let line = self.line?;
let amplitude = self.max_deviation - self.min_deviation;
let num_bits = compute_num_bits(amplitude);
@@ -123,7 +123,7 @@ impl ColumnCodecEstimator for LinearCodecEstimator {
fn serialize(
&self,
- stats: &Stats,
+ stats: &ColumnStats,
vals: &mut dyn Iterator<Item = u64>,
wrt: &mut dyn io::Write,
) -> io::Result<()> {
@@ -184,12 +184,12 @@ impl LinearCodecEstimator {
}
impl ColumnCodec for LinearCodec {
- type Reader = LinearReader;
+ type ColumnValues = LinearReader;
type Estimator = LinearCodecEstimator;
- fn load(mut data: OwnedBytes) -> io::Result<Self::Reader> {
- let stats = Stats::deserialize(&mut data)?;
+ fn load(mut data: OwnedBytes) -> io::Result<Self::ColumnValues> {
+ let stats = ColumnStats::deserialize(&mut data)?;
let linear_params = LinearParams::deserialize(&mut data)?;
Ok(LinearReader {
stats,

View File

@@ -17,31 +17,57 @@ pub use crate::column_values::u64_based::bitpacked::BitpackedCodec;
pub use crate::column_values::u64_based::blockwise_linear::BlockwiseLinearCodec;
pub use crate::column_values::u64_based::linear::LinearCodec;
pub use crate::column_values::u64_based::stats_collector::StatsCollector;
- use crate::column_values::{monotonic_map_column, Stats};
+ use crate::column_values::{monotonic_map_column, ColumnStats};
use crate::iterable::Iterable;
use crate::{ColumnValues, MonotonicallyMappableToU64};
+ /// A `ColumnCodecEstimator` is in charge of gathering all
+ /// data required to serialize a column.
+ ///
+ /// This happens during a first pass over the column's elements.
+ /// During that pass, all column estimators receive a call to their
+ /// `.collect(el)`.
+ ///
+ /// After this first pass, `finalize` is called.
+ /// `.estimate(..)` should then return an accurate estimation of the
+ /// size of the serialized column (were we to pick this codec).
+ /// `.serialize(..)` then serializes the column using this codec.
pub trait ColumnCodecEstimator<T = u64>: 'static {
+ /// Records a new value for estimation.
+ /// This method will be called for each element of the column during
+ /// estimation.
fn collect(&mut self, value: u64);
- fn estimate(&self, stats: &Stats) -> Option<u64>;
+ /// Finalizes the first pass phase.
+ fn finalize(&mut self) {}
+ /// Returns an accurate estimation of the number of bytes that will
+ /// be used to represent this column.
+ fn estimate(&self, stats: &ColumnStats) -> Option<u64>;
+ /// Serializes the column using the given codec.
+ /// This constitutes a second pass over the column's values.
fn serialize(
&self,
- stats: &Stats,
+ stats: &ColumnStats,
vals: &mut dyn Iterator<Item = T>,
wrt: &mut dyn io::Write,
) -> io::Result<()>;
}
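Note: the new doc comments describe a two-pass protocol. A minimal sketch of how a caller might drive an estimator under these docs (the function and its bindings are hypothetical; in this crate the real driver is `serialize_u64_based_column_values`):

    use std::io;

    fn pick_and_serialize(
        column_values: &[u64],
        stats: &ColumnStats,
        wrt: &mut dyn io::Write,
    ) -> io::Result<()> {
        // First pass: feed every value to the estimator, then finalize.
        let mut estimator = BitpackedCodec::estimator();
        for &val in column_values {
            estimator.collect(val);
        }
        estimator.finalize();

        // Keep this codec only if it can produce a size estimate.
        if estimator.estimate(stats).is_some() {
            // Second pass: serialize the column with the chosen codec.
            estimator.serialize(stats, &mut column_values.iter().copied(), wrt)?;
        }
        Ok(())
    }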
+ /// A column codec describes a column serialization format.
pub trait ColumnCodec<T: PartialOrd = u64> {
- type Reader: ColumnValues<T> + 'static;
+ /// Specialized `ColumnValues` type.
+ type ColumnValues: ColumnValues<T> + 'static;
+ /// `Estimator` for the given codec.
type Estimator: ColumnCodecEstimator + Default;
- fn load(bytes: OwnedBytes) -> io::Result<Self::Reader>;
+ /// Loads a column that has been serialized using this codec.
+ fn load(bytes: OwnedBytes) -> io::Result<Self::ColumnValues>;
+ /// Returns an estimator.
fn estimator() -> Self::Estimator {
Self::Estimator::default()
}
+ /// Returns a boxed estimator.
fn boxed_estimator() -> Box<dyn ColumnCodecEstimator> {
Box::new(Self::estimator())
}
@@ -62,6 +88,7 @@ pub enum CodecType {
BlockwiseLinear = 2u8,
}
+ /// List of all available u64-based codecs.
pub const ALL_U64_CODEC_TYPES: [CodecType; 3] = [
CodecType::Bitpacked,
CodecType::Linear,
@@ -106,6 +133,7 @@ fn load_specific_codec<C: ColumnCodec, T: MonotonicallyMappableToU64>(
}
impl CodecType {
+ /// Returns a boxed codec estimator associated with a given `CodecType`.
pub fn estimator(&self) -> Box<dyn ColumnCodecEstimator> {
match self {
CodecType::Bitpacked => BitpackedCodec::boxed_estimator(),
@@ -115,7 +143,8 @@ impl CodecType {
}
}
- pub fn serialize_u64_based_column_values<'a, T: MonotonicallyMappableToU64>(
+ /// Serializes a given column of u64-mapped values.
+ pub fn serialize_u64_based_column_values<T: MonotonicallyMappableToU64>(
vals: &dyn Iterable<T>,
codec_types: &[CodecType],
wrt: &mut dyn Write,
@@ -156,11 +185,14 @@ pub fn serialize_u64_based_column_values<'a, T: MonotonicallyMappableToU64>(
Ok(())
}
+ /// Load u64-based column values.
+ ///
+ /// This method first identifies the codec from the first byte.
pub fn load_u64_based_column_values<T: MonotonicallyMappableToU64>(
mut bytes: OwnedBytes,
) -> io::Result<Arc<dyn ColumnValues<T>>> {
let codec_type: CodecType = bytes
- .get(0)
+ .first()
.copied()
.and_then(CodecType::try_from_code)
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Failed to read codec type"))?;

View File

@@ -2,7 +2,7 @@ use std::num::NonZeroU64;
use fastdivide::DividerU64;
- use crate::column_values::Stats;
+ use crate::column_values::ColumnStats;
use crate::RowId;
/// Computes the gcd of two non-null numbers.
@@ -33,14 +33,14 @@ pub struct StatsCollector {
}
impl StatsCollector {
- pub fn stats(&self) -> Stats {
+ pub fn stats(&self) -> ColumnStats {
let (min_value, max_value) = self.min_max_opt.unwrap_or((0u64, 0u64));
let increment_gcd = if let Some((increment_gcd, _)) = self.increment_gcd_opt {
increment_gcd
} else {
NonZeroU64::new(1u64).unwrap()
};
- Stats {
+ ColumnStats {
min_value,
max_value,
num_rows: self.num_rows,
@@ -97,9 +97,9 @@ mod tests {
use std::num::NonZeroU64;
use crate::column_values::u64_based::stats_collector::{compute_gcd, StatsCollector};
- use crate::column_values::u64_based::Stats;
+ use crate::column_values::u64_based::ColumnStats;
- fn compute_stats(vals: impl Iterator<Item = u64>) -> Stats {
+ fn compute_stats(vals: impl Iterator<Item = u64>) -> ColumnStats {
let mut stats_collector = StatsCollector::default();
for val in vals {
stats_collector.collect(val);
@@ -144,7 +144,7 @@ mod tests {
fn test_stats() {
assert_eq!(
compute_stats([].into_iter()),
- Stats {
+ ColumnStats {
gcd: NonZeroU64::new(1).unwrap(),
min_value: 0,
max_value: 0,
@@ -153,7 +153,7 @@ mod tests {
);
assert_eq!(
compute_stats([0, 1].into_iter()),
- Stats {
+ ColumnStats {
gcd: NonZeroU64::new(1).unwrap(),
min_value: 0,
max_value: 1,
@@ -162,7 +162,7 @@ mod tests {
);
assert_eq!(
compute_stats([0, 1].into_iter()),
- Stats {
+ ColumnStats {
gcd: NonZeroU64::new(1).unwrap(),
min_value: 0,
max_value: 1,
@@ -171,7 +171,7 @@ mod tests {
);
assert_eq!(
compute_stats([10, 20, 30].into_iter()),
- Stats {
+ ColumnStats {
gcd: NonZeroU64::new(10).unwrap(),
min_value: 10,
max_value: 30,
@@ -180,7 +180,7 @@ mod tests {
);
assert_eq!(
compute_stats([10, 50, 10, 30].into_iter()),
- Stats {
+ ColumnStats {
gcd: NonZeroU64::new(20).unwrap(),
min_value: 10,
max_value: 50,
@@ -189,7 +189,7 @@ mod tests {
);
assert_eq!(
compute_stats([10, 0, 30].into_iter()),
- Stats {
+ ColumnStats {
gcd: NonZeroU64::new(10).unwrap(),
min_value: 0,
max_value: 30,

View File

@@ -4,7 +4,7 @@ pub const VERSION_FOOTER_NUM_BYTES: usize = MAGIC_BYTES.len() + std::mem::size_o
/// We end the file with these 4 bytes just to somewhat identify that
/// this is indeed a columnar file.
- const MAGIC_BYTES: [u8; 4] = [2, 113, 119, 066];
+ const MAGIC_BYTES: [u8; 4] = [2, 113, 119, 66];
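Note: clippy's zero_prefixed_literal lint fires on `066` because, unlike C, a leading zero in Rust does not mean octal: `066` is simply decimal 66 (octal would be written `0o66`), so the zero is misleading and gets dropped.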
pub fn footer() -> [u8; VERSION_FOOTER_NUM_BYTES] {
let mut footer_bytes = [0u8; VERSION_FOOTER_NUM_BYTES];
@@ -27,8 +27,8 @@ pub enum Version {
}
impl Version {
- fn to_bytes(&self) -> [u8; 4] {
- (*self as u32).to_le_bytes()
+ fn to_bytes(self) -> [u8; 4] {
+ (self as u32).to_le_bytes()
}
fn try_from_bytes(bytes: [u8; 4]) -> Result<Version, InvalidData> {

View File

@@ -58,7 +58,7 @@ impl<'a> RemappedTermOrdinalsValues<'a> {
.enumerate()
.flat_map(|(segment_ord, byte_column)| {
let segment_ord = self.term_ord_mapping.get_segment(segment_ord as u32);
- byte_column.into_iter().flat_map(move |bytes_column| {
+ byte_column.iter().flat_map(move |bytes_column| {
bytes_column
.ords()
.values

View File

@@ -174,6 +174,7 @@ fn merge_column(
Ok(())
}
+ #[allow(clippy::type_complexity)]
fn group_columns_for_merge(
columnar_readers: &[&ColumnarReader],
) -> io::Result<BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>>> {

View File

@@ -162,7 +162,7 @@ mod tests {
}
#[test]
#[should_panic(expect = "Input type forbidden")]
#[should_panic(expected = "Input type forbidden")]
fn test_list_columns_strict_typing_panics_on_wrong_types() {
let mut columnar_writer = ColumnarWriter::default();
columnar_writer.record_column_type("count", ColumnType::U64, false);
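Note: the attribute key must be `expected`; with the misspelled `expect`, the test harness ignores the argument (with a warning) and accepts any panic, so the test could pass for the wrong reason. A minimal sketch of the corrected form:

    #[test]
    #[should_panic(expected = "Input type forbidden")]
    fn rejects_wrong_type() {
        // the harness now checks that the panic message
        // contains the `expected` string
        panic!("Input type forbidden");
    }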

View File

@@ -47,6 +47,7 @@ struct SpareBuffers {
/// let mut wrt: Vec<u8> = Vec::new();
/// columnar_writer.serialize(2u32, None, &mut wrt).unwrap();
/// ```
+ #[derive(Default)]
pub struct ColumnarWriter {
numerical_field_hash_map: ArenaHashMap,
datetime_field_hash_map: ArenaHashMap,
@@ -60,22 +61,6 @@ pub struct ColumnarWriter {
buffers: SpareBuffers,
}
- impl Default for ColumnarWriter {
- fn default() -> Self {
- ColumnarWriter {
- numerical_field_hash_map: ArenaHashMap::new(),
- bool_field_hash_map: ArenaHashMap::new(),
- ip_addr_field_hash_map: ArenaHashMap::new(),
- bytes_field_hash_map: ArenaHashMap::new(),
- str_field_hash_map: ArenaHashMap::new(),
- datetime_field_hash_map: ArenaHashMap::new(),
- dictionaries: Vec::new(),
- arena: MemoryArena::default(),
- buffers: SpareBuffers::default(),
- }
- }
- }
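Note: clippy's derivable_impls lint: a hand-written `Default` impl that merely defaults every field can be replaced by `#[derive(Default)]`. This works together with the last hunk of this compare, which turns `ArenaHashMap::new` into an `impl Default for ArenaHashMap`. A minimal sketch:

    use std::collections::HashMap;

    // flagged: impl Default for Writer { fn default() -> Self {
    //     Writer { map: HashMap::new(), buf: Vec::new() } } }

    // preferred: derive it, since every field type implements Default
    #[derive(Default)]
    struct Writer {
        map: HashMap<String, u64>,
        buf: Vec<u8>,
    }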
#[inline]
fn mutate_or_create_column<V, TMutator>(
arena_hash_map: &mut ArenaHashMap,
@@ -671,7 +656,7 @@ where
Ok(())
}
- fn sort_values_within_row_in_place(multivalued_index: &[RowId], values: &mut Vec<u64>) {
+ fn sort_values_within_row_in_place(multivalued_index: &[RowId], values: &mut [u64]) {
let mut start_index: usize = 0;
for end_index in multivalued_index.iter().copied() {
let end_index = end_index as usize;

View File

@@ -29,7 +29,7 @@ pub struct OptionalIndexBuilder {
}
impl OptionalIndexBuilder {
- pub fn finish<'a>(&'a mut self, num_rows: RowId) -> impl Iterable<RowId> + 'a {
+ pub fn finish(&mut self, num_rows: RowId) -> impl Iterable<RowId> + '_ {
debug_assert!(self
.docs
.last()

View File

@@ -166,9 +166,9 @@ impl StrictlyMonotonicFn<i64, u64> for MapI64ToU64 {
macro_rules! static_dynamic_conversions {
($typ:ty, $enum_name:ident) => {
- impl Into<Option<$typ>> for DynamicColumn {
- fn into(self) -> Option<$typ> {
- if let DynamicColumn::$enum_name(col) = self {
+ impl From<DynamicColumn> for Option<$typ> {
+ fn from(dynamic_column: DynamicColumn) -> Option<$typ> {
+ if let DynamicColumn::$enum_name(col) = dynamic_column {
Some(col)
} else {
None
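Note: clippy's from_over_into lint: implementing `From` also gives you the matching `Into` through the standard library's blanket impl, whereas a bare `Into` impl does not provide `From`. A minimal sketch:

    enum Dynamic {
        U64(u64),
    }

    // implementing `From` ...
    impl From<Dynamic> for Option<u64> {
        fn from(dynamic: Dynamic) -> Option<u64> {
            match dynamic {
                Dynamic::U64(val) => Some(val),
            }
        }
    }

    fn demo(dynamic: Dynamic) -> Option<u64> {
        // ... provides `Into` automatically via the blanket impl
        dynamic.into()
    }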

View File

@@ -66,7 +66,7 @@ impl BucketAggregationWithAccessor {
BucketAggregationType::Terms(TermsAggregation {
field: field_name, ..
}) => {
- str_dict_column = reader.fast_fields().str(&field_name)?;
+ str_dict_column = reader.fast_fields().str(field_name)?;
get_ff_reader_and_validate(reader, field_name)?
}
};

View File

@@ -74,9 +74,9 @@ use crate::{DocId, TantivyError};
/// ...
/// "aggregations": {
/// "genres": {
/// "doc_count_error_upper_bound": 0,
/// "sum_other_doc_count": 0,
/// "buckets": [
/// "doc_count_error_upper_bound": 0,
/// "sum_other_doc_count": 0,
/// "buckets": [
/// { "key": "drumnbass", "doc_count": 6 },
/// { "key": "raggae", "doc_count": 4 },
/// { "key": "jazz", "doc_count": 2 }
@@ -241,15 +241,6 @@ impl TermBucketEntry {
}
impl TermBuckets {
- pub(crate) fn from_req_and_validate(
- _sub_aggregation: &AggregationsWithAccessor,
- _max_term_id: usize,
- ) -> crate::Result<Self> {
- Ok(TermBuckets {
- entries: Default::default(),
- })
- }
fn force_flush(&mut self, agg_with_accessor: &AggregationsWithAccessor) -> crate::Result<()> {
for entry in &mut self.entries.values_mut() {
if let Some(sub_aggregations) = entry.sub_aggregations.as_mut() {

View File

@@ -196,6 +196,7 @@ impl MmapDirectory {
directory_path,
)));
}
+ #[allow(clippy::bind_instead_of_map)]
let canonical_path: PathBuf = directory_path.canonicalize().or_else(|io_err| {
let directory_path = directory_path.to_owned();

View File

@@ -49,11 +49,6 @@ impl AliveBitSet {
Self::open(alive_bitset_bytes)
}
- pub(crate) fn from_bitset(bitset: &BitSet) -> AliveBitSet {
- let readonly_bitset = ReadOnlyBitSet::from(bitset);
- AliveBitSet::from(readonly_bitset)
- }
/// Opens an alive bitset given its file.
pub fn open(bytes: OwnedBytes) -> AliveBitSet {
let bitset = ReadOnlyBitSet::open(bytes);

View File

@@ -54,6 +54,7 @@ impl FacetReader {
self.facet_column.ords().values(doc)
}
+ /// Accessor to the facet dictionary.
pub fn facet_dict(&self) -> &columnar::Dictionary {
self.facet_column.dictionary()
}

View File

@@ -156,8 +156,7 @@ impl FastFieldReaders {
.columnar
.read_columns(field_name)?
.into_iter()
- .filter(|column| column.column_type() == column_type)
- .next();
+ .find(|column| column.column_type() == column_type);
Ok(dynamic_column_handle_opt)
}
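Note: clippy's filter_next lint: `iter.filter(predicate).next()` is equivalent to `iter.find(predicate)`, which is shorter and makes the stop-at-first-match intent explicit. A minimal sketch:

    fn first_even(nums: &[u32]) -> Option<u32> {
        // instead of: nums.iter().copied().filter(|n| n % 2 == 0).next()
        nums.iter().copied().find(|n| n % 2 == 0)
    }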

View File

@@ -126,7 +126,7 @@ impl FastFieldsWriter {
self.columnar_writer.record_datetime(
doc_id,
field_name.as_str(),
- truncated_datetime.into(),
+ truncated_datetime,
);
}
Value::Facet(facet) => {

View File

@@ -110,8 +110,8 @@ impl DeltaComputer {
}
}
- fn convert_to_merge_order<'a>(
- columnars: &[&'a ColumnarReader],
+ fn convert_to_merge_order(
+ columnars: &[&ColumnarReader],
doc_id_mapping: SegmentDocIdMapping,
) -> MergeRowOrder {
match doc_id_mapping.mapping_type() {
@@ -369,11 +369,8 @@ impl IndexMerger {
.readers
.iter()
.map(|segment_reader| {
- if let Some(alive_bitset) = segment_reader.alive_bitset() {
- Some(alive_bitset.bitset().clone())
- } else {
- None
- }
+ let alive_bitset = segment_reader.alive_bitset()?;
+ Some(alive_bitset.bitset().clone())
})
.collect();
Ok(SegmentDocIdMapping::new(
@@ -416,11 +413,8 @@ impl IndexMerger {
.readers
.iter()
.map(|reader| {
- if let Some(bitset) = reader.alive_bitset() {
- Some(bitset.bitset().clone())
- } else {
- None
- }
+ let alive_bitset = reader.alive_bitset()?;
+ Some(alive_bitset.bitset().clone())
})
.collect();
Ok(SegmentDocIdMapping::new(
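Note: both hunks above replace an `if let Some(..) { Some(..) } else { None }` with the `?` operator, which can short-circuit inside a closure whose return type is `Option` (clippy's manual_map family of lints points here). A minimal sketch:

    fn clone_bitsets(readers: &[Option<Vec<u8>>]) -> Vec<Option<Vec<u8>>> {
        readers
            .iter()
            .map(|reader| {
                // `?` returns None from the closure early
                let bitset = reader.as_ref()?;
                Some(bitset.clone())
            })
            .collect()
    }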

View File

@@ -334,7 +334,7 @@ impl SegmentWriter {
/// As a user, you should rather use `IndexWriter`'s add_document.
pub fn add_document(&mut self, add_operation: AddOperation) -> crate::Result<()> {
let AddOperation { document, opstamp } = add_operation;
- self.doc_opstamps.push(add_operation.opstamp);
+ self.doc_opstamps.push(opstamp);
self.fast_field_writers.add_document(&document)?;
self.index_document(&document)?;
let doc_writer = self.segment_serializer.get_store_writer();

View File

@@ -27,8 +27,10 @@ pub struct FacetTokenStream<'a> {
impl Tokenizer for FacetTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
- let mut token = Token::default();
- token.position = 0;
+ let token = Token {
+ position: 0,
+ ..Default::default()
+ };
FacetTokenStream {
text,
state: State::RootFacetNotEmitted, //< pos is the first char that has not been processed yet.
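Note: clippy's field_reassign_with_default lint prefers struct-update syntax over creating a default value and immediately reassigning a field; it also lets the binding stay immutable. A minimal sketch:

    #[derive(Default)]
    struct Token {
        position: usize,
        text: String,
    }

    fn make_token() -> Token {
        // instead of: let mut token = Token::default(); token.position = 0;
        Token {
            position: 0,
            ..Default::default()
        }
    }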

View File

@@ -48,8 +48,7 @@ impl Dictionary<VoidSSTable> {
dictionary_writer.insert(term, &()).unwrap();
}
dictionary_writer.finish().unwrap();
- let dictionary = Dictionary::from_bytes(OwnedBytes::new(buffer)).unwrap();
- dictionary
+ Dictionary::from_bytes(OwnedBytes::new(buffer)).unwrap()
}
}

View File

@@ -103,8 +103,8 @@ fn compute_previous_power_of_two(n: usize) -> usize {
1 << msb
}
- impl ArenaHashMap {
- pub fn new() -> ArenaHashMap {
+ impl Default for ArenaHashMap {
+ fn default() -> Self {
let memory_arena = MemoryArena::default();
ArenaHashMap {
table: Box::new([]),
@@ -114,7 +114,9 @@ impl ArenaHashMap {
len: 0,
}
}
+ }
+ impl ArenaHashMap {
pub fn with_capacity(table_size: usize) -> ArenaHashMap {
let table_size_power_of_2 = compute_previous_power_of_two(table_size);
let memory_arena = MemoryArena::default();
@@ -298,7 +300,7 @@ mod tests {
#[test]
fn test_hash_map() {
- let mut hash_map: ArenaHashMap = ArenaHashMap::new();
+ let mut hash_map: ArenaHashMap = ArenaHashMap::default();
hash_map.mutate_or_create(b"abc", |opt_val: Option<u32>| {
assert_eq!(opt_val, None);
3u32