Merge pull request #1493 from quickwit-oss/remove_vec_impl

remove Column impl on Vec
2026-01-07 17:42:55 +00:00 · 2022-08-29 07:54:33 -07:00
parent 5d436759b0 7a26cc9022
commit f740ddeee3
3 changed files with 45 additions and 30 deletions
--- a/fastfield_codecs/src/lib.rs
+++ b/fastfield_codecs/src/lib.rs
@@ -90,45 +90,32 @@ pub struct FastFieldStats {
    pub num_vals: u64,
 }

-impl<'a> Column for &'a [u64] {
+struct VecColum<'a>(&'a [u64]);
+impl<'a> Column for VecColum<'a> {
    fn get_val(&self, position: u64) -> u64 {
-        self[position as usize]
+        self.0[position as usize]
    }

    fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
-        Box::new((self as &[u64]).iter().cloned())
+        Box::new(self.0.iter().cloned())
    }

    fn min_value(&self) -> u64 {
-        self.iter().min().unwrap_or(0)
+        self.0.iter().min().cloned().unwrap_or(0)
    }

    fn max_value(&self) -> u64 {
-        self.iter().max().unwrap_or(0)
+        self.0.iter().max().cloned().unwrap_or(0)
    }

    fn num_vals(&self) -> u64 {
-        self.len() as u64
+        self.0.len() as u64
    }
 }

-impl Column for Vec<u64> {
-    fn get_val(&self, position: u64) -> u64 {
-        self[position as usize]
-    }
-    fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
-        Box::new((self as &[u64]).iter().cloned())
-    }
-    fn min_value(&self) -> u64 {
-        self.iter().min().unwrap_or(0)
-    }
-
-    fn max_value(&self) -> u64 {
-        self.iter().max().unwrap_or(0)
-    }
-
-    fn num_vals(&self) -> u64 {
-        self.len() as u64
+impl<'a> From<&'a [u64]> for VecColum<'a> {
+    fn from(data: &'a [u64]) -> Self {
+        Self(data)
    }
 }

@@ -145,10 +132,10 @@ mod tests {
        data: &[u64],
        name: &str,
    ) -> Option<(f32, f32)> {
-        let estimation = Codec::estimate(&data)?;
+        let estimation = Codec::estimate(&VecColum::from(data))?;

        let mut out: Vec<u8> = Vec::new();
-        Codec::serialize(&mut out, &data).unwrap();
+        Codec::serialize(&mut out, &VecColum::from(data)).unwrap();

        let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);

@@ -234,6 +221,7 @@ mod tests {
    #[test]
    fn estimation_good_interpolation_case() {
        let data = (10..=20000_u64).collect::<Vec<_>>();
+        let data: VecColum = data.as_slice().into();

        let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
        assert_le!(linear_interpol_estimation, 0.01);
@@ -247,8 +235,9 @@ mod tests {
    }
    #[test]
    fn estimation_test_bad_interpolation_case() {
-        let data = vec![200, 10, 10, 10, 10, 1000, 20];
+        let data: &[u64] = &[200, 10, 10, 10, 10, 1000, 20];

+        let data: VecColum = data.into();
        let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
        assert_le!(linear_interpol_estimation, 0.32);

@@ -259,6 +248,7 @@ mod tests {
    fn estimation_test_bad_interpolation_case_monotonically_increasing() {
        let mut data: Vec<u64> = (200..=20000_u64).collect();
        data.push(1_000_000);
+        let data: VecColum = data.as_slice().into();

        // in this case the linear interpolation can't in fact be worse than bitpacking,
        // but the estimator adds some threshold, which leads to a worse estimated behavior
--- a/fastfield_codecs/src/main.rs
+++ b/fastfield_codecs/src/main.rs
@@ -3,9 +3,33 @@ extern crate prettytable;
 use fastfield_codecs::bitpacked::BitpackedCodec;
 use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
 use fastfield_codecs::linear::LinearCodec;
-use fastfield_codecs::{FastFieldCodec, FastFieldCodecType, FastFieldStats};
+use fastfield_codecs::{Column, FastFieldCodec, FastFieldCodecType, FastFieldStats};
 use prettytable::{Cell, Row, Table};

+struct Data<'a>(&'a [u64]);
+
+impl<'a> Column for Data<'a> {
+    fn get_val(&self, position: u64) -> u64 {
+        self.0[position as usize]
+    }
+
+    fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
+        Box::new(self.0.iter().cloned())
+    }
+
+    fn min_value(&self) -> u64 {
+        *self.0.iter().min().unwrap_or(&0)
+    }
+
+    fn max_value(&self) -> u64 {
+        *self.0.iter().max().unwrap_or(&0)
+    }
+
+    fn num_vals(&self) -> u64 {
+        self.0.len() as u64
+    }
+}
+
 fn main() {
    let mut table = Table::new();

@@ -86,10 +110,11 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
 pub fn serialize_with_codec<C: FastFieldCodec>(
    data: &[u64],
 ) -> Option<(f32, f32, FastFieldCodecType)> {
+    let data = Data(data);
    let estimation = C::estimate(&data)?;
    let mut out = Vec::new();
    C::serialize(&mut out, &data).unwrap();
-    let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
+    let actual_compression = out.len() as f32 / (data.num_vals() * 8) as f32;
    Some((estimation, actual_compression, C::CODEC_TYPE))
 }

--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -133,7 +133,7 @@ impl TermOrdinalMapping {
    fn max_term_ord(&self) -> TermOrdinal {
        self.per_segment_new_term_ordinals
            .iter()
-            .flat_map(|term_ordinals| term_ordinals.iter().max())
+            .flat_map(|term_ordinals| term_ordinals.iter().max().cloned())
            .max()
            .unwrap_or_default()
    }
@@ -784,7 +784,7 @@ impl IndexMerger {
                let new_doc_id: DocId =
                    self.offsets
                        .iter()
-                        .position(|offset| offset > pos)
+                        .position(|&offset| offset > pos)
                        .expect("pos is out of bounds") as DocId
                        - 1u32;