Faster range (#1954)

* Faster range queries

This PR makes several changes:
- the IP compact space now uses u32
- the BitUnpacker now gets a get_batch function
- we push down range filtering, removing the GCD / shift work from the
  bitpacking codec's hot loop.
- we rely on an AVX2 routine to do the filtering.

* Apply suggestions from code review

* Apply suggestions from code review

* CR comments
Paul Masurel, 2023-03-27 14:56:32 +09:00, committed by GitHub
parent 2955e34452, commit 694a056255
10 changed files with 881 additions and 103 deletions
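For orientation, here is the push-down idea from the change list above as a minimal standalone sketch (filter_in_range is a hypothetical helper, not the crate's API): instead of decoding each bitpacked value x into min_value + gcd * x and comparing the decoded value against the query range, the query range itself is inverse-transformed once up front, so the hot loop compares raw bitpacked integers and can be handed to a branchless (e.g. AVX2) routine.

use std::ops::RangeInclusive;

// Hedged sketch: return the indices of packed values whose *decoded* value
// f(x) = min_value + gcd * x falls in `range`, without decoding anything.
// Assumes gcd > 0 and that `range` intersects [min_value, max_value].
fn filter_in_range(
    packed: &[u64],
    min_value: u64,
    gcd: u64,
    range: RangeInclusive<u64>,
) -> Vec<u32> {
    // Invert the linear transformation once, up front.
    let start = (range.start().saturating_sub(min_value) + gcd - 1) / gcd; // div_ceil
    let end = range.end().saturating_sub(min_value) / gcd;
    let mut out = Vec::new();
    for (idx, &x) in packed.iter().enumerate() {
        // Branch shown for clarity; the real codec does this comparison branchlessly.
        if start <= x && x <= end {
            out.push(idx as u32);
        }
    }
    out
}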

View File

@@ -94,7 +94,6 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync {
     /// Get the row ids of values which are in the provided value range.
     ///
     /// Note that position == docid for single value fast fields
-    #[inline(always)]
     fn get_row_ids_for_value_range(
         &self,
         value_range: RangeInclusive<T>,
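A hedged usage sketch for this trait method (matching_rows is illustrative, not part of the crate; it assumes some `column: &dyn ColumnValues<u64>` is at hand):

// Collect the row ids of all rows whose value lies in 10..=20, scanning the
// whole column. With this commit, implementations can answer this without
// materializing the decoded values.
fn matching_rows(column: &dyn ColumnValues<u64>) -> Vec<u32> {
    let mut row_ids = Vec::new();
    column.get_row_ids_for_value_range(10..=20, 0..column.num_vals(), &mut row_ids);
    row_ids
}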

View File

@@ -10,7 +10,7 @@ use super::{CompactSpace, RangeMapping};
 /// Put the blanks for the sorted values into a binary heap
 fn get_blanks(values_sorted: &BTreeSet<u128>) -> BinaryHeap<BlankRange> {
     let mut blanks: BinaryHeap<BlankRange> = BinaryHeap::new();
-    for (first, second) in values_sorted.iter().tuple_windows() {
+    for (first, second) in values_sorted.iter().copied().tuple_windows() {
         // Correctness Overflow: the values are deduped and sorted (BTreeSet property), that means
         // there's always space between two values.
         let blank_range = first + 1..=second - 1;
@@ -65,12 +65,12 @@ pub fn get_compact_space(
         return compact_space_builder.finish();
     }
-    let mut blanks: BinaryHeap<BlankRange> = get_blanks(values_deduped_sorted);
-    // Replace after stabilization of https://github.com/rust-lang/rust/issues/62924
-    let min_value = *values_deduped_sorted.iter().next().unwrap_or(&0);
-    let max_value = *values_deduped_sorted.iter().last().unwrap_or(&0);
+    // We start with a space that's limited to min_value..=max_value
+    // Replace after stabilization of https://github.com/rust-lang/rust/issues/62924
+    let min_value = values_deduped_sorted.iter().next().copied().unwrap_or(0);
+    let max_value = values_deduped_sorted.iter().last().copied().unwrap_or(0);
+    let mut blanks: BinaryHeap<BlankRange> = get_blanks(values_deduped_sorted);
     // +1 for null, in case min and max covers the whole space, we are off by one.
     let mut amplitude_compact_space = (max_value - min_value).saturating_add(1);
@@ -84,6 +84,7 @@ pub fn get_compact_space(
     let mut amplitude_bits: u8 = num_bits(amplitude_compact_space);
     let mut blank_collector = BlankCollector::new();
+    // We will stage blanks until they reduce the compact space by at least 1 bit and then flush
+    // them if the metadata cost is lower than the total number of saved bits.
     // Binary heap to process the gaps by their size
@@ -93,6 +94,7 @@ pub fn get_compact_space(
         let staged_spaces_sum: u128 = blank_collector.staged_blanks_sum();
         let amplitude_new_compact_space = amplitude_compact_space - staged_spaces_sum;
         let amplitude_new_bits = num_bits(amplitude_new_compact_space);
         if amplitude_bits == amplitude_new_bits {
             continue;
         }
@@ -100,7 +102,16 @@ pub fn get_compact_space(
         // TODO: Maybe calculate exact cost of blanks and run this more expensive computation only,
         // when amplitude_new_bits changes
         let cost = blank_collector.num_staged_blanks() * cost_per_blank;
-        if cost >= saved_bits {
+        // We want to end up with a compact space that fits into 32 bits.
+        // In order to deal with pathological cases, we force the algorithm to keep
+        // refining the compact space as long as the amplitude requires more than 32 bits.
+        //
+        // The worst case scenario happens for a large number of u128s regularly
+        // spread over the full u128 space.
+        //
+        // This change will force the algorithm to degenerate into dictionary encoding.
+        if amplitude_bits <= 32 && cost >= saved_bits {
             // Continue here, since although we walk over the blanks by size,
             // we can potentially save a lot at the last bits, which are smaller blanks
             //
@@ -115,6 +126,8 @@ pub fn get_compact_space(
         compact_space_builder.add_blanks(blank_collector.drain().map(|blank| blank.blank_range()));
     }
+    assert!(amplitude_bits <= 32);
     // special case, when we didn't collect any blanks because:
     // * the data is empty (early exit)
     // * the algorithm decided it's not worth the cost, which can be the case for single values
@@ -199,7 +212,7 @@ impl CompactSpaceBuilder {
             covered_space.push(0..=0); // empty data case
         };
-        let mut compact_start: u64 = 1; // 0 is reserved for `null`
+        let mut compact_start: u32 = 1; // 0 is reserved for `null`
         let mut ranges_mapping: Vec<RangeMapping> = Vec::with_capacity(covered_space.len());
         for cov in covered_space {
             let range_mapping = super::RangeMapping {
@@ -218,6 +231,7 @@ impl CompactSpaceBuilder {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::column_values::u128_based::compact_space::COST_PER_BLANK_IN_BITS;
     #[test]
     fn test_binary_heap_pop_order() {
@@ -228,4 +242,11 @@ mod tests {
         assert_eq!(blanks.pop().unwrap().blank_size(), 101);
         assert_eq!(blanks.pop().unwrap().blank_size(), 11);
     }
+
+    #[test]
+    fn test_worst_case_scenario() {
+        let vals: BTreeSet<u128> = (0..8).map(|i| i * ((1u128 << 34) / 8)).collect();
+        let compact_space = get_compact_space(&vals, vals.len() as u32, COST_PER_BLANK_IN_BITS);
+        assert!(compact_space.amplitude_compact_space() < u32::MAX as u128);
+    }
 }
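To make the blank arithmetic concrete, a worked example using the numbers from compact_space_test further down: the values {2, 4, 1000..=1005, 1008, 1010, 1012, 1260} span (1260 - 2) + 1 = 1259 raw values, plus 1 slot reserved for null, i.e. 1260. Collapsing the two large blanks 5..=999 (995 values) and 1013..=1259 (247 values) leaves 1260 - 995 - 247 = 18 compact values 0..=17, matching the asserted amplitude of 17: 5 bits per value instead of 11. The tiny blanks between 1005 and 1012 are kept, since each dropped blank costs roughly cost_per_blank (11) bits of metadata, which must be outweighed by the bits saved across all values.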

View File

@@ -42,15 +42,15 @@ pub struct CompactSpace {
 #[derive(Debug, Clone, Eq, PartialEq)]
 struct RangeMapping {
     value_range: RangeInclusive<u128>,
-    compact_start: u64,
+    compact_start: u32,
 }
 impl RangeMapping {
-    fn range_length(&self) -> u64 {
-        (self.value_range.end() - self.value_range.start()) as u64 + 1
+    fn range_length(&self) -> u32 {
+        (self.value_range.end() - self.value_range.start()) as u32 + 1
     }
     // The last value of the compact space in this range
-    fn compact_end(&self) -> u64 {
+    fn compact_end(&self) -> u32 {
         self.compact_start + self.range_length() - 1
     }
 }
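For instance, with the test data from compact_space_test (and assuming the collapsed blanks are 5..=999 and 1013..=1259, which matches the asserted amplitude of 17): the kept range 1000..=1012 gets compact_start = 4, so range_length() = 13 and compact_end() = 4 + 13 - 1 = 16; the following range 1260..=1260 then starts at compact 17.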
@@ -81,7 +81,7 @@ impl BinarySerializable for CompactSpace {
         let num_ranges = VInt::deserialize(reader)?.0;
         let mut ranges_mapping: Vec<RangeMapping> = vec![];
         let mut value = 0u128;
-        let mut compact_start = 1u64; // 0 is reserved for `null`
+        let mut compact_start = 1u32; // 0 is reserved for `null`
         for _ in 0..num_ranges {
             let blank_delta_start = VIntU128::deserialize(reader)?.0;
             value += blank_delta_start;
@@ -122,10 +122,10 @@ impl CompactSpace {
     /// Returns either Ok(the value in the compact space) or if it is outside the compact space the
     /// Err(position where it would be inserted)
-    fn u128_to_compact(&self, value: u128) -> Result<u64, usize> {
+    fn u128_to_compact(&self, value: u128) -> Result<u32, usize> {
         self.ranges_mapping
             .binary_search_by(|probe| {
-                let value_range = &probe.value_range;
+                let value_range: &RangeInclusive<u128> = &probe.value_range;
                 if value < *value_range.start() {
                     Ordering::Greater
                 } else if value > *value_range.end() {
@@ -136,13 +136,13 @@
             })
             .map(|pos| {
                 let range_mapping = &self.ranges_mapping[pos];
-                let pos_in_range = (value - range_mapping.value_range.start()) as u64;
+                let pos_in_range: u32 = (value - range_mapping.value_range.start()) as u32;
                 range_mapping.compact_start + pos_in_range
             })
     }
-    /// Unpacks a value from compact space u64 to u128 space
-    fn compact_to_u128(&self, compact: u64) -> u128 {
+    /// Unpacks a value from compact space u32 to u128 space
+    fn compact_to_u128(&self, compact: u32) -> u128 {
         let pos = self
             .ranges_mapping
             .binary_search_by_key(&compact, |range_mapping| range_mapping.compact_start)
@@ -178,11 +178,15 @@ impl CompactSpaceCompressor {
     /// Taking the vals as Vec may cost a lot of memory. It is used to sort the vals.
     pub fn train_from(iter: impl Iterator<Item = u128>) -> Self {
         let mut values_sorted = BTreeSet::new();
+        // Total number of values, with their redundancy.
         let mut total_num_values = 0u32;
         for val in iter {
             total_num_values += 1u32;
             values_sorted.insert(val);
         }
+        let min_value = *values_sorted.iter().next().unwrap_or(&0);
+        let max_value = *values_sorted.iter().last().unwrap_or(&0);
         let compact_space =
             get_compact_space(&values_sorted, total_num_values, COST_PER_BLANK_IN_BITS);
         let amplitude_compact_space = compact_space.amplitude_compact_space();
@@ -193,13 +197,12 @@
         );
         let num_bits = tantivy_bitpacker::compute_num_bits(amplitude_compact_space as u64);
-        let min_value = *values_sorted.iter().next().unwrap_or(&0);
-        let max_value = *values_sorted.iter().last().unwrap_or(&0);
         assert_eq!(
             compact_space
                 .u128_to_compact(max_value)
                 .expect("could not convert max value to compact space"),
-            amplitude_compact_space as u64
+            amplitude_compact_space as u32
         );
         CompactSpaceCompressor {
             params: IPCodecParams {
@@ -240,7 +243,7 @@
                     "Could not convert value to compact_space. This is a bug.",
                 )
             })?;
-            bitpacker.write(compact, self.params.num_bits, write)?;
+            bitpacker.write(compact as u64, self.params.num_bits, write)?;
         }
         bitpacker.close(write)?;
         self.write_footer(write)?;
@@ -314,48 +317,6 @@ impl ColumnValues<u128> for CompactSpaceDecompressor {
     #[inline]
     fn get_row_ids_for_value_range(
         &self,
         value_range: RangeInclusive<u128>,
         positions_range: Range<u32>,
         positions: &mut Vec<u32>,
     ) {
-        self.get_positions_for_value_range(value_range, positions_range, positions)
-    }
-}
-
-impl CompactSpaceDecompressor {
-    pub fn open(data: OwnedBytes) -> io::Result<CompactSpaceDecompressor> {
-        let (data_slice, footer_len_bytes) = data.split_at(data.len() - 4);
-        let footer_len = u32::deserialize(&mut &footer_len_bytes[..])?;
-        let data_footer = &data_slice[data_slice.len() - footer_len as usize..];
-        let params = IPCodecParams::deserialize(&mut &data_footer[..])?;
-        let decompressor = CompactSpaceDecompressor { data, params };
-        Ok(decompressor)
-    }
-
-    /// Converting to compact space for the decompressor is more complex, since we may get values
-    /// which are outside the compact space. e.g. if we map
-    /// 1000 => 5
-    /// 2000 => 6
-    ///
-    /// and we want a mapping for 1005, there is no equivalent compact space. We instead return an
-    /// error with the index of the next range.
-    fn u128_to_compact(&self, value: u128) -> Result<u64, usize> {
-        self.params.compact_space.u128_to_compact(value)
-    }
-
-    fn compact_to_u128(&self, compact: u64) -> u128 {
-        self.params.compact_space.compact_to_u128(compact)
-    }
-
-    /// Comparing on compact space: Random dataset 0,24 (50% random hit) - 1.05 GElements/s
-    /// Comparing on compact space: Real dataset 1.08 GElements/s
-    ///
-    /// Comparing on original space: Real dataset .06 GElements/s (not completely optimized)
-    #[inline]
-    pub fn get_positions_for_value_range(
-        &self,
-        value_range: RangeInclusive<u128>,
-        position_range: Range<u32>,
@@ -395,44 +356,42 @@
                 range_mapping.compact_end()
             });
-        let range = compact_from..=compact_to;
+        let value_range = compact_from..=compact_to;
+        self.get_positions_for_compact_value_range(value_range, position_range, positions);
+    }
+}
-        let scan_num_docs = position_range.end - position_range.start;
+impl CompactSpaceDecompressor {
+    pub fn open(data: OwnedBytes) -> io::Result<CompactSpaceDecompressor> {
+        let (data_slice, footer_len_bytes) = data.split_at(data.len() - 4);
+        let footer_len = u32::deserialize(&mut &footer_len_bytes[..])?;
-        let step_size = 4;
-        let cutoff = position_range.start + scan_num_docs - scan_num_docs % step_size;
+        let data_footer = &data_slice[data_slice.len() - footer_len as usize..];
+        let params = IPCodecParams::deserialize(&mut &data_footer[..])?;
+        let decompressor = CompactSpaceDecompressor { data, params };
-        let mut push_if_in_range = |idx, val| {
-            if range.contains(&val) {
-                positions.push(idx);
-            }
-        };
-        let get_val = |idx| self.params.bit_unpacker.get(idx, &self.data);
-        // unrolled loop
-        for idx in (position_range.start..cutoff).step_by(step_size as usize) {
-            let idx1 = idx;
-            let idx2 = idx + 1;
-            let idx3 = idx + 2;
-            let idx4 = idx + 3;
-            let val1 = get_val(idx1);
-            let val2 = get_val(idx2);
-            let val3 = get_val(idx3);
-            let val4 = get_val(idx4);
-            push_if_in_range(idx1, val1);
-            push_if_in_range(idx2, val2);
-            push_if_in_range(idx3, val3);
-            push_if_in_range(idx4, val4);
-        }
+        Ok(decompressor)
+    }
-        // handle rest
-        for idx in cutoff..position_range.end {
-            push_if_in_range(idx, get_val(idx));
-        }
+    /// Converting to compact space for the decompressor is more complex, since we may get values
+    /// which are outside the compact space. e.g. if we map
+    /// 1000 => 5
+    /// 2000 => 6
+    ///
+    /// and we want a mapping for 1005, there is no equivalent compact space. We instead return an
+    /// error with the index of the next range.
+    fn u128_to_compact(&self, value: u128) -> Result<u32, usize> {
+        self.params.compact_space.u128_to_compact(value)
+    }
+    fn compact_to_u128(&self, compact: u32) -> u128 {
+        self.params.compact_space.compact_to_u128(compact)
+    }
     #[inline]
-    fn iter_compact(&self) -> impl Iterator<Item = u64> + '_ {
-        (0..self.params.num_vals).map(move |idx| self.params.bit_unpacker.get(idx, &self.data))
+    fn iter_compact(&self) -> impl Iterator<Item = u32> + '_ {
+        (0..self.params.num_vals)
+            .map(move |idx| self.params.bit_unpacker.get(idx, &self.data) as u32)
     }
#[inline]
@@ -445,7 +404,7 @@
     #[inline]
     pub fn get(&self, idx: u32) -> u128 {
-        let compact = self.params.bit_unpacker.get(idx, &self.data);
+        let compact = self.params.bit_unpacker.get(idx, &self.data) as u32;
         self.compact_to_u128(compact)
     }
@@ -456,6 +415,20 @@
     pub fn max_value(&self) -> u128 {
         self.params.max_value
     }
+
+    fn get_positions_for_compact_value_range(
+        &self,
+        value_range: RangeInclusive<u32>,
+        position_range: Range<u32>,
+        positions: &mut Vec<u32>,
+    ) {
+        self.params.bit_unpacker.get_ids_for_value_range(
+            *value_range.start() as u64..=*value_range.end() as u64,
+            position_range,
+            &self.data,
+            positions,
+        );
+    }
 }
#[cfg(test)]
@@ -469,12 +442,12 @@ mod tests {
     #[test]
     fn compact_space_test() {
-        let ips = &[
+        let ips: BTreeSet<u128> = [
             2u128, 4u128, 1000, 1001, 1002, 1003, 1004, 1005, 1008, 1010, 1012, 1260,
         ]
         .into_iter()
         .collect();
-        let compact_space = get_compact_space(ips, ips.len() as u32, 11);
+        let compact_space = get_compact_space(&ips, ips.len() as u32, 11);
         let amplitude = compact_space.amplitude_compact_space();
         assert_eq!(amplitude, 17);
         assert_eq!(1, compact_space.u128_to_compact(2).unwrap());
@@ -497,8 +470,8 @@
         );
         for ip in ips {
-            let compact = compact_space.u128_to_compact(*ip).unwrap();
-            assert_eq!(compact_space.compact_to_u128(compact), *ip);
+            let compact = compact_space.u128_to_compact(ip).unwrap();
+            assert_eq!(compact_space.compact_to_u128(compact), ip);
         }
     }
@@ -524,7 +497,7 @@
             .map(|pos| pos as u32)
             .collect::<Vec<_>>();
         let mut positions = Vec::new();
-        decompressor.get_positions_for_value_range(
+        decompressor.get_row_ids_for_value_range(
             range,
             0..decompressor.num_vals(),
             &mut positions,
@@ -569,7 +542,7 @@
         let val = *val;
         let pos = pos as u32;
         let mut positions = Vec::new();
-        decomp.get_positions_for_value_range(val..=val, pos..pos + 1, &mut positions);
+        decomp.get_row_ids_for_value_range(val..=val, pos..pos + 1, &mut positions);
         assert_eq!(positions, vec![pos]);
     }

View File

@@ -1,4 +1,6 @@
 use std::io::{self, Write};
+use std::num::NonZeroU64;
+use std::ops::{Range, RangeInclusive};
 use common::{BinarySerializable, OwnedBytes};
 use fastdivide::DividerU64;
@@ -16,6 +18,46 @@ pub struct BitpackedReader {
     stats: ColumnStats,
 }
+#[inline(always)]
+const fn div_ceil(n: u64, q: NonZeroU64) -> u64 {
+    // copied from unstable rust standard library.
+    let d = n / q.get();
+    let r = n % q.get();
+    if r > 0 {
+        d + 1
+    } else {
+        d
+    }
+}
+
+// The bitpacked codec applies a linear transformation `f` over data that are bitpacked.
+// f is defined by:
+// f: bitpacked -> stats.min_value + stats.gcd * bitpacked
+//
+// In order to run range queries, we invert the transformation.
+// `transform_range_before_linear_transformation` returns the range of values
+// [min_bitpacked_value..max_bitpacked_value] such that
+// f(bitpacked) ∈ [min_value, max_value] <=> bitpacked ∈ [min_bitpacked_value, max_bitpacked_value]
+fn transform_range_before_linear_transformation(
+    stats: &ColumnStats,
+    range: RangeInclusive<u64>,
+) -> Option<RangeInclusive<u64>> {
+    if range.is_empty() {
+        return None;
+    }
+    if stats.min_value > *range.end() {
+        return None;
+    }
+    if stats.max_value < *range.start() {
+        return None;
+    }
+    let shifted_range =
+        range.start().saturating_sub(stats.min_value)..=range.end().saturating_sub(stats.min_value);
+    let start_before_gcd_multiplication: u64 = div_ceil(*shifted_range.start(), stats.gcd);
+    let end_before_gcd_multiplication: u64 = *shifted_range.end() / stats.gcd;
+    Some(start_before_gcd_multiplication..=end_before_gcd_multiplication)
+}
 impl ColumnValues for BitpackedReader {
     #[inline(always)]
     fn get_val(&self, doc: u32) -> u64 {
@@ -34,6 +76,25 @@ impl ColumnValues for BitpackedReader {
     fn num_vals(&self) -> RowId {
         self.stats.num_rows
    }
+
+    fn get_row_ids_for_value_range(
+        &self,
+        range: RangeInclusive<u64>,
+        doc_id_range: Range<u32>,
+        positions: &mut Vec<u32>,
+    ) {
+        let Some(transformed_range) = transform_range_before_linear_transformation(&self.stats, range)
+        else {
+            positions.clear();
+            return;
+        };
+        self.bit_unpacker.get_ids_for_value_range(
+            transformed_range,
+            doc_id_range,
+            &self.data,
+            positions,
+        );
+    }
 }

 fn num_bits(stats: &ColumnStats) -> u8 {
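A quick numeric check of the inversion above (the stats values are made up for the example): with min_value = 10 and gcd = 5, the query range 18..=42 shifts to 8..=32 and divides to div_ceil(8, 5)..=32 / 5, i.e. 2..=6. Every bitpacked x in 2..=6 decodes to f(x) = 10 + 5 * x in {20, 25, 30, 35, 40}, all inside the original query, and no in-range value is missed.

fn main() {
    // Made-up stats for the example: f(x) = 10 + 5 * x.
    let (min_value, gcd) = (10u64, 5u64);
    let (start, end) = (18u64, 42u64);
    // Same two steps as transform_range_before_linear_transformation:
    let shifted_start = start.saturating_sub(min_value); // 8
    let shifted_end = end.saturating_sub(min_value); // 32
    let bitpacked_start = (shifted_start + gcd - 1) / gcd; // div_ceil -> 2
    let bitpacked_end = shifted_end / gcd; // 6
    for x in bitpacked_start..=bitpacked_end {
        let value = min_value + gcd * x;
        assert!(start <= value && value <= end); // 20, 25, 30, 35, 40
    }
}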