Compare commits

..

7 Commits

Author SHA1 Message Date
Pascal Seitz
6761237ec7 chore: Release 0.19.2 2023-02-10 12:20:20 +08:00
Pascal Seitz
3da08e92c7 fix: doc store for files larger than 4GB
Fixes an issue in the skip list deserialization, which incorrectly deserialized the byte start offset as u32.
`get_doc` will fail for any doc that lives in a block whose start offset is larger than u32::MAX (~4GB).
This causes index corruption if a segment with a doc store larger than 4GB is merged.

tantivy version 0.19 is affected.
2023-02-10 12:12:47 +08:00
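
The failure mode is easy to demonstrate in isolation. A minimal sketch (plain Rust, independent of tantivy's actual skip-list reader) of how a block start offset beyond u32::MAX silently wraps when narrowed to u32; reading the offset as a VInt-encoded u64 (the checkpoint-block change at the end of this diff) avoids this truncation:

fn main() {
    // A doc store block that starts 8 GB into the file.
    let start_offset: u64 = 8_000_000_000;
    // The old deserialization effectively narrowed the offset to u32,
    // which wraps for anything above u32::MAX (~4.29 GB).
    let truncated = start_offset as u32;
    assert_ne!(start_offset, truncated as u64); // 8_000_000_000 vs 3_705_032_704
    println!("expected {start_offset}, a u32 read would see {truncated}");
}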
Pascal Seitz
6c4b8d97ed chore: Release 2023-01-13 13:46:28 +08:00
Pascal Seitz
dc5f503c9a use fastfield_codecs 0.3.1 2023-01-13 13:34:42 +08:00
Pascal Seitz
4ffcf3a933 chore: Release 2023-01-13 13:31:20 +08:00
Pascal Seitz
079f542f97 handle user input on get_docid_for_value_range 2023-01-13 12:24:34 +08:00
PSeitz
509adab79d Bump version (#1715)
* group workspace deps

* update cargo.toml

* revert tant version

* chore: Release
2022-12-12 04:39:43 +01:00
14 changed files with 255 additions and 509 deletions

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.19.0"
version = "0.19.2"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -60,7 +60,7 @@ arc-swap = "1.5.0"
tantivy-query-grammar = { version= "0.19.0", path="./query-grammar" }
tantivy-bitpacker = { version= "0.3", path="./bitpacker" }
common = { version= "0.4", path = "./common/", package = "tantivy-common" }
fastfield_codecs = { version= "0.3", path="./fastfield_codecs", default-features = false }
fastfield_codecs = { version= "0.3.1", path="./fastfield_codecs", default-features = false }
ownedbytes = { version= "0.4", path="./ownedbytes" }
[target.'cfg(windows)'.dependencies]

View File

@@ -1,6 +1,6 @@
[package]
name = "fastfield_codecs"
version = "0.3.0"
version = "0.3.1"
authors = ["Pascal Seitz <pascal@quickwit.io>"]
license = "MIT"
edition = "2021"

View File

@@ -1,3 +1,4 @@
use std::fmt::{self, Debug};
use std::marker::PhantomData;
use std::ops::{Range, RangeInclusive};
@@ -6,7 +7,7 @@ use tantivy_bitpacker::minmax;
use crate::monotonic_mapping::StrictlyMonotonicFn;
/// `Column` provides columnar access on a field.
pub trait Column<T: PartialOrd = u64>: Send + Sync {
pub trait Column<T: PartialOrd + Debug = u64>: Send + Sync {
/// Return the value associated with the given idx.
///
/// This accessor should return as fast as possible.
@@ -83,7 +84,7 @@ pub struct VecColumn<'a, T = u64> {
max_value: T,
}
impl<'a, C: Column<T>, T: Copy + PartialOrd> Column<T> for &'a C {
impl<'a, C: Column<T>, T: Copy + PartialOrd + fmt::Debug> Column<T> for &'a C {
fn get_val(&self, idx: u32) -> T {
(*self).get_val(idx)
}
@@ -109,7 +110,7 @@ impl<'a, C: Column<T>, T: Copy + PartialOrd> Column<T> for &'a C {
}
}
impl<'a, T: Copy + PartialOrd + Send + Sync> Column<T> for VecColumn<'a, T> {
impl<'a, T: Copy + PartialOrd + Send + Sync + Debug> Column<T> for VecColumn<'a, T> {
fn get_val(&self, position: u32) -> T {
self.values[position as usize]
}
@@ -177,8 +178,8 @@ pub fn monotonic_map_column<C, T, Input, Output>(
where
C: Column<Input>,
T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
Input: PartialOrd + Send + Sync + Clone,
Output: PartialOrd + Send + Sync + Clone,
Input: PartialOrd + Send + Sync + Copy + Debug,
Output: PartialOrd + Send + Sync + Copy + Debug,
{
MonotonicMappingColumn {
from_column,
@@ -191,8 +192,8 @@ impl<C, T, Input, Output> Column<Output> for MonotonicMappingColumn<C, T, Input>
where
C: Column<Input>,
T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
Input: PartialOrd + Send + Sync + Clone,
Output: PartialOrd + Send + Sync + Clone,
Input: PartialOrd + Send + Sync + Copy + Debug,
Output: PartialOrd + Send + Sync + Copy + Debug,
{
#[inline]
fn get_val(&self, idx: u32) -> Output {
@@ -228,12 +229,15 @@ where
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {
self.from_column.get_docids_for_value_range(
self.monotonic_mapping.inverse(range.start().clone())
..=self.monotonic_mapping.inverse(range.end().clone()),
doc_id_range,
positions,
)
if range.start() > &self.max_value() || range.end() < &self.min_value() {
return;
}
let range = self.monotonic_mapping.inverse_coerce(range);
if range.start() > range.end() {
return;
}
self.from_column
.get_docids_for_value_range(range, doc_id_range, positions)
}
// We voluntarily do not implement get_range as it yields a regression,
@@ -254,7 +258,7 @@ where T: Iterator + Clone + ExactSizeIterator
impl<T> Column<T::Item> for IterColumn<T>
where
T: Iterator + Clone + ExactSizeIterator + Send + Sync,
T::Item: PartialOrd,
T::Item: PartialOrd + fmt::Debug,
{
fn get_val(&self, idx: u32) -> T::Item {
self.0.clone().nth(idx as usize).unwrap()
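
The behavioral change in this file is in MonotonicMappingColumn::get_docids_for_value_range above: instead of mapping the user range straight through inverse(), the new code first rejects ranges that lie entirely outside [min_value, max_value], coerces the remaining bounds into the internal value space, and treats an inverted coerced range as empty. A self-contained sketch of that guard, using a hypothetical filter_range helper over plain u64 values rather than the generic Column API:

use std::ops::RangeInclusive;

// Returns the internal range to search, or None if nothing can match.
fn filter_range(
    user_range: RangeInclusive<u64>,
    min_value: u64,
    max_value: u64,
    coerce: impl Fn(RangeInclusive<u64>) -> RangeInclusive<u64>,
) -> Option<RangeInclusive<u64>> {
    // Entirely below or above the stored values: nothing can match.
    if *user_range.start() > max_value || *user_range.end() < min_value {
        return None;
    }
    // Coercion may yield an inverted (empty) range, e.g. 101..=199 with gcd 100.
    let coerced = coerce(user_range);
    if coerced.start() > coerced.end() {
        return None;
    }
    Some(coerced)
}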

View File

@@ -455,6 +455,8 @@ impl CompactSpaceDecompressor {
#[cfg(test)]
mod tests {
use std::fmt;
use super::*;
use crate::format_version::read_format_version;
use crate::null_index_footer::read_null_index_footer;
@@ -708,7 +710,7 @@ mod tests {
);
}
fn get_positions_for_value_range_helper<C: Column<T> + ?Sized, T: PartialOrd>(
fn get_positions_for_value_range_helper<C: Column<T> + ?Sized, T: PartialOrd + fmt::Debug>(
column: &C,
value_range: RangeInclusive<T>,
doc_id_range: Range<u32>,

View File

@@ -14,9 +14,9 @@ extern crate more_asserts;
#[cfg(all(test, feature = "unstable"))]
extern crate test;
use std::io;
use std::io::Write;
use std::sync::Arc;
use std::{fmt, io};
use common::BinarySerializable;
use compact_space::CompactSpaceDecompressor;
@@ -37,16 +37,12 @@ mod line;
mod linear;
mod monotonic_mapping;
mod monotonic_mapping_u128;
mod null_index;
mod null_index_footer;
mod column;
mod gcd;
mod serialize;
/// TODO: remove when codec is used
pub use null_index::*;
use self::bitpacked::BitpackedCodec;
use self::blockwise_linear::BlockwiseLinearCodec;
pub use self::column::{monotonic_map_column, Column, IterColumn, VecColumn};
@@ -136,7 +132,7 @@ impl U128FastFieldCodecType {
}
/// Returns the correct codec reader wrapped in the `Arc` for the data.
pub fn open_u128<Item: MonotonicallyMappableToU128>(
pub fn open_u128<Item: MonotonicallyMappableToU128 + fmt::Debug>(
bytes: OwnedBytes,
) -> io::Result<Arc<dyn Column<Item>>> {
let (bytes, _format_version) = read_format_version(bytes)?;
@@ -150,7 +146,9 @@ pub fn open_u128<Item: MonotonicallyMappableToU128>(
}
/// Returns the correct codec reader wrapped in the `Arc` for the data.
pub fn open<T: MonotonicallyMappableToU64>(bytes: OwnedBytes) -> io::Result<Arc<dyn Column<T>>> {
pub fn open<T: MonotonicallyMappableToU64 + fmt::Debug>(
bytes: OwnedBytes,
) -> io::Result<Arc<dyn Column<T>>> {
let (bytes, _format_version) = read_format_version(bytes)?;
let (mut bytes, _null_index_footer) = read_null_index_footer(bytes)?;
let header = Header::deserialize(&mut bytes)?;
@@ -163,7 +161,7 @@ pub fn open<T: MonotonicallyMappableToU64>(bytes: OwnedBytes) -> io::Result<Arc<
}
}
fn open_specific_codec<C: FastFieldCodec, Item: MonotonicallyMappableToU64>(
fn open_specific_codec<C: FastFieldCodec, Item: MonotonicallyMappableToU64 + fmt::Debug>(
bytes: OwnedBytes,
header: &Header,
) -> io::Result<Arc<dyn Column<Item>>> {
@@ -324,6 +322,9 @@ mod tests {
pub fn get_codec_test_datasets() -> Vec<(Vec<u64>, &'static str)> {
let mut data_and_names = vec![];
let data = vec![10];
data_and_names.push((data, "minimal test"));
let data = (10..=10_000_u64).collect::<Vec<_>>();
data_and_names.push((data, "simple monotonically increasing"));
@@ -331,6 +332,9 @@ mod tests {
vec![5, 6, 7, 8, 9, 10, 99, 100],
"offset in linear interpol",
));
data_and_names.push((vec![3, 18446744073709551613, 5], "docid range regression"));
data_and_names.push((vec![5, 50, 3, 13, 1, 1000, 35], "rand small"));
data_and_names.push((vec![10], "single value"));
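
The "docid range regression" dataset (a value near u64::MAX next to small values) is the kind of input that used to trip the range mapping. A hedged usage sketch, assuming the crate's serialize_and_load helper (shown later in this diff) is reachable from the test module:

#[test]
fn docid_range_regression_sketch() {
    let data: Vec<u64> = vec![3, 18_446_744_073_709_551_613, 5];
    // Serialize with the best available codec and reopen it as a Column.
    let column = serialize_and_load(&data);
    let mut docids = Vec::new();
    column.get_docids_for_value_range(0..=10, 0..data.len() as u32, &mut docids);
    // Only docids 0 and 2 hold values inside 0..=10.
    assert_eq!(docids.len(), 2);
}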

View File

@@ -1,4 +1,6 @@
use std::fmt;
use std::marker::PhantomData;
use std::ops::RangeInclusive;
use fastdivide::DividerU64;
@@ -6,7 +8,9 @@ use crate::MonotonicallyMappableToU128;
/// Monotonically maps a value to the u64 value space.
/// Monotonic mapping enables `PartialOrd` on u64 space without conversion to original space.
pub trait MonotonicallyMappableToU64: 'static + PartialOrd + Copy + Send + Sync {
pub trait MonotonicallyMappableToU64:
'static + PartialOrd + Copy + Send + Sync + fmt::Debug
{
/// Converts a value to u64.
///
/// Internally all fast field values are encoded as u64.
@@ -29,11 +33,29 @@ pub trait MonotonicallyMappableToU64: 'static + PartialOrd + Copy + Send + Sync
/// mapping from their range to their domain. The `inverse` method is required when opening a codec,
/// so a value can be converted back to its original domain (e.g. ip address or f64) from its
/// internal representation.
pub trait StrictlyMonotonicFn<External, Internal> {
pub trait StrictlyMonotonicFn<External: Copy, Internal: Copy> {
/// Strictly monotonically maps the value from External to Internal.
fn mapping(&self, inp: External) -> Internal;
/// Inverse of `mapping`. Maps the value from Internal to External.
fn inverse(&self, out: Internal) -> External;
/// Maps a user provided value from External to Internal.
/// It may be necessary to coerce the value if it is outside the value space.
/// In that case it tries to find the next greater value in the value space.
///
/// Returns a bool to mark if a value was outside the value space and had to be coerced _up_.
/// With that information we can detect if two values in a range both map outside the same value
/// space.
///
/// coerce_up means the next valid upper value in the value space will be chosen if the value
/// has to be coerced.
fn mapping_coerce(&self, inp: RangeInclusive<External>) -> RangeInclusive<Internal> {
self.mapping(*inp.start())..=self.mapping(*inp.end())
}
/// Inverse of `mapping_coerce`.
fn inverse_coerce(&self, out: RangeInclusive<Internal>) -> RangeInclusive<External> {
self.inverse(*out.start())..=self.inverse(*out.end())
}
}
/// Inverts a strictly monotonic mapping from `StrictlyMonotonicFn<A, B>` to
@@ -54,7 +76,10 @@ impl<T> From<T> for StrictlyMonotonicMappingInverter<T> {
}
impl<From, To, T> StrictlyMonotonicFn<To, From> for StrictlyMonotonicMappingInverter<T>
where T: StrictlyMonotonicFn<From, To>
where
T: StrictlyMonotonicFn<From, To>,
From: Copy,
To: Copy,
{
fn mapping(&self, val: To) -> From {
self.orig_mapping.inverse(val)
@@ -63,6 +88,15 @@ where T: StrictlyMonotonicFn<From, To>
fn inverse(&self, val: From) -> To {
self.orig_mapping.mapping(val)
}
#[inline]
fn mapping_coerce(&self, inp: RangeInclusive<To>) -> RangeInclusive<From> {
self.orig_mapping.inverse_coerce(inp)
}
#[inline]
fn inverse_coerce(&self, out: RangeInclusive<From>) -> RangeInclusive<To> {
self.orig_mapping.mapping_coerce(out)
}
}
/// Applies the strictly monotonic mapping from `T` without any additional changes.
@@ -134,6 +168,31 @@ impl<External: MonotonicallyMappableToU64> StrictlyMonotonicFn<External, u64>
fn inverse(&self, out: u64) -> External {
External::from_u64(self.min_value + out * self.gcd)
}
#[inline]
#[allow(clippy::reversed_empty_ranges)]
fn mapping_coerce(&self, inp: RangeInclusive<External>) -> RangeInclusive<u64> {
let end = External::to_u64(*inp.end());
if end < self.min_value || inp.end() < inp.start() {
return 1..=0;
}
let map_coerce = |mut inp, coerce_up| {
let inp_lower_bound = self.inverse(0);
if inp < inp_lower_bound {
inp = inp_lower_bound;
}
let val = External::to_u64(inp);
let need_coercion = coerce_up && (val - self.min_value) % self.gcd != 0;
let mut mapped_val = self.mapping(inp);
if need_coercion {
mapped_val += 1;
}
mapped_val
};
let start = map_coerce(*inp.start(), true);
let end = map_coerce(*inp.end(), false);
start..=end
}
}
/// Strictly monotonic mapping with a base value.
@@ -149,6 +208,17 @@ impl StrictlyMonotonicMappingToInternalBaseval {
impl<External: MonotonicallyMappableToU64> StrictlyMonotonicFn<External, u64>
for StrictlyMonotonicMappingToInternalBaseval
{
#[inline]
#[allow(clippy::reversed_empty_ranges)]
fn mapping_coerce(&self, inp: RangeInclusive<External>) -> RangeInclusive<u64> {
if External::to_u64(*inp.end()) < self.min_value {
return 1..=0;
}
let start = self.mapping(External::to_u64(*inp.start()).max(self.min_value));
let end = self.mapping(External::to_u64(*inp.end()));
start..=end
}
fn mapping(&self, val: External) -> u64 {
External::to_u64(val) - self.min_value
}
@@ -224,7 +294,7 @@ mod tests {
test_round_trip::<_, _, u64>(&mapping, 100u64);
}
fn test_round_trip<T: StrictlyMonotonicFn<K, L>, K: std::fmt::Debug + Eq + Copy, L>(
fn test_round_trip<T: StrictlyMonotonicFn<K, L>, K: std::fmt::Debug + Eq + Copy, L: Copy>(
mapping: &T,
test_val: K,
) {
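
To make the coercion concrete, here is a worked sketch (plain integer arithmetic with hypothetical names, mirroring the GCD branch of mapping_coerce above) using the numbers from the url_norm_hash regression test later in this diff: stored values {100, 200, 300}, hence min_value = 100 and gcd = 100.

use std::ops::RangeInclusive;

#[allow(clippy::reversed_empty_ranges)]
fn gcd_mapping_coerce(range: RangeInclusive<u64>, min_value: u64, gcd: u64) -> RangeInclusive<u64> {
    // A user bound below min_value or an inverted input maps to the canonical empty range.
    if *range.end() < min_value || range.end() < range.start() {
        return 1..=0;
    }
    // Clamp the lower bound to min_value, then coerce it up to the next
    // representable value if it does not sit on a gcd multiple.
    let start = (*range.start()).max(min_value);
    let mut mapped_start = (start - min_value) / gcd;
    if (start - min_value) % gcd != 0 {
        mapped_start += 1;
    }
    // The upper bound is coerced down by the integer division.
    let mapped_end = (*range.end() - min_value) / gcd;
    mapped_start..=mapped_end
}

fn main() {
    // 101..=199 contains no stored value: the internal range comes back empty.
    assert!(gcd_mapping_coerce(101..=199, 100, 100).is_empty());
    // 100..=250 covers 100 and 200: internal range 0..=1.
    assert_eq!(gcd_mapping_coerce(100..=250, 100, 100), 0..=1);
    // 50..=50 lies entirely below min_value: empty as well.
    assert!(gcd_mapping_coerce(50..=50, 100, 100).is_empty());
}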

View File

@@ -1,8 +1,11 @@
use std::fmt;
use std::net::Ipv6Addr;
/// Monotonically maps a value to the u128 value space.
/// Monotonic mapping enables `PartialOrd` on u128 space without conversion to original space.
pub trait MonotonicallyMappableToU128: 'static + PartialOrd + Copy + Send + Sync {
pub trait MonotonicallyMappableToU128:
'static + PartialOrd + Copy + Send + Sync + fmt::Debug
{
/// Converts a value to u128.
///
/// Internally all u128 fast field values are encoded as u128.

View File

@@ -1,453 +0,0 @@
use std::convert::TryInto;
use std::io::{self, Write};
use common::BinarySerializable;
use itertools::Itertools;
use ownedbytes::OwnedBytes;
use super::{get_bit_at, set_bit_at};
/// For the `DenseCodec`, `data` contains the encoded blocks.
/// Each block consists of [u8; 12]. The first 8 bytes is a bitvec for 64 elements.
/// The last 4 bytes are the offset, the number of set bits so far.
///
/// When translating the original index to a dense index, the correct block can be computed
/// directly `orig_idx/64`. Inside the block the position is `orig_idx%64`.
///
/// When translating a dense index to the original index, we can use the offset to find the correct
/// block. Direct computation is not possible, but we can employ a linear or binary search.
pub struct DenseCodec {
// data consists of blocks of 64 bits.
//
// The format is &[(u64, u32)]
// u64 is the bitvec
// u32 is the offset of the block, the number of set bits so far.
//
// At the end one block is appended, to store the number of values in the index in offset.
data: OwnedBytes,
}
const ELEMENTS_PER_BLOCK: u32 = 64;
const BLOCK_BITVEC_SIZE: usize = 8;
const BLOCK_OFFSET_SIZE: usize = 4;
const SERIALIZED_BLOCK_SIZE: usize = BLOCK_BITVEC_SIZE + BLOCK_OFFSET_SIZE;
#[inline]
fn count_ones(bitvec: u64, pos_in_bitvec: u32) -> u32 {
if pos_in_bitvec == 63 {
bitvec.count_ones()
} else {
let mask = (1u64 << (pos_in_bitvec + 1)) - 1;
let masked_bitvec = bitvec & mask;
masked_bitvec.count_ones()
}
}
#[derive(Clone, Copy)]
struct DenseIndexBlock {
bitvec: u64,
offset: u32,
}
impl From<[u8; SERIALIZED_BLOCK_SIZE]> for DenseIndexBlock {
fn from(data: [u8; SERIALIZED_BLOCK_SIZE]) -> Self {
let bitvec = u64::from_le_bytes(data[..BLOCK_BITVEC_SIZE].try_into().unwrap());
let offset = u32::from_le_bytes(data[BLOCK_BITVEC_SIZE..].try_into().unwrap());
Self { bitvec, offset }
}
}
impl DenseCodec {
/// Open the DenseCodec from OwnedBytes
pub fn open(data: OwnedBytes) -> Self {
Self { data }
}
#[inline]
/// Check if value at position is not null.
pub fn exists(&self, idx: u32) -> bool {
let block_pos = idx / ELEMENTS_PER_BLOCK;
let bitvec = self.dense_index_block(block_pos).bitvec;
let pos_in_bitvec = idx % ELEMENTS_PER_BLOCK;
get_bit_at(bitvec, pos_in_bitvec)
}
#[inline]
fn dense_index_block(&self, block_pos: u32) -> DenseIndexBlock {
dense_index_block(&self.data, block_pos)
}
/// Return the number of non-null values in an index
pub fn num_non_null_vals(&self) -> u32 {
let last_block = (self.data.len() / SERIALIZED_BLOCK_SIZE) - 1;
self.dense_index_block(last_block as u32).offset
}
#[inline]
/// Translate from the original index to the codec index.
pub fn translate_to_codec_idx(&self, idx: u32) -> Option<u32> {
let block_pos = idx / ELEMENTS_PER_BLOCK;
let index_block = self.dense_index_block(block_pos);
let pos_in_block_bit_vec = idx % ELEMENTS_PER_BLOCK;
let ones_in_block = count_ones(index_block.bitvec, pos_in_block_bit_vec);
if get_bit_at(index_block.bitvec, pos_in_block_bit_vec) {
// -1 is ok, since idx does exist, so there's at least one
Some(index_block.offset + ones_in_block - 1)
} else {
None
}
}
/// Translate positions from the codec index to the original index.
///
/// # Panics
///
/// May panic if any `idx` is greater than the column length.
pub fn translate_codec_idx_to_original_idx<'a>(
&'a self,
iter: impl Iterator<Item = u32> + 'a,
) -> impl Iterator<Item = u32> + 'a {
let mut block_pos = 0u32;
iter.map(move |dense_idx| {
// update block_pos to limit search scope
block_pos = find_block(dense_idx, block_pos, &self.data);
let index_block = self.dense_index_block(block_pos);
// The next offset is higher than dense_idx and therefore:
// dense_idx <= offset + num_set_bits in block
let mut num_set_bits = 0;
for idx_in_bitvec in 0..ELEMENTS_PER_BLOCK {
if get_bit_at(index_block.bitvec, idx_in_bitvec) {
num_set_bits += 1;
}
if num_set_bits == (dense_idx - index_block.offset + 1) {
let orig_idx = block_pos * ELEMENTS_PER_BLOCK + idx_in_bitvec as u32;
return orig_idx;
}
}
panic!("Internal Error: Offset calculation in dense idx seems to be wrong.");
})
}
}
#[inline]
fn dense_index_block(data: &[u8], block_pos: u32) -> DenseIndexBlock {
let data_start_pos = block_pos as usize * SERIALIZED_BLOCK_SIZE;
let block_data: [u8; SERIALIZED_BLOCK_SIZE] = data[data_start_pos..][..SERIALIZED_BLOCK_SIZE]
.try_into()
.unwrap();
block_data.into()
}
#[inline]
/// Finds the block position containing the dense_idx.
///
/// # Correctness
/// dense_idx needs to be smaller than the number of values in the index
///
/// The last offset number is equal to the number of values in the index.
fn find_block(dense_idx: u32, mut block_pos: u32, data: &[u8]) -> u32 {
loop {
let offset = dense_index_block(data, block_pos).offset;
if offset > dense_idx {
return block_pos - 1;
}
block_pos += 1;
}
}
/// Serializes a dense codec from an iterator over all values: true if set, otherwise false.
pub fn serialize_dense_codec(
iter: impl Iterator<Item = bool>,
mut out: impl Write,
) -> io::Result<()> {
let mut offset: u32 = 0;
for chunk in &iter.chunks(ELEMENTS_PER_BLOCK as usize) {
let mut block: u64 = 0;
for (pos, is_bit_set) in chunk.enumerate() {
if is_bit_set {
set_bit_at(&mut block, pos as u64);
}
}
block.serialize(&mut out)?;
offset.serialize(&mut out)?;
offset += block.count_ones() as u32;
}
// Add sentinel block for the offset
let block: u64 = 0;
block.serialize(&mut out)?;
offset.serialize(&mut out)?;
Ok(())
}
#[cfg(test)]
mod tests {
use proptest::prelude::{any, prop, *};
use proptest::strategy::Strategy;
use proptest::{prop_oneof, proptest};
use super::*;
fn random_bitvec() -> BoxedStrategy<Vec<bool>> {
prop_oneof![
1 => prop::collection::vec(proptest::bool::weighted(1.0), 0..100),
1 => prop::collection::vec(proptest::bool::weighted(1.0), 0..64),
1 => prop::collection::vec(proptest::bool::weighted(0.0), 0..100),
1 => prop::collection::vec(proptest::bool::weighted(0.0), 0..64),
8 => vec![any::<bool>()],
2 => prop::collection::vec(any::<bool>(), 0..50),
]
.boxed()
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(500))]
#[test]
fn test_with_random_bitvecs(bitvec1 in random_bitvec(), bitvec2 in random_bitvec(), bitvec3 in random_bitvec()) {
let mut bitvec = Vec::new();
bitvec.extend_from_slice(&bitvec1);
bitvec.extend_from_slice(&bitvec2);
bitvec.extend_from_slice(&bitvec3);
test_null_index(bitvec);
}
}
#[test]
fn dense_codec_test_one_block_false() {
let mut iter = vec![false; 64];
iter.push(true);
test_null_index(iter);
}
fn test_null_index(data: Vec<bool>) {
let mut out = vec![];
serialize_dense_codec(data.iter().cloned(), &mut out).unwrap();
let null_index = DenseCodec::open(OwnedBytes::new(out));
let orig_idx_with_value: Vec<u32> = data
.iter()
.enumerate()
.filter(|(_pos, val)| **val)
.map(|(pos, _val)| pos as u32)
.collect();
assert_eq!(
null_index
.translate_codec_idx_to_original_idx(0..orig_idx_with_value.len() as u32)
.collect_vec(),
orig_idx_with_value
);
for (dense_idx, orig_idx) in orig_idx_with_value.iter().enumerate() {
assert_eq!(
null_index.translate_to_codec_idx(*orig_idx),
Some(dense_idx as u32)
);
}
for (pos, value) in data.iter().enumerate() {
assert_eq!(null_index.exists(pos as u32), *value);
}
}
#[test]
fn dense_codec_test_translation() {
let mut out = vec![];
let iter = ([true, false, true, false]).iter().cloned();
serialize_dense_codec(iter, &mut out).unwrap();
let null_index = DenseCodec::open(OwnedBytes::new(out));
assert_eq!(
null_index
.translate_codec_idx_to_original_idx(0..2)
.collect_vec(),
vec![0, 2]
);
}
#[test]
fn dense_codec_translate() {
let mut out = vec![];
let iter = ([true, false, true, false]).iter().cloned();
serialize_dense_codec(iter, &mut out).unwrap();
let null_index = DenseCodec::open(OwnedBytes::new(out));
assert_eq!(null_index.translate_to_codec_idx(0), Some(0));
assert_eq!(null_index.translate_to_codec_idx(2), Some(1));
}
#[test]
fn dense_codec_test_small() {
let mut out = vec![];
let iter = ([true, false, true, false]).iter().cloned();
serialize_dense_codec(iter, &mut out).unwrap();
let null_index = DenseCodec::open(OwnedBytes::new(out));
assert!(null_index.exists(0));
assert!(!null_index.exists(1));
assert!(null_index.exists(2));
assert!(!null_index.exists(3));
}
#[test]
fn dense_codec_test_large() {
let mut docs = vec![];
docs.extend((0..1000).map(|_idx| false));
docs.extend((0..=1000).map(|_idx| true));
let iter = docs.iter().cloned();
let mut out = vec![];
serialize_dense_codec(iter, &mut out).unwrap();
let null_index = DenseCodec::open(OwnedBytes::new(out));
assert!(!null_index.exists(0));
assert!(!null_index.exists(100));
assert!(!null_index.exists(999));
assert!(null_index.exists(1000));
assert!(null_index.exists(1999));
assert!(null_index.exists(2000));
assert!(!null_index.exists(2001));
}
#[test]
fn test_count_ones() {
let mut block = 0;
set_bit_at(&mut block, 0);
set_bit_at(&mut block, 2);
assert_eq!(count_ones(block, 0), 1);
assert_eq!(count_ones(block, 1), 1);
assert_eq!(count_ones(block, 2), 2);
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use test::Bencher;
use super::*;
const TOTAL_NUM_VALUES: u32 = 1_000_000;
fn gen_bools(fill_ratio: f64) -> DenseCodec {
let mut out = Vec::new();
let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
let bools: Vec<_> = (0..TOTAL_NUM_VALUES)
.map(|_| rng.gen_bool(fill_ratio))
.collect();
serialize_dense_codec(bools.into_iter(), &mut out).unwrap();
let codec = DenseCodec::open(OwnedBytes::new(out));
codec
}
fn random_range_iterator(start: u32, end: u32, step_size: u32) -> impl Iterator<Item = u32> {
let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
let mut current = start;
std::iter::from_fn(move || {
current += rng.gen_range(1..step_size + 1);
if current >= end {
None
} else {
Some(current)
}
})
}
fn walk_over_data(codec: &DenseCodec, max_step_size: u32) -> Option<u32> {
walk_over_data_from_positions(
codec,
random_range_iterator(0, TOTAL_NUM_VALUES, max_step_size),
)
}
fn walk_over_data_from_positions(
codec: &DenseCodec,
positions: impl Iterator<Item = u32>,
) -> Option<u32> {
let mut dense_idx: Option<u32> = None;
for idx in positions {
dense_idx = dense_idx.or(codec.translate_to_codec_idx(idx));
}
dense_idx
}
#[bench]
fn bench_dense_codec_translate_orig_to_dense_90percent_filled_random_stride(
bench: &mut Bencher,
) {
let codec = gen_bools(0.9f64);
bench.iter(|| walk_over_data(&codec, 100));
}
#[bench]
fn bench_dense_codec_translate_orig_to_dense_50percent_filled_random_stride(
bench: &mut Bencher,
) {
let codec = gen_bools(0.5f64);
bench.iter(|| walk_over_data(&codec, 100));
}
#[bench]
fn bench_dense_codec_translate_orig_to_dense_full_scan_10percent(bench: &mut Bencher) {
let codec = gen_bools(0.1f64);
bench.iter(|| walk_over_data_from_positions(&codec, 0..TOTAL_NUM_VALUES));
}
#[bench]
fn bench_dense_codec_translate_orig_to_dense_full_scan_90percent(bench: &mut Bencher) {
let codec = gen_bools(0.9f64);
bench.iter(|| walk_over_data_from_positions(&codec, 0..TOTAL_NUM_VALUES));
}
#[bench]
fn bench_dense_codec_translate_orig_to_dense_10percent_filled_random_stride(
bench: &mut Bencher,
) {
let codec = gen_bools(0.1f64);
bench.iter(|| walk_over_data(&codec, 100));
}
#[bench]
fn bench_dense_codec_translate_dense_to_orig_90percent_filled_random_stride_big_step(
bench: &mut Bencher,
) {
let codec = gen_bools(0.9f64);
let num_vals = codec.num_non_null_vals();
bench.iter(|| {
codec
.translate_codec_idx_to_original_idx(random_range_iterator(0, num_vals, 50_000))
.last()
});
}
#[bench]
fn bench_dense_codec_translate_dense_to_orig_90percent_filled_random_stride(
bench: &mut Bencher,
) {
let codec = gen_bools(0.9f64);
let num_vals = codec.num_non_null_vals();
bench.iter(|| {
codec
.translate_codec_idx_to_original_idx(random_range_iterator(0, num_vals, 100))
.last()
});
}
#[bench]
fn bench_dense_codec_translate_dense_to_orig_90percent_filled_full_scan(bench: &mut Bencher) {
let codec = gen_bools(0.9f64);
let num_vals = codec.num_non_null_vals();
bench.iter(|| {
codec
.translate_codec_idx_to_original_idx(0..num_vals)
.last()
});
}
}
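
For reference, the block layout documented at the top of the removed dense.rs (12-byte blocks: an 8-byte bitvec followed by a 4-byte running offset) resolves a position as in this standalone sketch with a hand-built two-block index:

fn main() {
    const ELEMENTS_PER_BLOCK: u32 = 64;
    // Block 0: bits 0 and 2 set, running offset 0. Block 1: bit 2 set, running offset 2.
    let blocks: [(u64, u32); 2] = [(0b101, 0), (0b100, 2)];
    let idx = 66u32; // original index to resolve
    let (bitvec, offset) = blocks[(idx / ELEMENTS_PER_BLOCK) as usize];
    let pos_in_block = idx % ELEMENTS_PER_BLOCK;
    let exists = bitvec & (1 << pos_in_block) != 0;
    // Rank: number of set bits up to and including pos_in_block
    // (the real count_ones special-cases position 63 to avoid shift overflow).
    let ones = (bitvec & ((1u64 << (pos_in_block + 1)) - 1)).count_ones();
    assert!(exists);
    // Dense (codec) index = running offset + rank - 1.
    assert_eq!(offset + ones - 1, 2);
}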

View File

@@ -1,13 +0,0 @@
pub use dense::{serialize_dense_codec, DenseCodec};
mod dense;
#[inline]
fn get_bit_at(input: u64, n: u32) -> bool {
input & (1 << n) != 0
}
#[inline]
fn set_bit_at(input: &mut u64, n: u64) {
*input |= 1 << n;
}

View File

@@ -17,9 +17,9 @@
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
use std::io;
use std::num::NonZeroU64;
use std::sync::Arc;
use std::{fmt, io};
use common::{BinarySerializable, VInt};
use log::warn;
@@ -168,7 +168,7 @@ impl BinarySerializable for Header {
/// Return estimated compression for given codec in the value range [0.0..1.0], where 1.0 means no
/// compression.
pub fn estimate<T: MonotonicallyMappableToU64>(
pub fn estimate<T: MonotonicallyMappableToU64 + fmt::Debug>(
typed_column: impl Column<T>,
codec_type: FastFieldCodecType,
) -> Option<f32> {
@@ -214,7 +214,7 @@ pub fn serialize_u128<F: Fn() -> I, I: Iterator<Item = u128>>(
}
/// Serializes the column with the codec with the best estimate on the data.
pub fn serialize<T: MonotonicallyMappableToU64>(
pub fn serialize<T: MonotonicallyMappableToU64 + fmt::Debug>(
typed_column: impl Column<T>,
output: &mut impl io::Write,
codecs: &[FastFieldCodecType],
@@ -294,7 +294,7 @@ fn serialize_given_codec(
}
/// Helper function to serialize a column (autodetect from all codecs) and then open it
pub fn serialize_and_load<T: MonotonicallyMappableToU64 + Ord + Default>(
pub fn serialize_and_load<T: MonotonicallyMappableToU64 + Ord + Default + fmt::Debug>(
column: &[T],
) -> Arc<dyn Column<T>> {
let mut buffer = Vec::new();

View File

@@ -145,7 +145,7 @@ impl FastFieldType {
mod tests {
use std::collections::HashMap;
use std::ops::Range;
use std::ops::{Range, RangeInclusive};
use std::path::Path;
use std::sync::Arc;
@@ -159,7 +159,9 @@ mod tests {
use super::*;
use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr};
use crate::merge_policy::NoMergePolicy;
use crate::schema::{Cardinality, Document, Field, Schema, SchemaBuilder, FAST, STRING, TEXT};
use crate::schema::{
Cardinality, Document, Field, Schema, SchemaBuilder, FAST, INDEXED, STRING, TEXT,
};
use crate::time::OffsetDateTime;
use crate::{DateOptions, DatePrecision, Index, SegmentId, SegmentReader};
@@ -969,4 +971,117 @@ mod tests {
}
Ok(len)
}
#[test]
fn test_gcd_bug_regression_1757() {
let mut schema_builder = Schema::builder();
let num_field = schema_builder.add_u64_field("url_norm_hash", FAST | INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut writer = index.writer_for_tests().unwrap();
writer
.add_document(doc! {
num_field => 100u64,
})
.unwrap();
writer
.add_document(doc! {
num_field => 200u64,
})
.unwrap();
writer
.add_document(doc! {
num_field => 300u64,
})
.unwrap();
writer.commit().unwrap();
}
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let segment = &searcher.segment_readers()[0];
let field = segment.fast_fields().u64(num_field).unwrap();
let numbers = vec![100, 200, 300];
let test_range = |range: RangeInclusive<u64>| {
let expected_count = numbers.iter().filter(|num| range.contains(num)).count();
let mut vec = vec![];
field.get_docids_for_value_range(range, 0..u32::MAX, &mut vec);
assert_eq!(vec.len(), expected_count);
};
test_range(50..=50);
test_range(150..=150);
test_range(350..=350);
test_range(100..=250);
test_range(101..=200);
test_range(101..=199);
test_range(100..=300);
test_range(100..=299);
}
#[test]
fn test_mapping_bug_docids_for_value_range() {
let mut schema_builder = Schema::builder();
let num_field = schema_builder.add_u64_field("url_norm_hash", FAST | INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
// Values without gcd, but with min_value
let mut writer = index.writer_for_tests().unwrap();
writer
.add_document(doc! {
num_field => 1000u64,
})
.unwrap();
writer
.add_document(doc! {
num_field => 1001u64,
})
.unwrap();
writer
.add_document(doc! {
num_field => 1003u64,
})
.unwrap();
writer.commit().unwrap();
}
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let segment = &searcher.segment_readers()[0];
let field = segment.fast_fields().u64(num_field).unwrap();
let numbers = vec![1000, 1001, 1003];
let test_range = |range: RangeInclusive<u64>| {
let expected_count = numbers.iter().filter(|num| range.contains(num)).count();
let mut vec = vec![];
field.get_docids_for_value_range(range, 0..u32::MAX, &mut vec);
assert_eq!(vec.len(), expected_count);
};
let test_range_variant = |start, stop| {
let start_range = start..=stop;
test_range(start_range);
let start_range = start..=(stop - 1);
test_range(start_range);
let start_range = start..=(stop + 1);
test_range(start_range);
let start_range = (start - 1)..=stop;
test_range(start_range);
let start_range = (start - 1)..=(stop - 1);
test_range(start_range);
let start_range = (start - 1)..=(stop + 1);
test_range(start_range);
let start_range = (start + 1)..=stop;
test_range(start_range);
let start_range = (start + 1)..=(stop - 1);
test_range(start_range);
let start_range = (start + 1)..=(stop + 1);
test_range(start_range);
};
test_range_variant(50, 50);
test_range_variant(1000, 1000);
test_range_variant(1000, 1002);
}
}

View File

@@ -1,3 +1,4 @@
use std::fmt;
use std::io::{self, Write};
pub use fastfield_codecs::Column;
@@ -49,7 +50,7 @@ impl CompositeFastFieldSerializer {
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
/// automatically.
pub fn create_auto_detect_u64_fast_field<T: MonotonicallyMappableToU64>(
pub fn create_auto_detect_u64_fast_field<T: MonotonicallyMappableToU64 + fmt::Debug>(
&mut self,
field: Field,
fastfield_accessor: impl Column<T>,
@@ -59,7 +60,9 @@ impl CompositeFastFieldSerializer {
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
/// automatically.
pub fn create_auto_detect_u64_fast_field_with_idx<T: MonotonicallyMappableToU64>(
pub fn create_auto_detect_u64_fast_field_with_idx<
T: MonotonicallyMappableToU64 + fmt::Debug,
>(
&mut self,
field: Field,
fastfield_accessor: impl Column<T>,
@@ -72,7 +75,9 @@ impl CompositeFastFieldSerializer {
/// Serialize data into a new u64 fast field. The best compression codec of the provided
/// codecs will be chosen.
pub fn create_auto_detect_u64_fast_field_with_idx_and_codecs<T: MonotonicallyMappableToU64>(
pub fn create_auto_detect_u64_fast_field_with_idx_and_codecs<
T: MonotonicallyMappableToU64 + fmt::Debug,
>(
&mut self,
field: Field,
fastfield_accessor: impl Column<T>,

View File

@@ -447,8 +447,8 @@ impl SegmentUpdater {
let segment_entries = segment_updater.purge_deletes(opstamp)?;
segment_updater.segment_manager.commit(segment_entries);
segment_updater.save_metas(opstamp, payload)?;
// let _ = garbage_collect_files(segment_updater.clone());
// segment_updater.consider_merge_options();
let _ = garbage_collect_files(segment_updater.clone());
segment_updater.consider_merge_options();
Ok(opstamp)
})
}

View File

@@ -90,7 +90,7 @@ impl CheckpointBlock {
return Ok(());
}
let mut doc = read_u32_vint(data);
let mut start_offset = read_u32_vint(data) as usize;
let mut start_offset = VInt::deserialize_u64(data)? as usize;
for _ in 0..len {
let num_docs = read_u32_vint(data);
let block_num_bytes = read_u32_vint(data) as usize;
@@ -147,6 +147,15 @@ mod tests {
test_aux_ser_deser(&checkpoints)
}
#[test]
fn test_block_serialize_large_byte_range() -> io::Result<()> {
let checkpoints = vec![Checkpoint {
doc_range: 10..12,
byte_range: 8_000_000_000..9_000_000_000,
}];
test_aux_ser_deser(&checkpoints)
}
#[test]
fn test_block_serialize() -> io::Result<()> {
let offsets: Vec<usize> = (0..11).map(|i| i * i * i).collect();