Disable GC and merge checker.

Merge pull request #1711 from quickwit-oss/sparse_dense_index
add dense codec
2026-02-23 08:10:36 +00:00 · 2022-12-11 14:04:20 +00:00 · 2022-12-09 08:48:43 +01:00 · 2022-12-09 15:21:25 +08:00 · 2022-12-09 08:01:56 +01:00 · 2022-12-09 08:01:02 +01:00
10 changed files with 179 additions and 75 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.19.0-dev"
+version = "0.19.0"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
@@ -36,11 +36,6 @@ fs2 = { version = "0.4.3", optional = true }
 levenshtein_automata = "0.2.1"
 uuid = { version = "1.0.0", features = ["v4", "serde"] }
 crossbeam-channel = "0.5.4"
-tantivy-query-grammar = { version="0.18.0", path="./query-grammar" }
-tantivy-bitpacker = { version="0.2", path="./bitpacker" }
-common = { version = "0.3", path = "./common/", package = "tantivy-common" }
-fastfield_codecs = { version="0.2", path="./fastfield_codecs", default-features = false }
-ownedbytes = { version="0.3", path="./ownedbytes" }
 stable_deref_trait = "1.2.0"
 rust-stemmers = "1.2.0"
 downcast-rs = "1.2.0"
@@ -62,6 +57,12 @@ ciborium = { version = "0.2", optional = true}
 async-trait = "0.1.53"
 arc-swap = "1.5.0"

+tantivy-query-grammar = { version= "0.19.0", path="./query-grammar" }
+tantivy-bitpacker = 		{ version= "0.3", path="./bitpacker" }
+common = 								{ version= "0.4", path = "./common/", package = "tantivy-common" }
+fastfield_codecs = 			{ version= "0.3", path="./fastfield_codecs", default-features = false }
+ownedbytes = 						{ version= "0.4", path="./ownedbytes" }
+
 [target.'cfg(windows)'.dependencies]
 winapi = "0.3.9"

--- a/bitpacker/Cargo.toml
+++ b/bitpacker/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tantivy-bitpacker"
-version = "0.2.0"
+version = "0.3.0"
 edition = "2021"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
@@ -8,6 +8,8 @@ categories = []
 description = """Tantivy-sub crate: bitpacking"""
 repository = "https://github.com/quickwit-oss/tantivy"
 keywords = []
+documentation = "https://docs.rs/tantivy-bitpacker/latest/tantivy_bitpacker"
+homepage = "https://github.com/quickwit-oss/tantivy"


 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
--- a/common/Cargo.toml
+++ b/common/Cargo.toml
@@ -1,16 +1,20 @@
 [package]
 name = "tantivy-common"
-version = "0.3.0"
+version = "0.4.0"
 authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
 license = "MIT"
 edition = "2021"
 description = "common traits and utility functions used by multiple tantivy subcrates"
+documentation = "https://docs.rs/tantivy_common/"
+homepage = "https://github.com/quickwit-oss/tantivy"
+repository = "https://github.com/quickwit-oss/tantivy"
+

 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

 [dependencies]
 byteorder = "1.4.3"
-ownedbytes = { version="0.3", path="../ownedbytes" }
+ownedbytes = { version= "0.4", path="../ownedbytes" }

 [dev-dependencies]
 proptest = "1.0.0"
--- a/fastfield_codecs/Cargo.toml
+++ b/fastfield_codecs/Cargo.toml
@@ -1,17 +1,20 @@
 [package]
 name = "fastfield_codecs"
-version = "0.2.0"
+version = "0.3.0"
 authors = ["Pascal Seitz <pascal@quickwit.io>"]
 license = "MIT"
 edition = "2021"
 description = "Fast field codecs used by tantivy"
+documentation = "https://docs.rs/fastfield_codecs/"
+homepage = "https://github.com/quickwit-oss/tantivy"
+repository = "https://github.com/quickwit-oss/tantivy"

 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

 [dependencies]
-common = { version = "0.3", path = "../common/", package = "tantivy-common" }
-tantivy-bitpacker = { version="0.2", path = "../bitpacker/" }
-ownedbytes = { version = "0.3.0", path = "../ownedbytes" }
+common = { version = "0.4", path = "../common/", package = "tantivy-common" }
+tantivy-bitpacker = { version= "0.3", path = "../bitpacker/" }
+ownedbytes = { version = "0.4.0", path = "../ownedbytes" }
 prettytable-rs = {version="0.9.0", optional= true}
 rand = {version="0.8.3", optional= true}
 fastdivide = "0.4"
--- a/fastfield_codecs/src/lib.rs
+++ b/fastfield_codecs/src/lib.rs
@@ -44,6 +44,7 @@ mod column;
 mod gcd;
 mod serialize;

+/// TODO: remove when codec is used
 pub use null_index::*;

 use self::bitpacked::BitpackedCodec;
--- a/fastfield_codecs/src/null_index/dense.rs
+++ b/fastfield_codecs/src/null_index/dense.rs
@@ -1,3 +1,4 @@
+use std::convert::TryInto;
 use std::io::{self, Write};

 use common::BinarySerializable;
@@ -30,13 +31,28 @@ const BLOCK_BITVEC_SIZE: usize = 8;
 const BLOCK_OFFSET_SIZE: usize = 4;
 const SERIALIZED_BLOCK_SIZE: usize = BLOCK_BITVEC_SIZE + BLOCK_OFFSET_SIZE;

-fn count_ones(block: u64, pos_in_block: u32) -> u32 {
-    if pos_in_block == 63 {
-        block.count_ones()
+#[inline]
+fn count_ones(bitvec: u64, pos_in_bitvec: u32) -> u32 {
+    if pos_in_bitvec == 63 {
+        bitvec.count_ones()
    } else {
-        let mask = (1u64 << (pos_in_block + 1)) - 1;
-        let masked_block = block & mask;
-        masked_block.count_ones()
+        let mask = (1u64 << (pos_in_bitvec + 1)) - 1;
+        let masked_bitvec = bitvec & mask;
+        masked_bitvec.count_ones()
+    }
+}
+
+#[derive(Clone, Copy)]
+struct DenseIndexBlock {
+    bitvec: u64,
+    offset: u32,
+}
+
+impl From<[u8; SERIALIZED_BLOCK_SIZE]> for DenseIndexBlock {
+    fn from(data: [u8; SERIALIZED_BLOCK_SIZE]) -> Self {
+        let bitvec = u64::from_le_bytes(data[..BLOCK_BITVEC_SIZE].try_into().unwrap());
+        let offset = u32::from_le_bytes(data[BLOCK_BITVEC_SIZE..].try_into().unwrap());
+        Self { bitvec, offset }
    }
 }

@@ -49,51 +65,33 @@ impl DenseCodec {
    /// Check if value at position is not null.
    pub fn exists(&self, idx: u32) -> bool {
        let block_pos = idx / ELEMENTS_PER_BLOCK;
-        let bitvec: u64 = self.block(block_pos as usize);
+        let bitvec = self.dense_index_block(block_pos).bitvec;

-        let pos_in_block = idx % ELEMENTS_PER_BLOCK;
+        let pos_in_bitvec = idx % ELEMENTS_PER_BLOCK;

-        get_bit_at(bitvec, pos_in_block)
+        get_bit_at(bitvec, pos_in_bitvec)
    }
    #[inline]
-    pub(crate) fn block(&self, block_pos: usize) -> u64 {
-        let data = &mut &self.data[block_pos as usize * SERIALIZED_BLOCK_SIZE..];
-
-        let block: u64 =
-            BinarySerializable::deserialize(data).expect("could not read block in null index");
-        block
-    }
-
-    #[inline]
-    /// Returns (bitvec, offset)
-    ///
-    /// offset is the start offset of actual docids in the block.
-    pub(crate) fn block_and_offset(&self, block_pos: u32) -> (u64, u32) {
-        let data = &mut &self.data[block_pos as usize * SERIALIZED_BLOCK_SIZE..];
-
-        let block: u64 =
-            BinarySerializable::deserialize(data).expect("could not read block in null index");
-        let offset: u32 =
-            BinarySerializable::deserialize(data).expect("could not read block in null index");
-        (block, offset)
+    fn dense_index_block(&self, block_pos: u32) -> DenseIndexBlock {
+        dense_index_block(&self.data, block_pos)
    }

    /// Return the number of non-null values in an index
    pub fn num_non_null_vals(&self) -> u32 {
        let last_block = (self.data.len() / SERIALIZED_BLOCK_SIZE) - 1;
-        self.block_and_offset(last_block as u32).1
+        self.dense_index_block(last_block as u32).offset
    }

    #[inline]
    /// Translate from the original index to the codec index.
    pub fn translate_to_codec_idx(&self, idx: u32) -> Option<u32> {
        let block_pos = idx / ELEMENTS_PER_BLOCK;
-        let (block, offset) = self.block_and_offset(block_pos);
-        let pos_in_block = idx % ELEMENTS_PER_BLOCK;
-        if get_bit_at(block, pos_in_block) {
-            let ones_in_block = count_ones(block, pos_in_block);
-            Some(offset + ones_in_block - 1) // -1 is ok, since idx does exist, so there's at least
-                                             // one
+        let index_block = self.dense_index_block(block_pos);
+        let pos_in_block_bit_vec = idx % ELEMENTS_PER_BLOCK;
+        let ones_in_block = count_ones(index_block.bitvec, pos_in_block_bit_vec);
+        if get_bit_at(index_block.bitvec, pos_in_block_bit_vec) {
+            // -1 is ok, since idx does exist, so there's at least one
+            Some(index_block.offset + ones_in_block - 1)
        } else {
            None
        }
@@ -112,17 +110,17 @@ impl DenseCodec {
        iter.map(move |dense_idx| {
            // update block_pos to limit search scope
            block_pos = find_block(dense_idx, block_pos, &self.data);
-            let (bitvec, offset) = self.block_and_offset(block_pos);
+            let index_block = self.dense_index_block(block_pos);

            // The next offset is higher than dense_idx and therefore:
            // dense_idx <= offset + num_set_bits in block
            let mut num_set_bits = 0;
-            for idx_in_block in 0..ELEMENTS_PER_BLOCK {
-                if get_bit_at(bitvec, idx_in_block) {
+            for idx_in_bitvec in 0..ELEMENTS_PER_BLOCK {
+                if get_bit_at(index_block.bitvec, idx_in_bitvec) {
                    num_set_bits += 1;
                }
-                if num_set_bits == (dense_idx - offset + 1) {
-                    let orig_idx = block_pos * ELEMENTS_PER_BLOCK + idx_in_block as u32;
+                if num_set_bits == (dense_idx - index_block.offset + 1) {
+                    let orig_idx = block_pos * ELEMENTS_PER_BLOCK + idx_in_bitvec as u32;
                    return orig_idx;
                }
            }
@@ -131,6 +129,15 @@ impl DenseCodec {
    }
 }

+#[inline]
+fn dense_index_block(data: &[u8], block_pos: u32) -> DenseIndexBlock {
+    let data_start_pos = block_pos as usize * SERIALIZED_BLOCK_SIZE;
+    let block_data: [u8; SERIALIZED_BLOCK_SIZE] = data[data_start_pos..][..SERIALIZED_BLOCK_SIZE]
+        .try_into()
+        .unwrap();
+    block_data.into()
+}
+
 #[inline]
 /// Finds the block position containing the dense_idx.
 ///
@@ -140,11 +147,8 @@ impl DenseCodec {
 /// The last offset number is equal to the number of values in the index.
 fn find_block(dense_idx: u32, mut block_pos: u32, data: &[u8]) -> u32 {
    loop {
-        let data = &mut &data[BLOCK_BITVEC_SIZE + block_pos as usize * SERIALIZED_BLOCK_SIZE..];
-        let offset: u32 = BinarySerializable::deserialize(data)
-            .expect("could not read offset from block in null index");
+        let offset = dense_index_block(data, block_pos).offset;
        if offset > dense_idx {
-            // offset
            return block_pos - 1;
        }
        block_pos += 1;
@@ -330,32 +334,115 @@ mod bench {

    use super::*;

-    fn gen_bools() -> DenseCodec {
+    const TOTAL_NUM_VALUES: u32 = 1_000_000;
+    fn gen_bools(fill_ratio: f64) -> DenseCodec {
        let mut out = Vec::new();
        let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
-        // 80% of values are set
-        let bools: Vec<_> = (0..100_000).map(|_| rng.gen_bool(8f64 / 10f64)).collect();
+        let bools: Vec<_> = (0..TOTAL_NUM_VALUES)
+            .map(|_| rng.gen_bool(fill_ratio))
+            .collect();
        serialize_dense_codec(bools.into_iter(), &mut out).unwrap();

        let codec = DenseCodec::open(OwnedBytes::new(out));
        codec
    }

-    #[bench]
-    fn bench_dense_codec_translate_orig_to_dense(bench: &mut Bencher) {
-        let codec = gen_bools();
-        bench.iter(|| {
-            let mut dense_idx: Option<u32> = None;
-            for idx in 0..100_000 {
-                dense_idx = codec.translate_to_codec_idx(idx);
+    fn random_range_iterator(start: u32, end: u32, step_size: u32) -> impl Iterator<Item = u32> {
+        let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
+        let mut current = start;
+        std::iter::from_fn(move || {
+            current += rng.gen_range(1..step_size + 1);
+            if current >= end {
+                None
+            } else {
+                Some(current)
            }
-            dense_idx
+        })
+    }
+
+    fn walk_over_data(codec: &DenseCodec, max_step_size: u32) -> Option<u32> {
+        walk_over_data_from_positions(
+            codec,
+            random_range_iterator(0, TOTAL_NUM_VALUES, max_step_size),
+        )
+    }
+
+    fn walk_over_data_from_positions(
+        codec: &DenseCodec,
+        positions: impl Iterator<Item = u32>,
+    ) -> Option<u32> {
+        let mut dense_idx: Option<u32> = None;
+        for idx in positions {
+            dense_idx = dense_idx.or(codec.translate_to_codec_idx(idx));
+        }
+        dense_idx
+    }
+
+    #[bench]
+    fn bench_dense_codec_translate_orig_to_dense_90percent_filled_random_stride(
+        bench: &mut Bencher,
+    ) {
+        let codec = gen_bools(0.9f64);
+        bench.iter(|| walk_over_data(&codec, 100));
+    }
+
+    #[bench]
+    fn bench_dense_codec_translate_orig_to_dense_50percent_filled_random_stride(
+        bench: &mut Bencher,
+    ) {
+        let codec = gen_bools(0.5f64);
+        bench.iter(|| walk_over_data(&codec, 100));
+    }
+
+    #[bench]
+    fn bench_dense_codec_translate_orig_to_dense_full_scan_10percent(bench: &mut Bencher) {
+        let codec = gen_bools(0.1f64);
+        bench.iter(|| walk_over_data_from_positions(&codec, 0..TOTAL_NUM_VALUES));
+    }
+
+    #[bench]
+    fn bench_dense_codec_translate_orig_to_dense_full_scan_90percent(bench: &mut Bencher) {
+        let codec = gen_bools(0.9f64);
+        bench.iter(|| walk_over_data_from_positions(&codec, 0..TOTAL_NUM_VALUES));
+    }
+
+    #[bench]
+    fn bench_dense_codec_translate_orig_to_dense_10percent_filled_random_stride(
+        bench: &mut Bencher,
+    ) {
+        let codec = gen_bools(0.1f64);
+        bench.iter(|| walk_over_data(&codec, 100));
+    }
+
+    #[bench]
+    fn bench_dense_codec_translate_dense_to_orig_90percent_filled_random_stride_big_step(
+        bench: &mut Bencher,
+    ) {
+        let codec = gen_bools(0.9f64);
+        let num_vals = codec.num_non_null_vals();
+        bench.iter(|| {
+            codec
+                .translate_codec_idx_to_original_idx(random_range_iterator(0, num_vals, 50_000))
+                .last()
        });
    }

    #[bench]
-    fn bench_dense_codec_translate_dense_to_orig(bench: &mut Bencher) {
-        let codec = gen_bools();
+    fn bench_dense_codec_translate_dense_to_orig_90percent_filled_random_stride(
+        bench: &mut Bencher,
+    ) {
+        let codec = gen_bools(0.9f64);
+        let num_vals = codec.num_non_null_vals();
+        bench.iter(|| {
+            codec
+                .translate_codec_idx_to_original_idx(random_range_iterator(0, num_vals, 100))
+                .last()
+        });
+    }
+
+    #[bench]
+    fn bench_dense_codec_translate_dense_to_orig_90percent_filled_full_scan(bench: &mut Bencher) {
+        let codec = gen_bools(0.9f64);
        let num_vals = codec.num_non_null_vals();
        bench.iter(|| {
            codec
--- a/fastfield_codecs/src/null_index/mod.rs
+++ b/fastfield_codecs/src/null_index/mod.rs
@@ -2,10 +2,12 @@ pub use dense::{serialize_dense_codec, DenseCodec};

 mod dense;

+#[inline]
 fn get_bit_at(input: u64, n: u32) -> bool {
    input & (1 << n) != 0
 }

+#[inline]
 fn set_bit_at(input: &mut u64, n: u64) {
    *input |= 1 << n;
 }
--- a/ownedbytes/Cargo.toml
+++ b/ownedbytes/Cargo.toml
@@ -1,10 +1,14 @@
 [package]
 authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
 name = "ownedbytes"
-version = "0.3.0"
+version = "0.4.0"
 edition = "2021"
 description = "Expose data as static slice"
 license = "MIT"
+documentation = "https://docs.rs/ownedbytes/"
+homepage = "https://github.com/quickwit-oss/tantivy"
+repository = "https://github.com/quickwit-oss/tantivy"
+
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

 [dependencies]
--- a/query-grammar/Cargo.toml
+++ b/query-grammar/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tantivy-query-grammar"
-version = "0.18.0"
+version = "0.19.0"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
--- a/src/indexer/segment_updater.rs
+++ b/src/indexer/segment_updater.rs
@@ -447,8 +447,8 @@ impl SegmentUpdater {
            let segment_entries = segment_updater.purge_deletes(opstamp)?;
            segment_updater.segment_manager.commit(segment_entries);
            segment_updater.save_metas(opstamp, payload)?;
-            let _ = garbage_collect_files(segment_updater.clone());
-            segment_updater.consider_merge_options();
+            // let _ = garbage_collect_files(segment_updater.clone());
+            // segment_updater.consider_merge_options();
            Ok(opstamp)
        })
    }
Author	SHA1	Message	Date
ChillFish8	1e50f96fb0	Disable GC and merge checker.	2022-12-11 14:04:20 +00:00
PSeitz	a05a0035f8	Merge pull request #1711 from quickwit-oss/sparse_dense_index add dense codec	2022-12-09 08:48:43 +01:00
Pascal Seitz	976128a412	extend benchmarks	2022-12-09 15:21:25 +08:00
PSeitz	f27b3e312d	Apply suggestions from code review Co-authored-by: Paul Masurel <paul@quickwit.io>	2022-12-09 08:01:56 +01:00
PSeitz	56dea6f08d	Apply suggestions from code review Co-authored-by: Paul Masurel <paul@quickwit.io>	2022-12-09 08:01:02 +01:00
Pascal Seitz	789d29cf45	move code to DenseIndexBlock improve benchmark	2022-12-09 14:18:26 +08:00
Paul Masurel	a36b50d825	benchmark fix and important optimisation	2022-12-08 18:55:20 +09:00
PSeitz	09f65e5467	Merge pull request #1707 from quickwit-oss/bump_version bump version	2022-12-08 09:03:47 +01:00
Pascal Seitz	11b01e4141	chore: Release	2022-12-02 16:45:18 +08:00
Pascal Seitz	3e8852c606	revert tant version	2022-12-02 16:44:34 +08:00
Pascal Seitz	725f1ecb80	update cargo.toml	2022-12-02 16:43:17 +08:00
Pascal Seitz	afa27afe7d	group workspace deps	2022-12-02 16:31:30 +08:00