clippy

refactor postings (#2709)
Rename shallow_seek to seek_block and remove full_block from the public postings API. This is in preparation for optionally handling Bitsets in the postings.
2026-01-04 00:02:55 +00:00 · 2025-10-08 17:07:07 +02:00 · 2025-10-08 16:55:25 +02:00 · 2025-10-08 16:47:09 +02:00 · 2025-09-24 10:58:46 +02:00 · 2025-09-22 16:32:49 +02:00
151 changed files with 3540 additions and 1511 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,34 @@
-Tantivy 0.23 - Unreleased
+Tantivy 0.25
 ================================
-Tantivy 0.23 will be backwards compatible with indices created with v0.22 and v0.21. The new minimum rust version will be 1.75.
+
+## Bugfixes
+- fix union performance regression in tantivy 0.24 [#2663](https://github.com/quickwit-oss/tantivy/pull/2663)(@PSeitz)
+- make zstd optional in sstable [#2633](https://github.com/quickwit-oss/tantivy/pull/2633)(@Parth)
+- Fix TopDocs::order_by_string_fast_field for asc order [#2672](https://github.com/quickwit-oss/tantivy/pull/2672)(@stuhood @PSeitz)
+
+## Features/Improvements
+- add docs/example and Vec<u32> values to sstable [#2660](https://github.com/quickwit-oss/tantivy/pull/2660)(@PSeitz)
+- Add string fast field support to `TopDocs`. [#2642](https://github.com/quickwit-oss/tantivy/pull/2642)(@stuhood)
+- update edition to 2024 [#2620](https://github.com/quickwit-oss/tantivy/pull/2620)(@PSeitz)
+- Allow optional spaces between the field name and the value in the query parser [#2678](https://github.com/quickwit-oss/tantivy/pull/2678)(@Darkheir)
+- Support mixed field types in query parser [#2676](https://github.com/quickwit-oss/tantivy/pull/2676)(@trinity-1686a)
+- Add per-field size details [#2679](https://github.com/quickwit-oss/tantivy/pull/2679)(@fulmicoton)
+
+Tantivy 0.24.2
+================================
+- Fix TopNComputer for reverse order. [#2672](https://github.com/quickwit-oss/tantivy/pull/2672)(@stuhood @PSeitz) 
+
+Affected queries are [order_by_fast_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_fast_field) and
+[order_by_u64_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_u64_field)
+for `Order::Asc`
+
+Tantivy 0.24.1
+================================
+- Fix: bump required rust version to 1.81
+  
+Tantivy 0.24
+================================
+Tantivy 0.24 will be backwards compatible with indices created with v0.22 and v0.21. The new minimum rust version will be 1.75. Tantivy 0.23 will be skipped.

 #### Bugfixes
 - fix potential endless loop in merge [#2457](https://github.com/quickwit-oss/tantivy/pull/2457)(@PSeitz)
@@ -80,6 +108,14 @@ This will slightly increase space and access time. [#2439](https://github.com/qu
 - Fix trait bound of StoreReader::iter [#2360](https://github.com/quickwit-oss/tantivy/pull/2360)(@adamreichold)
 - remove read_postings_no_deletes [#2526](https://github.com/quickwit-oss/tantivy/pull/2526)(@PSeitz)

+Tantivy 0.22.1
+================================
+- Fix TopNComputer for reverse order. [#2672](https://github.com/quickwit-oss/tantivy/pull/2672)(@stuhood @PSeitz) 
+
+Affected queries are [order_by_fast_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_fast_field) and
+[order_by_u64_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_u64_field)
+for `Order::Asc`
+
 Tantivy 0.22
 ================================

--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.24.2"
+version = "0.25.0"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
@@ -11,7 +11,7 @@ repository = "https://github.com/quickwit-oss/tantivy"
 readme = "README.md"
 keywords = ["search", "information", "retrieval"]
 edition = "2021"
-rust-version = "1.81"
+rust-version = "1.85"
 exclude = ["benches/*.json", "benches/*.txt"]

 [dependencies]
@@ -33,7 +33,7 @@ tempfile = { version = "3.12.0", optional = true }
 log = "0.4.16"
 serde = { version = "1.0.219", features = ["derive"] }
 serde_json = "1.0.140"
-fs4 = { version = "0.8.0", optional = true }
+fs4 = { version = "0.13.1", optional = true }
 levenshtein_automata = "0.2.1"
 uuid = { version = "1.0.0", features = ["v4", "serde"] }
 crossbeam-channel = "0.5.4"
@@ -57,13 +57,13 @@ measure_time = "0.9.0"
 arc-swap = "1.5.0"
 bon = "3.3.1"

-columnar = { version = "0.5", path = "./columnar", package = "tantivy-columnar" }
-sstable = { version = "0.5", path = "./sstable", package = "tantivy-sstable", optional = true }
-stacker = { version = "0.5", path = "./stacker", package = "tantivy-stacker" }
-query-grammar = { version = "0.24.0", path = "./query-grammar", package = "tantivy-query-grammar" }
-tantivy-bitpacker = { version = "0.8", path = "./bitpacker" }
-common = { version = "0.9", path = "./common/", package = "tantivy-common" }
-tokenizer-api = { version = "0.5", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
+columnar = { version = "0.6", path = "./columnar", package = "tantivy-columnar" }
+sstable = { version = "0.6", path = "./sstable", package = "tantivy-sstable", optional = true }
+stacker = { version = "0.6", path = "./stacker", package = "tantivy-stacker" }
+query-grammar = { version = "0.25.0", path = "./query-grammar", package = "tantivy-query-grammar" }
+tantivy-bitpacker = { version = "0.9", path = "./bitpacker" }
+common = { version = "0.10", path = "./common/", package = "tantivy-common" }
+tokenizer-api = { version = "0.6", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
 sketches-ddsketch = { version = "0.3.0", features = ["use_serde"] }
 hyperloglogplus = { version = "0.4.1", features = ["const-loop"] }
 futures-util = { version = "0.3.28", optional = true }
@@ -112,13 +112,16 @@ debug-assertions = true
 overflow-checks = true

 [features]
-default = ["mmap", "stopwords", "lz4-compression"]
+default = ["mmap", "stopwords", "lz4-compression", "columnar-zstd-compression"]
 mmap = ["fs4", "tempfile", "memmap2"]
 stopwords = []

 lz4-compression = ["lz4_flex"]
 zstd-compression = ["zstd"]

+# enable zstd-compression in columnar (and sstable)
+columnar-zstd-compression = ["columnar/zstd-compression"]
+
 failpoints = ["fail", "fail/failpoints"]
 unstable = []                            # useful for benches.

@@ -164,3 +167,12 @@ harness = false
 [[bench]]
 name = "agg_bench"
 harness = false
+
+[[bench]]
+name = "exists_json"
+harness = false
+
+[[bench]]
+name = "and_or_queries"
+harness = false
+
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,4 +1,4 @@
-# Release a new Tantivy Version
+# Releasing a new Tantivy Version

 ## Steps

@@ -10,12 +10,29 @@
 6. Set git tag with new version


-In conjucation with `cargo-release` Steps 1-4 (I'm not sure if the change detection works):
-Set new packages to version 0.0.0
+[`cargo-release`](https://github.com/crate-ci/cargo-release) will help us with steps 1-5:

 Replace prev-tag-name
 ```bash
-cargo release --workspace --no-publish -v --prev-tag-name 0.19 --push-remote origin minor --no-tag --execute
+cargo release --workspace --no-publish -v --prev-tag-name 0.24 --push-remote origin minor --no-tag
 ```

-no-tag or it will create tags for all the subpackages
+`no-tag` or it will create tags for all the subpackages
+
+cargo release will _not_ ignore unchanged packages, but it will print warnings for them.
+e.g. "warning: updating ownedbytes to 0.10.0 despite no changes made since tag 0.24"
+
+We need to manually ignore these unchanged packages
+```bash
+cargo release --workspace --no-publish -v --prev-tag-name 0.24 --push-remote origin minor --no-tag --exclude tokenizer-api
+```
+
+Add `--execute` to actually perform the release steps; without it, cargo release does a dry run and only prints the commands that would be run.
+
+### Tag Version
+```bash
+git tag 0.25.0
+git push upstream tag 0.25.0
+```
+
+
--- a/benches/and_or_queries.rs
+++ b/benches/and_or_queries.rs
@@ -0,0 +1,224 @@
+// Benchmarks boolean AND/OR (conjunction and disjunction) queries using binggan.
+//
+// What’s measured:
+// - OR and AND queries with varying selectivity (for now, the leaves are only `Term` queries)
+// - Nested AND/OR combinations (on multiple fields)
+// - No-scoring path using the Count collector (focus on iterator/skip performance)
+// - Top-K retrieval (k=10) using the TopDocs collector
+//
+// Corpus model:
+// - Synthetic docs; each token a/b/c is independently included per doc
+// - If none of a/b/c are included, emit a neutral filler token to keep doc length similar
+//
+// Notes:
+// - After optimization, when scoring is disabled Tantivy reads doc-only postings
+//   (IndexRecordOption::Basic), avoiding frequency decoding overhead.
+// - This bench isolates boolean iteration speed and intersection/union cost.
+// - Use `cargo bench --bench and_or_queries` to run.
+
+use binggan::{black_box, BenchRunner};
+use rand::prelude::*;
+use rand::rngs::StdRng;
+use rand::SeedableRng;
+use tantivy::collector::{Count, TopDocs};
+use tantivy::query::QueryParser;
+use tantivy::schema::{Schema, TEXT};
+use tantivy::{doc, Index, ReloadPolicy, Searcher};
+
+#[derive(Clone)]
+struct BenchIndex {
+    #[allow(dead_code)]
+    index: Index,
+    searcher: Searcher,
+    query_parser: QueryParser,
+}
+
+impl BenchIndex {
+    #[inline(always)]
+    fn count_query(&self, query_str: &str) -> usize {
+        let query = self.query_parser.parse_query(query_str).unwrap();
+        self.searcher.search(&query, &Count).unwrap()
+    }
+
+    #[inline(always)]
+    fn topk_len(&self, query_str: &str, k: usize) -> usize {
+        let query = self.query_parser.parse_query(query_str).unwrap();
+        self.searcher
+            .search(&query, &TopDocs::with_limit(k))
+            .unwrap()
+            .len()
+    }
+}
+
+/// Build a single index containing both fields (title, body) and
+/// return two BenchIndex views:
+/// - single_field: QueryParser defaults to only "body"
+/// - multi_field:  QueryParser defaults to ["title", "body"]
+fn build_shared_indices(num_docs: usize, p_a: f32, p_b: f32, p_c: f32) -> (BenchIndex, BenchIndex) {
+    // Unified schema (two text fields)
+    let mut schema_builder = Schema::builder();
+    let f_title = schema_builder.add_text_field("title", TEXT);
+    let f_body = schema_builder.add_text_field("body", TEXT);
+    let schema = schema_builder.build();
+    let index = Index::create_in_ram(schema.clone());
+
+    // Populate index with stable RNG for reproducibility.
+    let mut rng = StdRng::from_seed([7u8; 32]);
+
+    // Populate: spread each present token 90/10 to body/title
+    {
+        let mut writer = index.writer(500_000_000).unwrap();
+        for _ in 0..num_docs {
+            let has_a = rng.gen_bool(p_a as f64);
+            let has_b = rng.gen_bool(p_b as f64);
+            let has_c = rng.gen_bool(p_c as f64);
+            let mut title_tokens: Vec<&str> = Vec::new();
+            let mut body_tokens: Vec<&str> = Vec::new();
+            if has_a {
+                if rng.gen_bool(0.1) {
+                    title_tokens.push("a");
+                } else {
+                    body_tokens.push("a");
+                }
+            }
+            if has_b {
+                if rng.gen_bool(0.1) {
+                    title_tokens.push("b");
+                } else {
+                    body_tokens.push("b");
+                }
+            }
+            if has_c {
+                if rng.gen_bool(0.1) {
+                    title_tokens.push("c");
+                } else {
+                    body_tokens.push("c");
+                }
+            }
+            if title_tokens.is_empty() && body_tokens.is_empty() {
+                body_tokens.push("z");
+            }
+            writer
+                .add_document(doc!(
+                    f_title=>title_tokens.join(" "),
+                    f_body=>body_tokens.join(" ")
+                ))
+                .unwrap();
+        }
+        writer.commit().unwrap();
+    }
+
+    // Prepare reader/searcher once.
+    let reader = index
+        .reader_builder()
+        .reload_policy(ReloadPolicy::Manual)
+        .try_into()
+        .unwrap();
+    let searcher = reader.searcher();
+
+    // Build two query parsers with different default fields.
+    let qp_single = QueryParser::for_index(&index, vec![f_body]);
+    let qp_multi = QueryParser::for_index(&index, vec![f_title, f_body]);
+
+    let single_view = BenchIndex {
+        index: index.clone(),
+        searcher: searcher.clone(),
+        query_parser: qp_single,
+    };
+    let multi_view = BenchIndex {
+        index,
+        searcher,
+        query_parser: qp_multi,
+    };
+    (single_view, multi_view)
+}
+
+fn main() {
+    // Prepare corpora with varying selectivity. Build one index per corpus
+    // and derive two views (single-field vs multi-field) from it.
+    let scenarios = vec![
+        (
+            "N=1M, p(a)=5%, p(b)=1%, p(c)=15%".to_string(),
+            1_000_000,
+            0.05,
+            0.01,
+            0.15,
+        ),
+        (
+            "N=1M, p(a)=1%, p(b)=1%, p(c)=15%".to_string(),
+            1_000_000,
+            0.01,
+            0.01,
+            0.15,
+        ),
+    ];
+
+    let mut runner = BenchRunner::new();
+    for (label, n, pa, pb, pc) in scenarios {
+        let (single_view, multi_view) = build_shared_indices(n, pa, pb, pc);
+
+        // Single-field group: default field is body only
+        {
+            let mut group = runner.new_group();
+            group.set_name(format!("single_field — {}", label));
+            group.register_with_input("+a_+b_count", &single_view, |benv: &BenchIndex| {
+                black_box(benv.count_query("+a +b"))
+            });
+            group.register_with_input("+a_+b_+c_count", &single_view, |benv: &BenchIndex| {
+                black_box(benv.count_query("+a +b +c"))
+            });
+            group.register_with_input("+a_+b_top10", &single_view, |benv: &BenchIndex| {
+                black_box(benv.topk_len("+a +b", 10))
+            });
+            group.register_with_input("+a_+b_+c_top10", &single_view, |benv: &BenchIndex| {
+                black_box(benv.topk_len("+a +b +c", 10))
+            });
+            // OR queries
+            group.register_with_input("a_OR_b_count", &single_view, |benv: &BenchIndex| {
+                black_box(benv.count_query("a OR b"))
+            });
+            group.register_with_input("a_OR_b_OR_c_count", &single_view, |benv: &BenchIndex| {
+                black_box(benv.count_query("a OR b OR c"))
+            });
+            group.register_with_input("a_OR_b_top10", &single_view, |benv: &BenchIndex| {
+                black_box(benv.topk_len("a OR b", 10))
+            });
+            group.register_with_input("a_OR_b_OR_c_top10", &single_view, |benv: &BenchIndex| {
+                black_box(benv.topk_len("a OR b OR c", 10))
+            });
+            group.run();
+        }
+
+        // Multi-field group: default fields are [title, body]
+        {
+            let mut group = runner.new_group();
+            group.set_name(format!("multi_field — {}", label));
+            group.register_with_input("+a_+b_count", &multi_view, |benv: &BenchIndex| {
+                black_box(benv.count_query("+a +b"))
+            });
+            group.register_with_input("+a_+b_+c_count", &multi_view, |benv: &BenchIndex| {
+                black_box(benv.count_query("+a +b +c"))
+            });
+            group.register_with_input("+a_+b_top10", &multi_view, |benv: &BenchIndex| {
+                black_box(benv.topk_len("+a +b", 10))
+            });
+            group.register_with_input("+a_+b_+c_top10", &multi_view, |benv: &BenchIndex| {
+                black_box(benv.topk_len("+a +b +c", 10))
+            });
+            // OR queries
+            group.register_with_input("a_OR_b_count", &multi_view, |benv: &BenchIndex| {
+                black_box(benv.count_query("a OR b"))
+            });
+            group.register_with_input("a_OR_b_OR_c_count", &multi_view, |benv: &BenchIndex| {
+                black_box(benv.count_query("a OR b OR c"))
+            });
+            group.register_with_input("a_OR_b_top10", &multi_view, |benv: &BenchIndex| {
+                black_box(benv.topk_len("a OR b", 10))
+            });
+            group.register_with_input("a_OR_b_OR_c_top10", &multi_view, |benv: &BenchIndex| {
+                black_box(benv.topk_len("a OR b OR c", 10))
+            });
+            group.run();
+        }
+    }
+}
--- a/benches/exists_json.rs
+++ b/benches/exists_json.rs
@@ -0,0 +1,69 @@
+use binggan::plugins::PeakMemAllocPlugin;
+use binggan::{black_box, InputGroup, PeakMemAlloc, INSTRUMENTED_SYSTEM};
+use serde_json::json;
+use tantivy::collector::Count;
+use tantivy::query::ExistsQuery;
+use tantivy::schema::{Schema, FAST, TEXT};
+use tantivy::{doc, Index};
+
+#[global_allocator]
+pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;
+
+fn main() {
+    let doc_count: usize = 500_000;
+    let subfield_counts: &[usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 16, 256, 4096, 65536, 262144];
+
+    let indices: Vec<(String, Index)> = subfield_counts
+        .iter()
+        .map(|&sub_fields| {
+            (
+                format!("subfields={sub_fields}"),
+                build_index_with_json_subfields(doc_count, sub_fields),
+            )
+        })
+        .collect();
+
+    let mut group = InputGroup::new_with_inputs(indices);
+    group.add_plugin(PeakMemAllocPlugin::new(GLOBAL));
+
+    group.config().num_iter_group = Some(1);
+    group.config().num_iter_bench = Some(1);
+    group.register("exists_json", exists_json_union);
+
+    group.run();
+}
+
+fn exists_json_union(index: &Index) {
+    let reader = index.reader().expect("reader");
+    let searcher = reader.searcher();
+    let query = ExistsQuery::new("json".to_string(), true);
+    let count = searcher.search(&query, &Count).expect("exists search");
+    // Prevents optimizer from eliding the search
+    black_box(count);
+}
+
+fn build_index_with_json_subfields(num_docs: usize, num_subfields: usize) -> Index {
+    // Schema: single JSON field stored as FAST to support ExistsQuery.
+    let mut schema_builder = Schema::builder();
+    let json_field = schema_builder.add_json_field("json", TEXT | FAST);
+    let schema = schema_builder.build();
+
+    let index = Index::create_from_tempdir(schema).expect("create index");
+    {
+        let mut index_writer = index
+            .writer_with_num_threads(1, 200_000_000)
+            .expect("writer");
+        for i in 0..num_docs {
+            let sub = i % num_subfields;
+            // Only one subpath set per document; rotate subpaths so that
+            // no single subpath is full, but the union covers all docs.
+            let v = json!({ format!("field_{sub}"): i as u64 });
+            index_writer
+                .add_document(doc!(json_field => v))
+                .expect("add_document");
+        }
+        index_writer.commit().expect("commit");
+    }
+
+    index
+}
--- a/bitpacker/Cargo.toml
+++ b/bitpacker/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "tantivy-bitpacker"
-version = "0.8.0"
-edition = "2021"
+version = "0.9.0"
+edition = "2024"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = []
--- a/bitpacker/src/bitpacker.rs
+++ b/bitpacker/src/bitpacker.rs
@@ -48,7 +48,7 @@ impl BitPacker {

    pub fn flush<TWrite: io::Write + ?Sized>(&mut self, output: &mut TWrite) -> io::Result<()> {
        if self.mini_buffer_written > 0 {
-            let num_bytes = (self.mini_buffer_written + 7) / 8;
+            let num_bytes = self.mini_buffer_written.div_ceil(8);
            let bytes = self.mini_buffer.to_le_bytes();
            output.write_all(&bytes[..num_bytes])?;
            self.mini_buffer_written = 0;
@@ -138,7 +138,7 @@ impl BitUnpacker {

        // We use `usize` here to avoid overflow issues.
        let end_bit_read = (end_idx as usize) * self.num_bits;
-        let end_byte_read = (end_bit_read + 7) / 8;
+        let end_byte_read = end_bit_read.div_ceil(8);
        assert!(
            end_byte_read <= data.len(),
            "Requested index is out of bounds."
--- a/bitpacker/src/blocked_bitpacker.rs
+++ b/bitpacker/src/blocked_bitpacker.rs
@@ -1,6 +1,6 @@
 use super::bitpacker::BitPacker;
 use super::compute_num_bits;
-use crate::{minmax, BitUnpacker};
+use crate::{BitUnpacker, minmax};

 const BLOCK_SIZE: usize = 128;

@@ -140,10 +140,10 @@ impl BlockedBitpacker {
    pub fn iter(&self) -> impl Iterator<Item = u64> + '_ {
        // todo performance: we could decompress a whole block and cache it instead
        let bitpacked_elems = self.offset_and_bits.len() * BLOCK_SIZE;
-        let iter = (0..bitpacked_elems)
+
+        (0..bitpacked_elems)
            .map(move |idx| self.get(idx))
-            .chain(self.buffer.iter().cloned());
-        iter
+            .chain(self.buffer.iter().cloned())
    }
 }

--- a/bitpacker/src/lib.rs
+++ b/bitpacker/src/lib.rs
@@ -33,11 +33,7 @@ pub use crate::blocked_bitpacker::BlockedBitpacker;
 /// number of bits.
 pub fn compute_num_bits(n: u64) -> u8 {
    let amplitude = (64u32 - n.leading_zeros()) as u8;
-    if amplitude <= 64 - 8 {
-        amplitude
-    } else {
-        64
-    }
+    if amplitude <= 64 - 8 { amplitude } else { 64 }
 }

 /// Computes the (min, max) of an iterator of `PartialOrd` values.
--- a/columnar/Cargo.toml
+++ b/columnar/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "tantivy-columnar"
-version = "0.5.0"
-edition = "2021"
+version = "0.6.0"
+edition = "2024"
 license = "MIT"
 homepage = "https://github.com/quickwit-oss/tantivy"
 repository = "https://github.com/quickwit-oss/tantivy"
@@ -12,10 +12,10 @@ categories = ["database-implementations", "data-structures", "compression"]
 itertools = "0.14.0"
 fastdivide = "0.4.0"

-stacker = { version= "0.5", path = "../stacker", package="tantivy-stacker"}
-sstable = { version= "0.5", path = "../sstable", package = "tantivy-sstable" }
-common = { version= "0.9", path = "../common", package = "tantivy-common" }
-tantivy-bitpacker = { version= "0.8", path = "../bitpacker/" }
+stacker = { version= "0.6", path = "../stacker", package="tantivy-stacker"}
+sstable = { version= "0.6", path = "../sstable", package = "tantivy-sstable" }
+common = { version= "0.10", path = "../common", package = "tantivy-common" }
+tantivy-bitpacker = { version= "0.9", path = "../bitpacker/" }
 serde = "1.0.152"
 downcast-rs = "2.0.1"

@@ -33,6 +33,29 @@ harness = false
 name = "bench_access"
 harness = false

+[[bench]]
+name = "bench_first_vals"
+harness = false
+
+[[bench]]
+name = "bench_values_u64"
+harness = false
+
+[[bench]]
+name = "bench_values_u128"
+harness = false
+
+[[bench]]
+name = "bench_create_column_values"
+harness = false
+
+[[bench]]
+name = "bench_column_values_get"
+harness = false
+
+[[bench]]
+name = "bench_optional_index"
+harness = false

 [features]
-unstable = []
+zstd-compression = ["sstable/zstd-compression"]
--- a/columnar/benches/bench_access.rs
+++ b/columnar/benches/bench_access.rs
@@ -1,4 +1,4 @@
-use binggan::{black_box, InputGroup};
+use binggan::{InputGroup, black_box};
 use common::*;
 use tantivy_columnar::Column;

@@ -19,7 +19,7 @@ fn main() {

    let mut add_card = |card1: Card| {
        inputs.push((
-            format!("{card1}"),
+            card1.to_string(),
            generate_columnar_and_open(card1, NUM_DOCS),
        ));
    };
@@ -50,6 +50,7 @@ fn bench_group(mut runner: InputGroup<Column>) {
        let mut buffer = vec![None; BLOCK_SIZE];
        for i in (0..NUM_DOCS).step_by(BLOCK_SIZE) {
            // fill docs
+            #[allow(clippy::needless_range_loop)]
            for idx in 0..BLOCK_SIZE {
                docs[idx] = idx as u32 + i;
            }
--- a/columnar/benches/bench_column_values_get.rs
+++ b/columnar/benches/bench_column_values_get.rs
@@ -0,0 +1,61 @@
+use std::sync::Arc;
+
+use binggan::{InputGroup, black_box};
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
+use tantivy_columnar::ColumnValues;
+use tantivy_columnar::column_values::{CodecType, serialize_and_load_u64_based_column_values};
+
+fn get_data() -> Vec<u64> {
+    let mut rng = StdRng::seed_from_u64(2u64);
+    let mut data: Vec<_> = (100..55_000_u64)
+        .map(|num| num + rng.r#gen::<u8>() as u64)
+        .collect();
+    data.push(99_000);
+    data.insert(1000, 2000);
+    data.insert(2000, 100);
+    data.insert(3000, 4100);
+    data.insert(4000, 100);
+    data.insert(5000, 800);
+    data
+}
+
+#[inline(never)]
+fn value_iter() -> impl Iterator<Item = u64> {
+    0..20_000
+}
+
+type Col = Arc<dyn ColumnValues<u64>>;
+
+fn main() {
+    let data = get_data();
+    let inputs: Vec<(String, Col)> = vec![
+        (
+            "bitpacked".to_string(),
+            serialize_and_load_u64_based_column_values(&data.as_slice(), &[CodecType::Bitpacked]),
+        ),
+        (
+            "linear".to_string(),
+            serialize_and_load_u64_based_column_values(&data.as_slice(), &[CodecType::Linear]),
+        ),
+        (
+            "blockwise_linear".to_string(),
+            serialize_and_load_u64_based_column_values(
+                &data.as_slice(),
+                &[CodecType::BlockwiseLinear],
+            ),
+        ),
+    ];
+
+    let mut group: InputGroup<Col> = InputGroup::new_with_inputs(inputs);
+
+    group.register("fastfield_get", |col: &Col| {
+        let mut sum = 0u64;
+        for pos in value_iter() {
+            sum = sum.wrapping_add(col.get_val(pos as u32));
+        }
+        black_box(sum);
+    });
+
+    group.run();
+}
--- a/columnar/benches/bench_create_column_values.rs
+++ b/columnar/benches/bench_create_column_values.rs
@@ -0,0 +1,44 @@
+use binggan::{InputGroup, black_box};
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
+use tantivy_columnar::column_values::{CodecType, serialize_u64_based_column_values};
+
+fn get_data() -> Vec<u64> {
+    let mut rng = StdRng::seed_from_u64(2u64);
+    let mut data: Vec<_> = (100..55_000_u64)
+        .map(|num| num + rng.r#gen::<u8>() as u64)
+        .collect();
+    data.push(99_000);
+    data.insert(1000, 2000);
+    data.insert(2000, 100);
+    data.insert(3000, 4100);
+    data.insert(4000, 100);
+    data.insert(5000, 800);
+    data
+}
+
+fn main() {
+    let data = get_data();
+    let mut group: InputGroup<(CodecType, Vec<u64>)> = InputGroup::new_with_inputs(vec![
+        (
+            "bitpacked codec".to_string(),
+            (CodecType::Bitpacked, data.clone()),
+        ),
+        (
+            "linear codec".to_string(),
+            (CodecType::Linear, data.clone()),
+        ),
+        (
+            "blockwise linear codec".to_string(),
+            (CodecType::BlockwiseLinear, data.clone()),
+        ),
+    ]);
+
+    group.register("serialize column_values", |data| {
+        let mut buffer = Vec::new();
+        serialize_u64_based_column_values(&data.1.as_slice(), &[data.0], &mut buffer).unwrap();
+        black_box(buffer.len());
+    });
+
+    group.run();
+}
--- a/columnar/benches/bench_first_vals.rs
+++ b/columnar/benches/bench_first_vals.rs
@@ -1,12 +1,9 @@
-#![feature(test)]
-extern crate test;
-
 use std::sync::Arc;

+use binggan::{InputGroup, black_box};
 use rand::prelude::*;
-use tantivy_columnar::column_values::{serialize_and_load_u64_based_column_values, CodecType};
+use tantivy_columnar::column_values::{CodecType, serialize_and_load_u64_based_column_values};
 use tantivy_columnar::*;
-use test::{black_box, Bencher};

 struct Columns {
    pub optional: Column,
@@ -68,88 +65,45 @@ pub fn serialize_and_load(column: &[u64], codec_type: CodecType) -> Arc<dyn Colu
    serialize_and_load_u64_based_column_values(&column, &[codec_type])
 }

-fn run_bench_on_column_full_scan(b: &mut Bencher, column: Column) {
-    let num_iter = black_box(NUM_VALUES);
-    b.iter(|| {
+fn main() {
+    let Columns {
+        optional,
+        full,
+        multi,
+    } = get_test_columns();
+
+    let inputs = vec![
+        ("full".to_string(), full),
+        ("optional".to_string(), optional),
+        ("multi".to_string(), multi),
+    ];
+
+    let mut group = InputGroup::new_with_inputs(inputs);
+
+    group.register("first_full_scan", |column| {
        let mut sum = 0u64;
-        for i in 0..num_iter as u32 {
+        for i in 0..NUM_VALUES as u32 {
            let val = column.first(i);
            sum += val.unwrap_or(0);
        }
-        sum
+        black_box(sum);
    });
-}
-fn run_bench_on_column_block_fetch(b: &mut Bencher, column: Column) {
-    let mut block: Vec<Option<u64>> = vec![None; 64];
-    let fetch_docids = (0..64).collect::<Vec<_>>();
-    b.iter(move || {
+
+    group.register("first_block_fetch", |column| {
+        let mut block: Vec<Option<u64>> = vec![None; 64];
+        let fetch_docids = (0..64).collect::<Vec<_>>();
        column.first_vals(&fetch_docids, &mut block);
-        block[0]
+        black_box(block[0]);
    });
-}
-fn run_bench_on_column_block_single_calls(b: &mut Bencher, column: Column) {
-    let mut block: Vec<Option<u64>> = vec![None; 64];
-    let fetch_docids = (0..64).collect::<Vec<_>>();
-    b.iter(move || {
+
+    group.register("first_block_single_calls", |column| {
+        let mut block: Vec<Option<u64>> = vec![None; 64];
+        let fetch_docids = (0..64).collect::<Vec<_>>();
        for i in 0..fetch_docids.len() {
            block[i] = column.first(fetch_docids[i]);
        }
-        block[0]
+        black_box(block[0]);
    });
-}

-/// Column first method
-#[bench]
-fn bench_get_first_on_full_column_full_scan(b: &mut Bencher) {
-    let column = get_test_columns().full;
-    run_bench_on_column_full_scan(b, column);
-}
-
-#[bench]
-fn bench_get_first_on_optional_column_full_scan(b: &mut Bencher) {
-    let column = get_test_columns().optional;
-    run_bench_on_column_full_scan(b, column);
-}
-
-#[bench]
-fn bench_get_first_on_multi_column_full_scan(b: &mut Bencher) {
-    let column = get_test_columns().multi;
-    run_bench_on_column_full_scan(b, column);
-}
-
-/// Block fetch column accessor
-#[bench]
-fn bench_get_block_first_on_optional_column(b: &mut Bencher) {
-    let column = get_test_columns().optional;
-    run_bench_on_column_block_fetch(b, column);
-}
-
-#[bench]
-fn bench_get_block_first_on_multi_column(b: &mut Bencher) {
-    let column = get_test_columns().multi;
-    run_bench_on_column_block_fetch(b, column);
-}
-
-#[bench]
-fn bench_get_block_first_on_full_column(b: &mut Bencher) {
-    let column = get_test_columns().full;
-    run_bench_on_column_block_fetch(b, column);
-}
-
-#[bench]
-fn bench_get_block_first_on_optional_column_single_calls(b: &mut Bencher) {
-    let column = get_test_columns().optional;
-    run_bench_on_column_block_single_calls(b, column);
-}
-
-#[bench]
-fn bench_get_block_first_on_multi_column_single_calls(b: &mut Bencher) {
-    let column = get_test_columns().multi;
-    run_bench_on_column_block_single_calls(b, column);
-}
-
-#[bench]
-fn bench_get_block_first_on_full_column_single_calls(b: &mut Bencher) {
-    let column = get_test_columns().full;
-    run_bench_on_column_block_single_calls(b, column);
+    group.run();
 }
--- a/columnar/benches/bench_merge.rs
+++ b/columnar/benches/bench_merge.rs
@@ -1,7 +1,7 @@
 pub mod common;

 use binggan::BenchRunner;
-use common::{generate_columnar_with_name, Card};
+use common::{Card, generate_columnar_with_name};
 use tantivy_columnar::*;

 const NUM_DOCS: u32 = 100_000;
--- a/columnar/benches/bench_optional_index.rs
+++ b/columnar/benches/bench_optional_index.rs
@@ -0,0 +1,106 @@
+use binggan::{InputGroup, black_box};
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
+use tantivy_columnar::column_index::{OptionalIndex, Set};
+
+const TOTAL_NUM_VALUES: u32 = 1_000_000;
+
+fn gen_optional_index(fill_ratio: f64) -> OptionalIndex {
+    let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
+    let vals: Vec<u32> = (0..TOTAL_NUM_VALUES)
+        .map(|_| rng.gen_bool(fill_ratio))
+        .enumerate()
+        .filter(|(_pos, val)| *val)
+        .map(|(pos, _)| pos as u32)
+        .collect();
+    OptionalIndex::for_test(TOTAL_NUM_VALUES, &vals)
+}
+
+fn random_range_iterator(
+    start: u32,
+    end: u32,
+    avg_step_size: u32,
+    avg_deviation: u32,
+) -> impl Iterator<Item = u32> {
+    let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
+    let mut current = start;
+    std::iter::from_fn(move || {
+        current += rng.gen_range(avg_step_size - avg_deviation..=avg_step_size + avg_deviation);
+        if current >= end { None } else { Some(current) }
+    })
+}
+
+fn n_percent_step_iterator(percent: f32, num_values: u32) -> impl Iterator<Item = u32> {
+    let ratio = percent / 100.0;
+    let step_size = (1f32 / ratio) as u32;
+    let deviation = step_size - 1;
+    random_range_iterator(0, num_values, step_size, deviation)
+}
+
+fn walk_over_data(codec: &OptionalIndex, avg_step_size: u32) -> Option<u32> {
+    walk_over_data_from_positions(
+        codec,
+        random_range_iterator(0, TOTAL_NUM_VALUES, avg_step_size, 0),
+    )
+}
+
+fn walk_over_data_from_positions(
+    codec: &OptionalIndex,
+    positions: impl Iterator<Item = u32>,
+) -> Option<u32> {
+    let mut dense_idx: Option<u32> = None;
+    for idx in positions {
+        dense_idx = dense_idx.or(codec.rank_if_exists(idx));
+    }
+    dense_idx
+}
+
+fn main() {
+    // Build separate inputs for each fill ratio.
+    let inputs: Vec<(String, OptionalIndex)> = vec![
+        ("fill=1%".to_string(), gen_optional_index(0.01)),
+        ("fill=5%".to_string(), gen_optional_index(0.05)),
+        ("fill=10%".to_string(), gen_optional_index(0.10)),
+        ("fill=50%".to_string(), gen_optional_index(0.50)),
+        ("fill=90%".to_string(), gen_optional_index(0.90)),
+    ];
+
+    let mut group: InputGroup<OptionalIndex> = InputGroup::new_with_inputs(inputs);
+
+    // Translate orig->codec (rank_if_exists) with sampling
+    group.register("orig_to_codec_10pct_hit", |codec: &OptionalIndex| {
+        black_box(walk_over_data(codec, 100));
+    });
+    group.register("orig_to_codec_1pct_hit", |codec: &OptionalIndex| {
+        black_box(walk_over_data(codec, 1000));
+    });
+    group.register("orig_to_codec_full_scan", |codec: &OptionalIndex| {
+        black_box(walk_over_data_from_positions(codec, 0..TOTAL_NUM_VALUES));
+    });
+
+    // Translate codec->orig (select/select_batch) on sampled ranks
+    fn bench_translate_codec_to_orig_util(codec: &OptionalIndex, percent_hit: f32) {
+        let num_non_nulls = codec.num_non_nulls();
+        let idxs: Vec<u32> = if percent_hit == 100.0f32 {
+            (0..num_non_nulls).collect()
+        } else {
+            n_percent_step_iterator(percent_hit, num_non_nulls).collect()
+        };
+        let mut output = vec![0u32; idxs.len()];
+        output.copy_from_slice(&idxs[..]);
+        codec.select_batch(&mut output);
+        black_box(output);
+    }
+
+    group.register("codec_to_orig_0.005pct_hit", |codec: &OptionalIndex| {
+        bench_translate_codec_to_orig_util(codec, 0.005);
+    });
+    group.register("codec_to_orig_10pct_hit", |codec: &OptionalIndex| {
+        bench_translate_codec_to_orig_util(codec, 10.0);
+    });
+    group.register("codec_to_orig_full_scan", |codec: &OptionalIndex| {
+        bench_translate_codec_to_orig_util(codec, 100.0);
+    });
+
+    group.run();
+}
--- a/columnar/benches/bench_values_u128.rs
+++ b/columnar/benches/bench_values_u128.rs
@@ -1,15 +1,12 @@
-#![feature(test)]
-
 use std::ops::RangeInclusive;
 use std::sync::Arc;

+use binggan::{InputGroup, black_box};
 use common::OwnedBytes;
 use rand::rngs::StdRng;
 use rand::seq::SliceRandom;
-use rand::{random, Rng, SeedableRng};
+use rand::{Rng, SeedableRng, random};
 use tantivy_columnar::ColumnValues;
-use test::Bencher;
-extern crate test;

 // TODO does this make sense for IPv6 ?
 fn generate_random() -> Vec<u64> {
@@ -47,78 +44,77 @@ fn get_data_50percent_item() -> Vec<u128> {
    }
    data.push(SINGLE_ITEM);
    data.shuffle(&mut rng);
-    let data = data.iter().map(|el| *el as u128).collect::<Vec<_>>();
-    data
+    data.iter().map(|el| *el as u128).collect::<Vec<_>>()
 }

-#[bench]
-fn bench_intfastfield_getrange_u128_50percent_hit(b: &mut Bencher) {
+fn main() {
    let data = get_data_50percent_item();
-    let column = get_u128_column_from_data(&data);
+    let column_range = get_u128_column_from_data(&data);
+    let column_random = get_u128_column_random();

-    b.iter(|| {
+    struct Inputs {
+        data: Vec<u128>,
+        column_range: Arc<dyn ColumnValues<u128>>,
+        column_random: Arc<dyn ColumnValues<u128>>,
+    }
+
+    let inputs = Inputs {
+        data,
+        column_range,
+        column_random,
+    };
+    let mut group: InputGroup<Inputs> =
+        InputGroup::new_with_inputs(vec![("u128 benches".to_string(), inputs)]);
+
+    group.register(
+        "intfastfield_getrange_u128_50percent_hit",
+        |inp: &Inputs| {
+            let mut positions = Vec::new();
+            inp.column_range.get_row_ids_for_value_range(
+                *FIFTY_PERCENT_RANGE.start() as u128..=*FIFTY_PERCENT_RANGE.end() as u128,
+                0..inp.data.len() as u32,
+                &mut positions,
+            );
+            black_box(positions.len());
+        },
+    );
+
+    group.register("intfastfield_getrange_u128_single_hit", |inp: &Inputs| {
        let mut positions = Vec::new();
-        column.get_row_ids_for_value_range(
-            *FIFTY_PERCENT_RANGE.start() as u128..=*FIFTY_PERCENT_RANGE.end() as u128,
-            0..data.len() as u32,
-            &mut positions,
-        );
-        positions
-    });
-}
-
-#[bench]
-fn bench_intfastfield_getrange_u128_single_hit(b: &mut Bencher) {
-    let data = get_data_50percent_item();
-    let column = get_u128_column_from_data(&data);
-
-    b.iter(|| {
-        let mut positions = Vec::new();
-        column.get_row_ids_for_value_range(
+        inp.column_range.get_row_ids_for_value_range(
            *SINGLE_ITEM_RANGE.start() as u128..=*SINGLE_ITEM_RANGE.end() as u128,
-            0..data.len() as u32,
+            0..inp.data.len() as u32,
            &mut positions,
        );
-        positions
+        black_box(positions.len());
    });
-}

-#[bench]
-fn bench_intfastfield_getrange_u128_hit_all(b: &mut Bencher) {
-    let data = get_data_50percent_item();
-    let column = get_u128_column_from_data(&data);
-
-    b.iter(|| {
+    group.register("intfastfield_getrange_u128_hit_all", |inp: &Inputs| {
        let mut positions = Vec::new();
-        column.get_row_ids_for_value_range(0..=u128::MAX, 0..data.len() as u32, &mut positions);
-        positions
+        inp.column_range.get_row_ids_for_value_range(
+            0..=u128::MAX,
+            0..inp.data.len() as u32,
+            &mut positions,
+        );
+        black_box(positions.len());
    });
-}
-// U128 RANGE END

-#[bench]
-fn bench_intfastfield_scan_all_fflookup_u128(b: &mut Bencher) {
-    let column = get_u128_column_random();
-
-    b.iter(|| {
+    group.register("intfastfield_scan_all_fflookup_u128", |inp: &Inputs| {
        let mut a = 0u128;
-        for i in 0u64..column.num_vals() as u64 {
-            a += column.get_val(i as u32);
+        for i in 0u64..inp.column_random.num_vals() as u64 {
+            a += inp.column_random.get_val(i as u32);
        }
-        a
+        black_box(a);
    });
-}

-#[bench]
-fn bench_intfastfield_jumpy_stride5_u128(b: &mut Bencher) {
-    let column = get_u128_column_random();
-
-    b.iter(|| {
-        let n = column.num_vals();
+    group.register("intfastfield_jumpy_stride5_u128", |inp: &Inputs| {
+        let n = inp.column_random.num_vals();
        let mut a = 0u128;
        for i in (0..n / 5).map(|val| val * 5) {
-            a += column.get_val(i);
+            a += inp.column_random.get_val(i);
        }
-        a
+        black_box(a);
    });
+
+    group.run();
 }
--- a/columnar/benches/bench_values_u64.rs
+++ b/columnar/benches/bench_values_u64.rs
@@ -1,13 +1,10 @@
-#![feature(test)]
-extern crate test;
-
 use std::ops::RangeInclusive;
 use std::sync::Arc;

+use binggan::{InputGroup, black_box};
 use rand::prelude::*;
-use tantivy_columnar::column_values::{serialize_and_load_u64_based_column_values, CodecType};
+use tantivy_columnar::column_values::{CodecType, serialize_and_load_u64_based_column_values};
 use tantivy_columnar::*;
-use test::Bencher;

 // Warning: this generates the same permutation at each call
 fn generate_permutation() -> Vec<u64> {
@@ -27,37 +24,11 @@ pub fn serialize_and_load(column: &[u64], codec_type: CodecType) -> Arc<dyn Colu
    serialize_and_load_u64_based_column_values(&column, &[codec_type])
 }

-#[bench]
-fn bench_intfastfield_jumpy_veclookup(b: &mut Bencher) {
-    let permutation = generate_permutation();
-    let n = permutation.len();
-    b.iter(|| {
-        let mut a = 0u64;
-        for _ in 0..n {
-            a = permutation[a as usize];
-        }
-        a
-    });
-}
-
-#[bench]
-fn bench_intfastfield_jumpy_fflookup_bitpacked(b: &mut Bencher) {
-    let permutation = generate_permutation();
-    let n = permutation.len();
-    let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&permutation, CodecType::Bitpacked);
-    b.iter(|| {
-        let mut a = 0u64;
-        for _ in 0..n {
-            a = column.get_val(a as u32);
-        }
-        a
-    });
-}
-
 const FIFTY_PERCENT_RANGE: RangeInclusive<u64> = 1..=50;
 const SINGLE_ITEM: u64 = 90;
 const SINGLE_ITEM_RANGE: RangeInclusive<u64> = 90..=90;
 const ONE_PERCENT_ITEM_RANGE: RangeInclusive<u64> = 49..=49;
+
 fn get_data_50percent_item() -> Vec<u128> {
    let mut rng = StdRng::from_seed([1u8; 32]);

@@ -69,135 +40,122 @@ fn get_data_50percent_item() -> Vec<u128> {
    data.push(SINGLE_ITEM);

    data.shuffle(&mut rng);
-    let data = data.iter().map(|el| *el as u128).collect::<Vec<_>>();
-    data
+    data.iter().map(|el| *el as u128).collect::<Vec<_>>()
 }

-// U64 RANGE START
-#[bench]
-fn bench_intfastfield_getrange_u64_50percent_hit(b: &mut Bencher) {
-    let data = get_data_50percent_item();
-    let data = data.iter().map(|el| *el as u64).collect::<Vec<_>>();
-    let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&data, CodecType::Bitpacked);
-    b.iter(|| {
-        let mut positions = Vec::new();
-        column.get_row_ids_for_value_range(
-            FIFTY_PERCENT_RANGE,
-            0..data.len() as u32,
-            &mut positions,
-        );
-        positions
-    });
-}
+type VecCol = (Vec<u64>, Arc<dyn ColumnValues<u64>>);

-#[bench]
-fn bench_intfastfield_getrange_u64_1percent_hit(b: &mut Bencher) {
-    let data = get_data_50percent_item();
-    let data = data.iter().map(|el| *el as u64).collect::<Vec<_>>();
-    let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&data, CodecType::Bitpacked);
-
-    b.iter(|| {
-        let mut positions = Vec::new();
-        column.get_row_ids_for_value_range(
-            ONE_PERCENT_ITEM_RANGE,
-            0..data.len() as u32,
-            &mut positions,
-        );
-        positions
-    });
-}
-
-#[bench]
-fn bench_intfastfield_getrange_u64_single_hit(b: &mut Bencher) {
-    let data = get_data_50percent_item();
-    let data = data.iter().map(|el| *el as u64).collect::<Vec<_>>();
-    let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&data, CodecType::Bitpacked);
-
-    b.iter(|| {
-        let mut positions = Vec::new();
-        column.get_row_ids_for_value_range(SINGLE_ITEM_RANGE, 0..data.len() as u32, &mut positions);
-        positions
-    });
-}
-
-#[bench]
-fn bench_intfastfield_getrange_u64_hit_all(b: &mut Bencher) {
-    let data = get_data_50percent_item();
-    let data = data.iter().map(|el| *el as u64).collect::<Vec<_>>();
-    let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&data, CodecType::Bitpacked);
-
-    b.iter(|| {
-        let mut positions = Vec::new();
-        column.get_row_ids_for_value_range(0..=u64::MAX, 0..data.len() as u32, &mut positions);
-        positions
-    });
-}
-// U64 RANGE END
-
-#[bench]
-fn bench_intfastfield_stride7_vec(b: &mut Bencher) {
+fn bench_access() {
    let permutation = generate_permutation();
-    let n = permutation.len();
-    b.iter(|| {
+    let column_perm: Arc<dyn ColumnValues<u64>> =
+        serialize_and_load(&permutation, CodecType::Bitpacked);
+
+    let permutation_gcd = generate_permutation_gcd();
+    let column_perm_gcd: Arc<dyn ColumnValues<u64>> =
+        serialize_and_load(&permutation_gcd, CodecType::Bitpacked);
+
+    let mut group: InputGroup<VecCol> = InputGroup::new_with_inputs(vec![
+        (
+            "access".to_string(),
+            (permutation.clone(), column_perm.clone()),
+        ),
+        (
+            "access_gcd".to_string(),
+            (permutation_gcd.clone(), column_perm_gcd.clone()),
+        ),
+    ]);
+
+    group.register("stride7_vec", |inp: &VecCol| {
+        let n = inp.0.len();
        let mut a = 0u64;
        for i in (0..n / 7).map(|val| val * 7) {
-            a += permutation[i as usize];
+            a += inp.0[i];
        }
-        a
+        black_box(a);
    });
-}

-#[bench]
-fn bench_intfastfield_stride7_fflookup(b: &mut Bencher) {
-    let permutation = generate_permutation();
-    let n = permutation.len();
-    let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&permutation, CodecType::Bitpacked);
-    b.iter(|| {
-        let mut a = 0;
+    group.register("fullscan_vec", |inp: &VecCol| {
+        let mut a = 0u64;
+        for i in 0..inp.0.len() {
+            a += inp.0[i];
+        }
+        black_box(a);
+    });
+
+    group.register("stride7_column_values", |inp: &VecCol| {
+        let n = inp.1.num_vals() as usize;
+        let mut a = 0u64;
        for i in (0..n / 7).map(|val| val * 7) {
-            a += column.get_val(i as u32);
+            a += inp.1.get_val(i as u32);
        }
-        a
+        black_box(a);
    });
-}

-#[bench]
-fn bench_intfastfield_scan_all_fflookup(b: &mut Bencher) {
-    let permutation = generate_permutation();
-    let n = permutation.len();
-    let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&permutation, CodecType::Bitpacked);
-    let column_ref = column.as_ref();
-    b.iter(|| {
-        let mut a = 0u64;
-        for i in 0u32..n as u32 {
-            a += column_ref.get_val(i);
-        }
-        a
-    });
-}
-
-#[bench]
-fn bench_intfastfield_scan_all_fflookup_gcd(b: &mut Bencher) {
-    let permutation = generate_permutation_gcd();
-    let n = permutation.len();
-    let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&permutation, CodecType::Bitpacked);
-    b.iter(|| {
+    group.register("fullscan_column_values", |inp: &VecCol| {
        let mut a = 0u64;
+        let n = inp.1.num_vals() as usize;
        for i in 0..n {
-            a += column.get_val(i as u32);
+            a += inp.1.get_val(i as u32);
        }
-        a
+        black_box(a);
    });
+
+    group.run();
 }

-#[bench]
-fn bench_intfastfield_scan_all_vec(b: &mut Bencher) {
-    let permutation = generate_permutation();
-    b.iter(|| {
-        let mut a = 0u64;
-        for i in 0..permutation.len() {
-            a += permutation[i as usize] as u64;
-        }
-        a
-    });
+fn bench_range() {
+    let data_50 = get_data_50percent_item();
+    let data_u64 = data_50.iter().map(|el| *el as u64).collect::<Vec<_>>();
+    let column_data: Arc<dyn ColumnValues<u64>> =
+        serialize_and_load(&data_u64, CodecType::Bitpacked);
+
+    let mut group: InputGroup<Arc<dyn ColumnValues<u64>>> =
+        InputGroup::new_with_inputs(vec![("dist_50pct_item".to_string(), column_data.clone())]);
+
+    group.register(
+        "fastfield_getrange_u64_50percent_hit",
+        |col: &Arc<dyn ColumnValues<u64>>| {
+            let mut positions = Vec::new();
+            col.get_row_ids_for_value_range(FIFTY_PERCENT_RANGE, 0..col.num_vals(), &mut positions);
+            black_box(positions.len());
+        },
+    );
+
+    group.register(
+        "fastfield_getrange_u64_1percent_hit",
+        |col: &Arc<dyn ColumnValues<u64>>| {
+            let mut positions = Vec::new();
+            col.get_row_ids_for_value_range(
+                ONE_PERCENT_ITEM_RANGE,
+                0..col.num_vals(),
+                &mut positions,
+            );
+            black_box(positions.len());
+        },
+    );
+
+    group.register(
+        "fastfield_getrange_u64_single_hit",
+        |col: &Arc<dyn ColumnValues<u64>>| {
+            let mut positions = Vec::new();
+            col.get_row_ids_for_value_range(SINGLE_ITEM_RANGE, 0..col.num_vals(), &mut positions);
+            black_box(positions.len());
+        },
+    );
+
+    group.register(
+        "fastfield_getrange_u64_hit_all",
+        |col: &Arc<dyn ColumnValues<u64>>| {
+            let mut positions = Vec::new();
+            col.get_row_ids_for_value_range(0..=u64::MAX, 0..col.num_vals(), &mut positions);
+            black_box(positions.len());
+        },
+    );
+
+    group.run();
+}
+
+fn main() {
+    bench_access();
+    bench_range();
 }
--- a/columnar/src/block_accessor.rs
+++ b/columnar/src/block_accessor.rs
@@ -66,7 +66,7 @@ impl<T: PartialOrd + Copy + std::fmt::Debug + Send + Sync + 'static + Default>
        &'a self,
        docs: &'a [u32],
        accessor: &Column<T>,
-    ) -> impl Iterator<Item = (DocId, T)> + 'a {
+    ) -> impl Iterator<Item = (DocId, T)> + 'a + use<'a, T> {
        if accessor.index.get_cardinality().is_full() {
            docs.iter().cloned().zip(self.val_cache.iter().cloned())
        } else {
--- a/columnar/src/column/dictionary_encoded.rs
+++ b/columnar/src/column/dictionary_encoded.rs
@@ -4,8 +4,8 @@ use std::{fmt, io};

 use sstable::{Dictionary, VoidSSTable};

-use crate::column::Column;
 use crate::RowId;
+use crate::column::Column;

 /// Dictionary encoded column.
 ///
--- a/columnar/src/column/mod.rs
+++ b/columnar/src/column/mod.rs
@@ -9,13 +9,14 @@ use std::sync::Arc;
 use common::BinarySerializable;
 pub use dictionary_encoded::{BytesColumn, StrColumn};
 pub use serialize::{
-    open_column_bytes, open_column_str, open_column_u128, open_column_u128_as_compact_u64,
-    open_column_u64, serialize_column_mappable_to_u128, serialize_column_mappable_to_u64,
+    open_column_bytes, open_column_str, open_column_u64, open_column_u128,
+    open_column_u128_as_compact_u64, serialize_column_mappable_to_u64,
+    serialize_column_mappable_to_u128,
 };

 use crate::column_index::{ColumnIndex, Set};
 use crate::column_values::monotonic_mapping::StrictlyMonotonicMappingToInternal;
-use crate::column_values::{monotonic_map_column, ColumnValues};
+use crate::column_values::{ColumnValues, monotonic_map_column};
 use crate::{Cardinality, DocId, EmptyColumnValues, MonotonicallyMappableToU64, RowId};

 #[derive(Clone)]
@@ -113,7 +114,7 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
        }
    }

-    /// Translates a block of docis to row_ids.
+    /// Translates a block of docids to row_ids.
    ///
    /// returns the row_ids and the matching docids on the same index
    /// e.g.
--- a/columnar/src/column/serialize.rs
+++ b/columnar/src/column/serialize.rs
@@ -6,10 +6,10 @@ use common::OwnedBytes;
 use sstable::Dictionary;

 use crate::column::{BytesColumn, Column};
-use crate::column_index::{serialize_column_index, SerializableColumnIndex};
+use crate::column_index::{SerializableColumnIndex, serialize_column_index};
 use crate::column_values::{
+    CodecType, MonotonicallyMappableToU64, MonotonicallyMappableToU128,
    load_u64_based_column_values, serialize_column_values_u128, serialize_u64_based_column_values,
-    CodecType, MonotonicallyMappableToU128, MonotonicallyMappableToU64,
 };
 use crate::iterable::Iterable;
 use crate::{StrColumn, Version};
--- a/columnar/src/column_index/merge/mod.rs
+++ b/columnar/src/column_index/merge/mod.rs
@@ -99,9 +99,9 @@ mod tests {

    use crate::column_index::merge::detect_cardinality;
    use crate::column_index::multivalued_index::{
-        open_multivalued_index, serialize_multivalued_index, MultiValueIndex,
+        MultiValueIndex, open_multivalued_index, serialize_multivalued_index,
    };
-    use crate::column_index::{merge_column_index, OptionalIndex, SerializableColumnIndex};
+    use crate::column_index::{OptionalIndex, SerializableColumnIndex, merge_column_index};
    use crate::{
        Cardinality, ColumnIndex, MergeRowOrder, RowAddr, RowId, ShuffleMergeOrder, StackMergeOrder,
    };
--- a/columnar/src/column_index/merge/shuffled.rs
+++ b/columnar/src/column_index/merge/shuffled.rs
@@ -137,8 +137,8 @@ impl Iterable<u32> for ShuffledMultivaluedIndex<'_> {
 #[cfg(test)]
 mod tests {
    use super::*;
-    use crate::column_index::OptionalIndex;
    use crate::RowAddr;
+    use crate::column_index::OptionalIndex;

    #[test]
    fn test_integrate_num_vals_empty() {
--- a/columnar/src/column_index/merge/stacked.rs
+++ b/columnar/src/column_index/merge/stacked.rs
@@ -1,8 +1,8 @@
 use std::ops::Range;

+use crate::column_index::SerializableColumnIndex;
 use crate::column_index::multivalued_index::{MultiValueIndex, SerializableMultivalueIndex};
 use crate::column_index::serialize::SerializableOptionalIndex;
-use crate::column_index::SerializableColumnIndex;
 use crate::iterable::Iterable;
 use crate::{Cardinality, ColumnIndex, RowId, StackMergeOrder};

@@ -56,7 +56,7 @@ fn get_doc_ids_with_values<'a>(
        ColumnIndex::Full => Box::new(doc_range),
        ColumnIndex::Optional(optional_index) => Box::new(
            optional_index
-                .iter_docs()
+                .iter_non_null_docs()
                .map(move |row| row + doc_range.start),
        ),
        ColumnIndex::Multivalued(multivalued_index) => match multivalued_index {
@@ -73,7 +73,7 @@ fn get_doc_ids_with_values<'a>(
            MultiValueIndex::MultiValueIndexV2(multivalued_index) => Box::new(
                multivalued_index
                    .optional_index
-                    .iter_docs()
+                    .iter_non_null_docs()
                    .map(move |row| row + doc_range.start),
            ),
        },
@@ -105,10 +105,11 @@ fn get_num_values_iterator<'a>(
 ) -> Box<dyn Iterator<Item = u32> + 'a> {
    match column_index {
        ColumnIndex::Empty { .. } => Box::new(std::iter::empty()),
-        ColumnIndex::Full => Box::new(std::iter::repeat(1u32).take(num_docs as usize)),
-        ColumnIndex::Optional(optional_index) => {
-            Box::new(std::iter::repeat(1u32).take(optional_index.num_non_nulls() as usize))
-        }
+        ColumnIndex::Full => Box::new(std::iter::repeat_n(1u32, num_docs as usize)),
+        ColumnIndex::Optional(optional_index) => Box::new(std::iter::repeat_n(
+            1u32,
+            optional_index.num_non_nulls() as usize,
+        )),
        ColumnIndex::Multivalued(multivalued_index) => Box::new(
            multivalued_index
                .get_start_index_column()
@@ -177,7 +178,7 @@ impl<'a> Iterable<RowId> for StackedOptionalIndex<'a> {
                        ColumnIndex::Full => Box::new(columnar_row_range),
                        ColumnIndex::Optional(optional_index) => Box::new(
                            optional_index
-                                .iter_docs()
+                                .iter_non_null_docs()
                                .map(move |row_id: RowId| columnar_row_range.start + row_id),
                        ),
                        ColumnIndex::Multivalued(_) => {
--- a/columnar/src/column_index/mod.rs
+++ b/columnar/src/column_index/mod.rs
@@ -14,7 +14,7 @@ pub use merge::merge_column_index;
 pub(crate) use multivalued_index::SerializableMultivalueIndex;
 pub use optional_index::{OptionalIndex, Set};
 pub use serialize::{
-    open_column_index, serialize_column_index, SerializableColumnIndex, SerializableOptionalIndex,
+    SerializableColumnIndex, SerializableOptionalIndex, open_column_index, serialize_column_index,
 };

 use crate::column_index::multivalued_index::MultiValueIndex;
--- a/columnar/src/column_index/multivalued_index.rs
+++ b/columnar/src/column_index/multivalued_index.rs
@@ -8,7 +8,7 @@ use common::{CountingWriter, OwnedBytes};
 use super::optional_index::{open_optional_index, serialize_optional_index};
 use super::{OptionalIndex, SerializableOptionalIndex, Set};
 use crate::column_values::{
-    load_u64_based_column_values, serialize_u64_based_column_values, CodecType, ColumnValues,
+    CodecType, ColumnValues, load_u64_based_column_values, serialize_u64_based_column_values,
 };
 use crate::iterable::Iterable;
 use crate::{DocId, RowId, Version};
@@ -215,6 +215,32 @@ impl MultiValueIndex {
        }
    }

+    /// Returns an iterator over document ids that have at least one value.
+    pub fn iter_non_null_docs(&self) -> Box<dyn Iterator<Item = DocId> + '_> {
+        match self {
+            MultiValueIndex::MultiValueIndexV1(idx) => {
+                let mut doc: DocId = 0u32;
+                let num_docs = idx.num_docs();
+                Box::new(std::iter::from_fn(move || {
+                    // This is not the most efficient way to do this, but it's legacy code.
+                    while doc < num_docs {
+                        let cur = doc;
+                        doc += 1;
+                        let start = idx.start_index_column.get_val(cur);
+                        let end = idx.start_index_column.get_val(cur + 1);
+                        if end > start {
+                            return Some(cur);
+                        }
+                    }
+                    None
+                }))
+            }
+            MultiValueIndex::MultiValueIndexV2(idx) => {
+                Box::new(idx.optional_index.iter_non_null_docs())
+            }
+        }
+    }
+
    /// Converts a list of ranks (row ids of values) in a 1:n index to the corresponding list of
    /// docids. Positions are converted inplace to docids.
    ///
--- a/columnar/src/column_index/optional_index/mod.rs
+++ b/columnar/src/column_index/optional_index/mod.rs
@@ -1,4 +1,4 @@
-use std::io::{self, Write};
+use std::io;
 use std::sync::Arc;

 mod set;
@@ -7,11 +7,11 @@ mod set_block;
 use common::{BinarySerializable, OwnedBytes, VInt};
 pub use set::{SelectCursor, Set, SetCodec};
 use set_block::{
-    DenseBlock, DenseBlockCodec, SparseBlock, SparseBlockCodec, DENSE_BLOCK_NUM_BYTES,
+    DENSE_BLOCK_NUM_BYTES, DenseBlock, DenseBlockCodec, SparseBlock, SparseBlockCodec,
 };

 use crate::iterable::Iterable;
-use crate::{DocId, InvalidData, RowId};
+use crate::{DocId, RowId};

 /// The threshold for for number of elements after which we switch to dense block encoding.
 ///
@@ -88,7 +88,7 @@ pub struct OptionalIndex {

 impl Iterable<u32> for &OptionalIndex {
    fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
-        Box::new(self.iter_docs())
+        Box::new(self.iter_non_null_docs())
    }
 }

@@ -259,11 +259,13 @@ impl Set<RowId> for OptionalIndex {

 impl OptionalIndex {
    pub fn for_test(num_rows: RowId, row_ids: &[RowId]) -> OptionalIndex {
-        assert!(row_ids
-            .last()
-            .copied()
-            .map(|last_row_id| last_row_id < num_rows)
-            .unwrap_or(true));
+        assert!(
+            row_ids
+                .last()
+                .copied()
+                .map(|last_row_id| last_row_id < num_rows)
+                .unwrap_or(true)
+        );
        let mut buffer = Vec::new();
        serialize_optional_index(&row_ids, num_rows, &mut buffer).unwrap();
        let bytes = OwnedBytes::new(buffer);
@@ -278,8 +280,9 @@ impl OptionalIndex {
        self.num_non_null_docs
    }

-    pub fn iter_docs(&self) -> impl Iterator<Item = RowId> + '_ {
-        // TODO optimize
+    pub fn iter_non_null_docs(&self) -> impl Iterator<Item = RowId> + '_ {
+        // TODO optimize. We could iterate over the blocks directly.
+        // We use the dense value ids and retrieve the doc ids via select.
        let mut select_batch = self.select_cursor();
        (0..self.num_non_null_docs).map(move |rank| select_batch.select(rank))
    }
@@ -332,38 +335,6 @@ enum Block<'a> {
    Sparse(SparseBlock<'a>),
 }

-#[derive(Debug, Copy, Clone)]
-enum OptionalIndexCodec {
-    Dense = 0,
-    Sparse = 1,
-}
-
-impl OptionalIndexCodec {
-    fn to_code(self) -> u8 {
-        self as u8
-    }
-
-    fn try_from_code(code: u8) -> Result<Self, InvalidData> {
-        match code {
-            0 => Ok(Self::Dense),
-            1 => Ok(Self::Sparse),
-            _ => Err(InvalidData),
-        }
-    }
-}
-
-impl BinarySerializable for OptionalIndexCodec {
-    fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
-        writer.write_all(&[self.to_code()])
-    }
-
-    fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
-        let optional_codec_code = u8::deserialize(reader)?;
-        let optional_codec = Self::try_from_code(optional_codec_code)?;
-        Ok(optional_codec)
-    }
-}
-
 fn serialize_optional_index_block(block_els: &[u16], out: &mut impl io::Write) -> io::Result<()> {
    let is_sparse = is_sparse(block_els.len() as u32);
    if is_sparse {
--- a/columnar/src/column_index/optional_index/set_block/dense.rs
+++ b/columnar/src/column_index/optional_index/set_block/dense.rs
@@ -2,7 +2,7 @@ use std::io::{self, Write};

 use common::BinarySerializable;

-use crate::column_index::optional_index::{SelectCursor, Set, SetCodec, ELEMENTS_PER_BLOCK};
+use crate::column_index::optional_index::{ELEMENTS_PER_BLOCK, SelectCursor, Set, SetCodec};

 #[inline(always)]
 fn get_bit_at(input: u64, n: u16) -> bool {
--- a/columnar/src/column_index/optional_index/set_block/mod.rs
+++ b/columnar/src/column_index/optional_index/set_block/mod.rs
@@ -1,7 +1,7 @@
 mod dense;
 mod sparse;

-pub use dense::{DenseBlock, DenseBlockCodec, DENSE_BLOCK_NUM_BYTES};
+pub use dense::{DENSE_BLOCK_NUM_BYTES, DenseBlock, DenseBlockCodec};
 pub use sparse::{SparseBlock, SparseBlockCodec};

 #[cfg(test)]
--- a/columnar/src/column_index/optional_index/tests.rs
+++ b/columnar/src/column_index/optional_index/tests.rs
@@ -164,7 +164,11 @@ fn test_optional_index_large() {
 fn test_optional_index_iter_aux(row_ids: &[RowId], num_rows: RowId) {
    let optional_index = OptionalIndex::for_test(num_rows, row_ids);
    assert_eq!(optional_index.num_docs(), num_rows);
-    assert!(optional_index.iter_docs().eq(row_ids.iter().copied()));
+    assert!(
+        optional_index
+            .iter_non_null_docs()
+            .eq(row_ids.iter().copied())
+    );
 }

 #[test]
@@ -219,174 +223,3 @@ fn test_optional_index_for_tests() {
    assert!(!optional_index.contains(3));
    assert_eq!(optional_index.num_docs(), 4);
 }
-
-#[cfg(all(test, feature = "unstable"))]
-mod bench {
-
-    use rand::rngs::StdRng;
-    use rand::{Rng, SeedableRng};
-    use test::Bencher;
-
-    use super::*;
-
-    const TOTAL_NUM_VALUES: u32 = 1_000_000;
-    fn gen_bools(fill_ratio: f64) -> OptionalIndex {
-        let mut out = Vec::new();
-        let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
-        let vals: Vec<RowId> = (0..TOTAL_NUM_VALUES)
-            .map(|_| rng.gen_bool(fill_ratio))
-            .enumerate()
-            .filter(|(_pos, val)| *val)
-            .map(|(pos, _)| pos as RowId)
-            .collect();
-        serialize_optional_index(&&vals[..], TOTAL_NUM_VALUES, &mut out).unwrap();
-
-        open_optional_index(OwnedBytes::new(out)).unwrap()
-    }
-
-    fn random_range_iterator(
-        start: u32,
-        end: u32,
-        avg_step_size: u32,
-        avg_deviation: u32,
-    ) -> impl Iterator<Item = u32> {
-        let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
-        let mut current = start;
-        std::iter::from_fn(move || {
-            current += rng.gen_range(avg_step_size - avg_deviation..=avg_step_size + avg_deviation);
-            if current >= end {
-                None
-            } else {
-                Some(current)
-            }
-        })
-    }
-
-    fn n_percent_step_iterator(percent: f32, num_values: u32) -> impl Iterator<Item = u32> {
-        let ratio = percent / 100.0;
-        let step_size = (1f32 / ratio) as u32;
-        let deviation = step_size - 1;
-        random_range_iterator(0, num_values, step_size, deviation)
-    }
-
-    fn walk_over_data(codec: &OptionalIndex, avg_step_size: u32) -> Option<u32> {
-        walk_over_data_from_positions(
-            codec,
-            random_range_iterator(0, TOTAL_NUM_VALUES, avg_step_size, 0),
-        )
-    }
-
-    fn walk_over_data_from_positions(
-        codec: &OptionalIndex,
-        positions: impl Iterator<Item = u32>,
-    ) -> Option<u32> {
-        let mut dense_idx: Option<u32> = None;
-        for idx in positions {
-            dense_idx = dense_idx.or(codec.rank_if_exists(idx));
-        }
-        dense_idx
-    }
-
-    #[bench]
-    fn bench_translate_orig_to_codec_1percent_filled_10percent_hit(bench: &mut Bencher) {
-        let codec = gen_bools(0.01f64);
-        bench.iter(|| walk_over_data(&codec, 100));
-    }
-
-    #[bench]
-    fn bench_translate_orig_to_codec_5percent_filled_10percent_hit(bench: &mut Bencher) {
-        let codec = gen_bools(0.05f64);
-        bench.iter(|| walk_over_data(&codec, 100));
-    }
-
-    #[bench]
-    fn bench_translate_orig_to_codec_5percent_filled_1percent_hit(bench: &mut Bencher) {
-        let codec = gen_bools(0.05f64);
-        bench.iter(|| walk_over_data(&codec, 1000));
-    }
-
-    #[bench]
-    fn bench_translate_orig_to_codec_full_scan_1percent_filled(bench: &mut Bencher) {
-        let codec = gen_bools(0.01f64);
-        bench.iter(|| walk_over_data_from_positions(&codec, 0..TOTAL_NUM_VALUES));
-    }
-
-    #[bench]
-    fn bench_translate_orig_to_codec_full_scan_10percent_filled(bench: &mut Bencher) {
-        let codec = gen_bools(0.1f64);
-        bench.iter(|| walk_over_data_from_positions(&codec, 0..TOTAL_NUM_VALUES));
-    }
-
-    #[bench]
-    fn bench_translate_orig_to_codec_full_scan_90percent_filled(bench: &mut Bencher) {
-        let codec = gen_bools(0.9f64);
-        bench.iter(|| walk_over_data_from_positions(&codec, 0..TOTAL_NUM_VALUES));
-    }
-
-    #[bench]
-    fn bench_translate_orig_to_codec_10percent_filled_1percent_hit(bench: &mut Bencher) {
-        let codec = gen_bools(0.1f64);
-        bench.iter(|| walk_over_data(&codec, 100));
-    }
-
-    #[bench]
-    fn bench_translate_orig_to_codec_50percent_filled_1percent_hit(bench: &mut Bencher) {
-        let codec = gen_bools(0.5f64);
-        bench.iter(|| walk_over_data(&codec, 100));
-    }
-
-    #[bench]
-    fn bench_translate_orig_to_codec_90percent_filled_1percent_hit(bench: &mut Bencher) {
-        let codec = gen_bools(0.9f64);
-        bench.iter(|| walk_over_data(&codec, 100));
-    }
-
-    #[bench]
-    fn bench_translate_codec_to_orig_1percent_filled_0comma005percent_hit(bench: &mut Bencher) {
-        bench_translate_codec_to_orig_util(0.01f64, 0.005f32, bench);
-    }
-
-    #[bench]
-    fn bench_translate_codec_to_orig_10percent_filled_0comma005percent_hit(bench: &mut Bencher) {
-        bench_translate_codec_to_orig_util(0.1f64, 0.005f32, bench);
-    }
-
-    #[bench]
-    fn bench_translate_codec_to_orig_1percent_filled_10percent_hit(bench: &mut Bencher) {
-        bench_translate_codec_to_orig_util(0.01f64, 10f32, bench);
-    }
-
-    #[bench]
-    fn bench_translate_codec_to_orig_1percent_filled_full_scan(bench: &mut Bencher) {
-        bench_translate_codec_to_orig_util(0.01f64, 100f32, bench);
-    }
-
-    fn bench_translate_codec_to_orig_util(
-        percent_filled: f64,
-        percent_hit: f32,
-        bench: &mut Bencher,
-    ) {
-        let codec = gen_bools(percent_filled);
-        let num_non_nulls = codec.num_non_nulls();
-        let idxs: Vec<u32> = if percent_hit == 100.0f32 {
-            (0..num_non_nulls).collect()
-        } else {
-            n_percent_step_iterator(percent_hit, num_non_nulls).collect()
-        };
-        let mut output = vec![0u32; idxs.len()];
-        bench.iter(|| {
-            output.copy_from_slice(&idxs[..]);
-            codec.select_batch(&mut output);
-        });
-    }
-
-    #[bench]
-    fn bench_translate_codec_to_orig_90percent_filled_0comma005percent_hit(bench: &mut Bencher) {
-        bench_translate_codec_to_orig_util(0.9f64, 0.005, bench);
-    }
-
-    #[bench]
-    fn bench_translate_codec_to_orig_90percent_filled_full_scan(bench: &mut Bencher) {
-        bench_translate_codec_to_orig_util(0.9f64, 100.0f32, bench);
-    }
-}
--- a/columnar/src/column_index/serialize.rs
+++ b/columnar/src/column_index/serialize.rs
@@ -3,11 +3,11 @@ use std::io::Write;

 use common::{CountingWriter, OwnedBytes};

-use super::multivalued_index::SerializableMultivalueIndex;
 use super::OptionalIndex;
+use super::multivalued_index::SerializableMultivalueIndex;
+use crate::column_index::ColumnIndex;
 use crate::column_index::multivalued_index::serialize_multivalued_index;
 use crate::column_index::optional_index::serialize_optional_index;
-use crate::column_index::ColumnIndex;
 use crate::iterable::Iterable;
 use crate::{Cardinality, RowId, Version};

--- a/columnar/src/column_values/bench.rs
+++ b/columnar/src/column_values/bench.rs
@@ -1,139 +0,0 @@
-use std::sync::Arc;
-
-use common::OwnedBytes;
-use rand::rngs::StdRng;
-use rand::{Rng, SeedableRng};
-use test::{self, Bencher};
-
-use super::*;
-use crate::column_values::u64_based::*;
-
-fn get_data() -> Vec<u64> {
-    let mut rng = StdRng::seed_from_u64(2u64);
-    let mut data: Vec<_> = (100..55000_u64)
-        .map(|num| num + rng.gen::<u8>() as u64)
-        .collect();
-    data.push(99_000);
-    data.insert(1000, 2000);
-    data.insert(2000, 100);
-    data.insert(3000, 4100);
-    data.insert(4000, 100);
-    data.insert(5000, 800);
-    data
-}
-
-fn compute_stats(vals: impl Iterator<Item = u64>) -> ColumnStats {
-    let mut stats_collector = StatsCollector::default();
-    for val in vals {
-        stats_collector.collect(val);
-    }
-    stats_collector.stats()
-}
-
-#[inline(never)]
-fn value_iter() -> impl Iterator<Item = u64> {
-    0..20_000
-}
-
-fn get_reader_for_bench<Codec: ColumnCodec>(data: &[u64]) -> Codec::ColumnValues {
-    let mut bytes = Vec::new();
-    let stats = compute_stats(data.iter().cloned());
-    let mut codec_serializer = Codec::estimator();
-    for val in data {
-        codec_serializer.collect(*val);
-    }
-    codec_serializer
-        .serialize(&stats, Box::new(data.iter().copied()).as_mut(), &mut bytes)
-        .unwrap();
-
-    Codec::load(OwnedBytes::new(bytes)).unwrap()
-}
-
-fn bench_get<Codec: ColumnCodec>(b: &mut Bencher, data: &[u64]) {
-    let col = get_reader_for_bench::<Codec>(data);
-    b.iter(|| {
-        let mut sum = 0u64;
-        for pos in value_iter() {
-            let val = col.get_val(pos as u32);
-            sum = sum.wrapping_add(val);
-        }
-        sum
-    });
-}
-
-#[inline(never)]
-fn bench_get_dynamic_helper(b: &mut Bencher, col: Arc<dyn ColumnValues>) {
-    b.iter(|| {
-        let mut sum = 0u64;
-        for pos in value_iter() {
-            let val = col.get_val(pos as u32);
-            sum = sum.wrapping_add(val);
-        }
-        sum
-    });
-}
-
-fn bench_get_dynamic<Codec: ColumnCodec>(b: &mut Bencher, data: &[u64]) {
-    let col = Arc::new(get_reader_for_bench::<Codec>(data));
-    bench_get_dynamic_helper(b, col);
-}
-fn bench_create<Codec: ColumnCodec>(b: &mut Bencher, data: &[u64]) {
-    let stats = compute_stats(data.iter().cloned());
-
-    let mut bytes = Vec::new();
-    b.iter(|| {
-        bytes.clear();
-        let mut codec_serializer = Codec::estimator();
-        for val in data.iter().take(1024) {
-            codec_serializer.collect(*val);
-        }
-
-        codec_serializer.serialize(&stats, Box::new(data.iter().copied()).as_mut(), &mut bytes)
-    });
-}
-
-#[bench]
-fn bench_fastfield_bitpack_create(b: &mut Bencher) {
-    let data: Vec<_> = get_data();
-    bench_create::<BitpackedCodec>(b, &data);
-}
-#[bench]
-fn bench_fastfield_linearinterpol_create(b: &mut Bencher) {
-    let data: Vec<_> = get_data();
-    bench_create::<LinearCodec>(b, &data);
-}
-#[bench]
-fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) {
-    let data: Vec<_> = get_data();
-    bench_create::<BlockwiseLinearCodec>(b, &data);
-}
-#[bench]
-fn bench_fastfield_bitpack_get(b: &mut Bencher) {
-    let data: Vec<_> = get_data();
-    bench_get::<BitpackedCodec>(b, &data);
-}
-#[bench]
-fn bench_fastfield_bitpack_get_dynamic(b: &mut Bencher) {
-    let data: Vec<_> = get_data();
-    bench_get_dynamic::<BitpackedCodec>(b, &data);
-}
-#[bench]
-fn bench_fastfield_linearinterpol_get(b: &mut Bencher) {
-    let data: Vec<_> = get_data();
-    bench_get::<LinearCodec>(b, &data);
-}
-#[bench]
-fn bench_fastfield_linearinterpol_get_dynamic(b: &mut Bencher) {
-    let data: Vec<_> = get_data();
-    bench_get_dynamic::<LinearCodec>(b, &data);
-}
-#[bench]
-fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) {
-    let data: Vec<_> = get_data();
-    bench_get::<BlockwiseLinearCodec>(b, &data);
-}
-#[bench]
-fn bench_fastfield_multilinearinterpol_get_dynamic(b: &mut Bencher) {
-    let data: Vec<_> = get_data();
-    bench_get_dynamic::<BlockwiseLinearCodec>(b, &data);
-}
--- a/columnar/src/column_values/mod.rs
+++ b/columnar/src/column_values/mod.rs
@@ -26,13 +26,13 @@ mod monotonic_column;

 pub(crate) use merge::MergedColumnValues;
 pub use stats::ColumnStats;
-pub use u128_based::{
-    open_u128_as_compact_u64, open_u128_mapped, serialize_column_values_u128,
-    CompactSpaceU64Accessor,
-};
 pub use u64_based::{
-    load_u64_based_column_values, serialize_and_load_u64_based_column_values,
-    serialize_u64_based_column_values, CodecType, ALL_U64_CODEC_TYPES,
+    ALL_U64_CODEC_TYPES, CodecType, load_u64_based_column_values,
+    serialize_and_load_u64_based_column_values, serialize_u64_based_column_values,
+};
+pub use u128_based::{
+    CompactSpaceU64Accessor, open_u128_as_compact_u64, open_u128_mapped,
+    serialize_column_values_u128,
 };
 pub use vec_column::VecColumn;

@@ -242,6 +242,3 @@ impl<T: Copy + PartialOrd + Debug + 'static> ColumnValues<T> for Arc<dyn ColumnV
            .get_row_ids_for_value_range(range, doc_id_range, positions)
    }
 }
-
-#[cfg(all(test, feature = "unstable"))]
-mod bench;
--- a/columnar/src/column_values/monotonic_column.rs
+++ b/columnar/src/column_values/monotonic_column.rs
@@ -2,8 +2,8 @@ use std::fmt::Debug;
 use std::marker::PhantomData;
 use std::ops::{Range, RangeInclusive};

-use crate::column_values::monotonic_mapping::StrictlyMonotonicFn;
 use crate::ColumnValues;
+use crate::column_values::monotonic_mapping::StrictlyMonotonicFn;

 struct MonotonicMappingColumn<C, T, Input> {
    from_column: C,
@@ -99,10 +99,10 @@ where
 #[cfg(test)]
 mod tests {
    use super::*;
+    use crate::column_values::VecColumn;
    use crate::column_values::monotonic_mapping::{
        StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternal,
    };
-    use crate::column_values::VecColumn;

    #[test]
    fn test_monotonic_mapping_iter() {
--- a/columnar/src/column_values/u128_based/compact_space/build_compact_space.rs
+++ b/columnar/src/column_values/u128_based/compact_space/build_compact_space.rs
@@ -185,10 +185,10 @@ impl CompactSpaceBuilder {
        let mut covered_space = Vec::with_capacity(self.blanks.len());

        // beginning of the blanks
-        if let Some(first_blank_start) = self.blanks.first().map(RangeInclusive::start) {
-            if *first_blank_start != 0 {
-                covered_space.push(0..=first_blank_start - 1);
-            }
+        if let Some(first_blank_start) = self.blanks.first().map(RangeInclusive::start)
+            && *first_blank_start != 0
+        {
+            covered_space.push(0..=first_blank_start - 1);
        }

        // Between the blanks
@@ -202,10 +202,10 @@ impl CompactSpaceBuilder {
        covered_space.extend(between_blanks);

        // end of the blanks
-        if let Some(last_blank_end) = self.blanks.last().map(RangeInclusive::end) {
-            if *last_blank_end != u128::MAX {
-                covered_space.push(last_blank_end + 1..=u128::MAX);
-            }
+        if let Some(last_blank_end) = self.blanks.last().map(RangeInclusive::end)
+            && *last_blank_end != u128::MAX
+        {
+            covered_space.push(last_blank_end + 1..=u128::MAX);
        }

        if covered_space.is_empty() {
--- a/columnar/src/column_values/u128_based/compact_space/mod.rs
+++ b/columnar/src/column_values/u128_based/compact_space/mod.rs
@@ -24,8 +24,8 @@ use build_compact_space::get_compact_space;
 use common::{BinarySerializable, CountingWriter, OwnedBytes, VInt, VIntU128};
 use tantivy_bitpacker::{BitPacker, BitUnpacker};

-use crate::column_values::ColumnValues;
 use crate::RowId;
+use crate::column_values::ColumnValues;

 /// The cost per blank is quite hard actually, since blanks are delta encoded, the actual cost of
 /// blanks depends on the number of blanks.
@@ -653,12 +653,14 @@ mod tests {
            ),
            &[3]
        );
-        assert!(get_positions_for_value_range_helper(
-            &decomp,
-            99998u128..=99998u128,
-            complete_range.clone()
-        )
-        .is_empty());
+        assert!(
+            get_positions_for_value_range_helper(
+                &decomp,
+                99998u128..=99998u128,
+                complete_range.clone()
+            )
+            .is_empty()
+        );
        assert_eq!(
            &get_positions_for_value_range_helper(
                &decomp,
--- a/columnar/src/column_values/u128_based/mod.rs
+++ b/columnar/src/column_values/u128_based/mod.rs
@@ -130,11 +130,11 @@ pub fn open_u128_as_compact_u64(mut bytes: OwnedBytes) -> io::Result<Arc<dyn Col
 #[cfg(test)]
 pub(crate) mod tests {
    use super::*;
-    use crate::column_values::u64_based::{
-        serialize_and_load_u64_based_column_values, serialize_u64_based_column_values,
-        ALL_U64_CODEC_TYPES,
-    };
    use crate::column_values::CodecType;
+    use crate::column_values::u64_based::{
+        ALL_U64_CODEC_TYPES, serialize_and_load_u64_based_column_values,
+        serialize_u64_based_column_values,
+    };

    #[test]
    fn test_serialize_deserialize_u128_header() {
--- a/columnar/src/column_values/u64_based/bitpacked.rs
+++ b/columnar/src/column_values/u64_based/bitpacked.rs
@@ -4,7 +4,7 @@ use std::ops::{Range, RangeInclusive};

 use common::{BinarySerializable, OwnedBytes};
 use fastdivide::DividerU64;
-use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
+use tantivy_bitpacker::{BitPacker, BitUnpacker, compute_num_bits};

 use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, ColumnStats};
 use crate::{ColumnValues, RowId};
@@ -23,11 +23,7 @@ const fn div_ceil(n: u64, q: NonZeroU64) -> u64 {
    // copied from unstable rust standard library.
    let d = n / q.get();
    let r = n % q.get();
-    if r > 0 {
-        d + 1
-    } else {
-        d
-    }
+    if r > 0 { d + 1 } else { d }
 }

 // The bitpacked codec applies a linear transformation `f` over data that are bitpacked.
@@ -109,7 +105,7 @@ impl ColumnCodecEstimator for BitpackedCodecEstimator {

    fn estimate(&self, stats: &ColumnStats) -> Option<u64> {
        let num_bits_per_value = num_bits(stats);
-        Some(stats.num_bytes() + (stats.num_rows as u64 * (num_bits_per_value as u64) + 7) / 8)
+        Some(stats.num_bytes() + (stats.num_rows as u64 * (num_bits_per_value as u64)).div_ceil(8))
    }

    fn serialize(
--- a/columnar/src/column_values/u64_based/blockwise_linear.rs
+++ b/columnar/src/column_values/u64_based/blockwise_linear.rs
@@ -4,12 +4,12 @@ use std::{io, iter};

 use common::{BinarySerializable, CountingWriter, DeserializeFrom, OwnedBytes};
 use fastdivide::DividerU64;
-use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
+use tantivy_bitpacker::{BitPacker, BitUnpacker, compute_num_bits};

+use crate::MonotonicallyMappableToU64;
 use crate::column_values::u64_based::line::Line;
 use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, ColumnStats};
 use crate::column_values::{ColumnValues, VecColumn};
-use crate::MonotonicallyMappableToU64;

 const BLOCK_SIZE: u32 = 512u32;

--- a/columnar/src/column_values/u64_based/linear.rs
+++ b/columnar/src/column_values/u64_based/linear.rs
@@ -1,13 +1,13 @@
 use std::io;

 use common::{BinarySerializable, OwnedBytes};
-use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
+use tantivy_bitpacker::{BitPacker, BitUnpacker, compute_num_bits};

-use super::line::Line;
 use super::ColumnValues;
-use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, ColumnStats};
-use crate::column_values::VecColumn;
+use super::line::Line;
 use crate::RowId;
+use crate::column_values::VecColumn;
+use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, ColumnStats};

 const HALF_SPACE: u64 = u64::MAX / 2;
 const LINE_ESTIMATION_BLOCK_LEN: usize = 512;
@@ -117,7 +117,7 @@ impl ColumnCodecEstimator for LinearCodecEstimator {
        Some(
            stats.num_bytes()
                + linear_params.num_bytes()
-                + (num_bits as u64 * stats.num_rows as u64 + 7) / 8,
+                + (num_bits as u64 * stats.num_rows as u64).div_ceil(8),
        )
    }

--- a/columnar/src/column_values/u64_based/mod.rs
+++ b/columnar/src/column_values/u64_based/mod.rs
@@ -17,7 +17,7 @@ pub use crate::column_values::u64_based::bitpacked::BitpackedCodec;
 pub use crate::column_values::u64_based::blockwise_linear::BlockwiseLinearCodec;
 pub use crate::column_values::u64_based::linear::LinearCodec;
 pub use crate::column_values::u64_based::stats_collector::StatsCollector;
-use crate::column_values::{monotonic_map_column, ColumnStats};
+use crate::column_values::{ColumnStats, monotonic_map_column};
 use crate::iterable::Iterable;
 use crate::{ColumnValues, MonotonicallyMappableToU64};

--- a/columnar/src/column_values/u64_based/stats_collector.rs
+++ b/columnar/src/column_values/u64_based/stats_collector.rs
@@ -2,8 +2,8 @@ use std::num::NonZeroU64;

 use fastdivide::DividerU64;

-use crate::column_values::ColumnStats;
 use crate::RowId;
+use crate::column_values::ColumnStats;

 /// Compute the gcd of two non null numbers.
 ///
@@ -96,8 +96,8 @@ impl StatsCollector {
 mod tests {
    use std::num::NonZeroU64;

-    use crate::column_values::u64_based::stats_collector::{compute_gcd, StatsCollector};
    use crate::column_values::u64_based::ColumnStats;
+    use crate::column_values::u64_based::stats_collector::{StatsCollector, compute_gcd};

    fn compute_stats(vals: impl Iterator<Item = u64>) -> ColumnStats {
        let mut stats_collector = StatsCollector::default();
--- a/columnar/src/column_values/u64_based/tests.rs
+++ b/columnar/src/column_values/u64_based/tests.rs
@@ -1,5 +1,6 @@
 use proptest::prelude::*;
 use proptest::{prop_oneof, proptest};
+use rand::Rng;

 #[test]
 fn test_serialize_and_load_simple() {
--- a/columnar/src/columnar/column_type.rs
+++ b/columnar/src/columnar/column_type.rs
@@ -4,8 +4,8 @@ use std::net::Ipv6Addr;

 use serde::{Deserialize, Serialize};

-use crate::value::NumericalType;
 use crate::InvalidData;
+use crate::value::NumericalType;

 /// The column type represents the column type.
 /// Any changes need to be propagated to `COLUMN_TYPES`.
--- a/columnar/src/columnar/merge/mod.rs
+++ b/columnar/src/columnar/merge/mod.rs
@@ -10,11 +10,11 @@ use std::sync::Arc;
 pub use merge_mapping::{MergeRowOrder, ShuffleMergeOrder, StackMergeOrder};

 use super::writer::ColumnarSerializer;
-use crate::column::{serialize_column_mappable_to_u128, serialize_column_mappable_to_u64};
+use crate::column::{serialize_column_mappable_to_u64, serialize_column_mappable_to_u128};
 use crate::column_values::MergedColumnValues;
+use crate::columnar::ColumnarReader;
 use crate::columnar::merge::merge_dict_column::merge_bytes_or_str_column;
 use crate::columnar::writer::CompatibleNumericalTypes;
-use crate::columnar::ColumnarReader;
 use crate::dynamic_column::DynamicColumn;
 use crate::{
    BytesColumn, Column, ColumnIndex, ColumnType, ColumnValues, DynamicColumnHandle, NumericalType,
@@ -144,16 +144,17 @@ fn merge_column(
            let mut column_values: Vec<Option<Arc<dyn ColumnValues>>> =
                Vec::with_capacity(columns_to_merge.len());
            for (i, dynamic_column_opt) in columns_to_merge.into_iter().enumerate() {
-                if let Some(Column { index: idx, values }) =
-                    dynamic_column_opt.and_then(dynamic_column_to_u64_monotonic)
-                {
-                    column_indexes.push(idx);
-                    column_values.push(Some(values));
-                } else {
-                    column_indexes.push(ColumnIndex::Empty {
-                        num_docs: num_docs_per_column[i],
-                    });
-                    column_values.push(None);
+                match dynamic_column_opt.and_then(dynamic_column_to_u64_monotonic) {
+                    Some(Column { index: idx, values }) => {
+                        column_indexes.push(idx);
+                        column_values.push(Some(values));
+                    }
+                    None => {
+                        column_indexes.push(ColumnIndex::Empty {
+                            num_docs: num_docs_per_column[i],
+                        });
+                        column_values.push(None);
+                    }
                }
            }
            let merged_column_index =
@@ -253,11 +254,13 @@ impl GroupedColumns {
        }
        // At the moment, only the numerical column type category has more than one possible
        // column type.
-        assert!(self
-            .columns
-            .iter()
-            .flatten()
-            .all(|el| ColumnTypeCategory::from(el.column_type()) == ColumnTypeCategory::Numerical));
+        assert!(
+            self.columns
+                .iter()
+                .flatten()
+                .all(|el| ColumnTypeCategory::from(el.column_type())
+                    == ColumnTypeCategory::Numerical)
+        );
        merged_numerical_columns_type(self.columns.iter().flatten()).into()
    }
 }
@@ -364,7 +367,7 @@ fn is_empty_after_merge(
                    ColumnIndex::Empty { .. } => true,
                    ColumnIndex::Full => alive_bitset.len() == 0,
                    ColumnIndex::Optional(optional_index) => {
-                        for doc in optional_index.iter_docs() {
+                        for doc in optional_index.iter_non_null_docs() {
                            if alive_bitset.contains(doc) {
                                return false;
                            }
--- a/columnar/src/columnar/merge/term_merger.rs
+++ b/columnar/src/columnar/merge/term_merger.rs
@@ -74,18 +74,19 @@ impl<'a> TermMerger<'a> {
    /// False if there is none.
    pub fn advance(&mut self) -> bool {
        self.advance_segments();
-        if let Some(head) = self.heap.pop() {
-            self.term_streams_with_segment.push(head);
-            while let Some(next_streamer) = self.heap.peek() {
-                if self.term_streams_with_segment[0].terms.key() != next_streamer.terms.key() {
-                    break;
+        match self.heap.pop() {
+            Some(head) => {
+                self.term_streams_with_segment.push(head);
+                while let Some(next_streamer) = self.heap.peek() {
+                    if self.term_streams_with_segment[0].terms.key() != next_streamer.terms.key() {
+                        break;
+                    }
+                    let next_heap_it = self.heap.pop().unwrap(); // safe : we peeked beforehand
+                    self.term_streams_with_segment.push(next_heap_it);
                }
-                let next_heap_it = self.heap.pop().unwrap(); // safe : we peeked beforehand
-                self.term_streams_with_segment.push(next_heap_it);
+                true
            }
-            true
-        } else {
-            false
+            _ => false,
        }
    }

--- a/columnar/src/columnar/merge/tests.rs
+++ b/columnar/src/columnar/merge/tests.rs
@@ -3,7 +3,7 @@ use proptest::collection::vec;
 use proptest::prelude::*;

 use super::*;
-use crate::columnar::{merge_columnar, ColumnarReader, MergeRowOrder, StackMergeOrder};
+use crate::columnar::{ColumnarReader, MergeRowOrder, StackMergeOrder, merge_columnar};
 use crate::{Cardinality, ColumnarWriter, DynamicColumn, HasAssociatedColumnType, RowId};

 fn make_columnar<T: Into<NumericalValue> + HasAssociatedColumnType + Copy>(
--- a/columnar/src/columnar/mod.rs
+++ b/columnar/src/columnar/mod.rs
@@ -5,9 +5,9 @@ mod reader;
 mod writer;

 pub use column_type::{ColumnType, HasAssociatedColumnType};
-pub use format_version::{Version, CURRENT_VERSION};
+pub use format_version::{CURRENT_VERSION, Version};
 #[cfg(test)]
 pub(crate) use merge::ColumnTypeCategory;
-pub use merge::{merge_columnar, MergeRowOrder, ShuffleMergeOrder, StackMergeOrder};
+pub use merge::{MergeRowOrder, ShuffleMergeOrder, StackMergeOrder, merge_columnar};
 pub use reader::ColumnarReader;
 pub use writer::ColumnarWriter;
--- a/columnar/src/columnar/reader/mod.rs
+++ b/columnar/src/columnar/reader/mod.rs
@@ -1,11 +1,11 @@
 use std::{fmt, io, mem};

+use common::BinarySerializable;
 use common::file_slice::FileSlice;
 use common::json_path_writer::JSON_PATH_SEGMENT_SEP;
-use common::BinarySerializable;
 use sstable::{Dictionary, RangeSSTable};

-use crate::columnar::{format_version, ColumnType};
+use crate::columnar::{ColumnType, format_version};
 use crate::dynamic_column::DynamicColumnHandle;
 use crate::{RowId, Version};

--- a/columnar/src/columnar/writer/column_operation.rs
+++ b/columnar/src/columnar/writer/column_operation.rs
@@ -244,7 +244,7 @@ impl SymbolValue for UnorderedId {

 fn compute_num_bytes_for_u64(val: u64) -> usize {
    let msb = (64u32 - val.leading_zeros()) as usize;
-    (msb + 7) / 8
+    msb.div_ceil(8)
 }

 fn encode_zig_zag(n: i64) -> u64 {
--- a/columnar/src/columnar/writer/column_writers.rs
+++ b/columnar/src/columnar/writer/column_writers.rs
@@ -42,7 +42,7 @@ impl ColumnWriter {
        &self,
        arena: &MemoryArena,
        buffer: &'a mut Vec<u8>,
-    ) -> impl Iterator<Item = ColumnOperation<V>> + 'a {
+    ) -> impl Iterator<Item = ColumnOperation<V>> + 'a + use<'a, V> {
        buffer.clear();
        self.values.read_to_end(arena, buffer);
        let mut cursor: &[u8] = &buffer[..];
@@ -104,9 +104,10 @@ pub(crate) struct NumericalColumnWriter {

 impl NumericalColumnWriter {
    pub fn force_numerical_type(&mut self, numerical_type: NumericalType) {
-        assert!(self
-            .compatible_numerical_types
-            .is_type_accepted(numerical_type));
+        assert!(
+            self.compatible_numerical_types
+                .is_type_accepted(numerical_type)
+        );
        self.compatible_numerical_types = CompatibleNumericalTypes::StaticType(numerical_type);
    }
 }
@@ -211,7 +212,7 @@ impl NumericalColumnWriter {
        self,
        arena: &MemoryArena,
        buffer: &'a mut Vec<u8>,
-    ) -> impl Iterator<Item = ColumnOperation<NumericalValue>> + 'a {
+    ) -> impl Iterator<Item = ColumnOperation<NumericalValue>> + 'a + use<'a> {
        self.column_writer.operation_iterator(arena, buffer)
    }
 }
@@ -255,7 +256,7 @@ impl StrOrBytesColumnWriter {
        &self,
        arena: &MemoryArena,
        byte_buffer: &'a mut Vec<u8>,
-    ) -> impl Iterator<Item = ColumnOperation<UnorderedId>> + 'a {
+    ) -> impl Iterator<Item = ColumnOperation<UnorderedId>> + 'a + use<'a> {
        self.column_writer.operation_iterator(arena, byte_buffer)
    }
 }
--- a/columnar/src/columnar/writer/mod.rs
+++ b/columnar/src/columnar/writer/mod.rs
@@ -8,13 +8,13 @@ use std::net::Ipv6Addr;

 use column_operation::ColumnOperation;
 pub(crate) use column_writers::CompatibleNumericalTypes;
-use common::json_path_writer::JSON_END_OF_PATH;
 use common::CountingWriter;
+use common::json_path_writer::JSON_END_OF_PATH;
 pub(crate) use serializer::ColumnarSerializer;
 use stacker::{Addr, ArenaHashMap, MemoryArena};

 use crate::column_index::{SerializableColumnIndex, SerializableOptionalIndex};
-use crate::column_values::{MonotonicallyMappableToU128, MonotonicallyMappableToU64};
+use crate::column_values::{MonotonicallyMappableToU64, MonotonicallyMappableToU128};
 use crate::columnar::column_type::ColumnType;
 use crate::columnar::writer::column_writers::{
    ColumnWriter, NumericalColumnWriter, StrOrBytesColumnWriter,
--- a/columnar/src/columnar/writer/serializer.rs
+++ b/columnar/src/columnar/writer/serializer.rs
@@ -3,11 +3,11 @@ use std::io::Write;

 use common::json_path_writer::JSON_END_OF_PATH;
 use common::{BinarySerializable, CountingWriter};
-use sstable::value::RangeValueWriter;
 use sstable::RangeSSTable;
+use sstable::value::RangeValueWriter;

-use crate::columnar::ColumnType;
 use crate::RowId;
+use crate::columnar::ColumnType;

 pub struct ColumnarSerializer<W: io::Write> {
    wrt: CountingWriter<W>,
--- a/columnar/src/columnar/writer/value_index.rs
+++ b/columnar/src/columnar/writer/value_index.rs
@@ -1,6 +1,6 @@
+use crate::RowId;
 use crate::column_index::{SerializableMultivalueIndex, SerializableOptionalIndex};
 use crate::iterable::Iterable;
-use crate::RowId;

 /// The `IndexBuilder` interprets a sequence of
 /// calls of the form:
@@ -31,12 +31,13 @@ pub struct OptionalIndexBuilder {

 impl OptionalIndexBuilder {
    pub fn finish(&mut self, num_rows: RowId) -> impl Iterable<RowId> + '_ {
-        debug_assert!(self
-            .docs
-            .last()
-            .copied()
-            .map(|last_doc| last_doc < num_rows)
-            .unwrap_or(true));
+        debug_assert!(
+            self.docs
+                .last()
+                .copied()
+                .map(|last_doc| last_doc < num_rows)
+                .unwrap_or(true)
+        );
        &self.docs[..]
    }

@@ -48,12 +49,13 @@ impl OptionalIndexBuilder {
 impl IndexBuilder for OptionalIndexBuilder {
    #[inline(always)]
    fn record_row(&mut self, doc: RowId) {
-        debug_assert!(self
-            .docs
-            .last()
-            .copied()
-            .map(|prev_doc| doc > prev_doc)
-            .unwrap_or(true));
+        debug_assert!(
+            self.docs
+                .last()
+                .copied()
+                .map(|prev_doc| doc > prev_doc)
+                .unwrap_or(true)
+        );
        self.docs.push(doc);
    }
 }
--- a/columnar/src/compat_tests.rs
+++ b/columnar/src/compat_tests.rs
@@ -3,8 +3,8 @@ use std::path::PathBuf;
 use itertools::Itertools;

 use crate::{
-    merge_columnar, Cardinality, Column, ColumnarReader, DynamicColumn, StackMergeOrder,
-    CURRENT_VERSION,
+    CURRENT_VERSION, Cardinality, Column, ColumnarReader, DynamicColumn, StackMergeOrder,
+    merge_columnar,
 };

 const NUM_DOCS: u32 = u16::MAX as u32;
--- a/columnar/src/dynamic_column.rs
+++ b/columnar/src/dynamic_column.rs
@@ -6,7 +6,7 @@ use common::file_slice::FileSlice;
 use common::{ByteCount, DateTime, HasLen, OwnedBytes};

 use crate::column::{BytesColumn, Column, StrColumn};
-use crate::column_values::{monotonic_map_column, StrictlyMonotonicFn};
+use crate::column_values::{StrictlyMonotonicFn, monotonic_map_column};
 use crate::columnar::ColumnType;
 use crate::{Cardinality, ColumnIndex, ColumnValues, NumericalType, Version};

--- a/columnar/src/lib.rs
+++ b/columnar/src/lib.rs
@@ -17,15 +17,10 @@
 //!       column.
 //!     - [column_values]: Stores the values of a column in a dense format.

-#![cfg_attr(all(feature = "unstable", test), feature(test))]
-
 #[cfg(test)]
 #[macro_use]
 extern crate more_asserts;

-#[cfg(all(test, feature = "unstable"))]
-extern crate test;
-
 use std::fmt::Display;
 use std::io;

@@ -44,11 +39,11 @@ pub use block_accessor::ColumnBlockAccessor;
 pub use column::{BytesColumn, Column, StrColumn};
 pub use column_index::ColumnIndex;
 pub use column_values::{
-    ColumnValues, EmptyColumnValues, MonotonicallyMappableToU128, MonotonicallyMappableToU64,
+    ColumnValues, EmptyColumnValues, MonotonicallyMappableToU64, MonotonicallyMappableToU128,
 };
 pub use columnar::{
-    merge_columnar, ColumnType, ColumnarReader, ColumnarWriter, HasAssociatedColumnType,
-    MergeRowOrder, ShuffleMergeOrder, StackMergeOrder, Version, CURRENT_VERSION,
+    CURRENT_VERSION, ColumnType, ColumnarReader, ColumnarWriter, HasAssociatedColumnType,
+    MergeRowOrder, ShuffleMergeOrder, StackMergeOrder, Version, merge_columnar,
 };
 use sstable::VoidSSTable;
 pub use value::{NumericalType, NumericalValue};
--- a/columnar/src/tests.rs
+++ b/columnar/src/tests.rs
@@ -716,8 +716,8 @@ fn test_columnar_merging_number_columns() {
 // TODO document edge case: required_columns incompatible with values.

 #[allow(clippy::type_complexity)]
-fn columnar_docs_and_remap(
-) -> impl Strategy<Value = (Vec<Vec<Vec<(&'static str, ColumnValue)>>>, Vec<RowAddr>)> {
+fn columnar_docs_and_remap()
+-> impl Strategy<Value = (Vec<Vec<Vec<(&'static str, ColumnValue)>>>, Vec<RowAddr>)> {
    proptest::collection::vec(columnar_docs_strategy(), 2..=3).prop_flat_map(
        |columnars_docs: Vec<Vec<Vec<(&str, ColumnValue)>>>| {
            let row_addrs: Vec<RowAddr> = columnars_docs
--- a/columnar/src/value.rs
+++ b/columnar/src/value.rs
@@ -1,3 +1,5 @@
+use std::str::FromStr;
+
 use common::DateTime;

 use crate::InvalidData;
@@ -9,6 +11,23 @@ pub enum NumericalValue {
    F64(f64),
 }

+/// Parses a `NumericalValue` from its textual representation.
+///
+/// The candidate types are tried in order: `i64`, then `u64`, then `f64`.
+/// A value that only parses as `f64` goes through `normalize()` before being
+/// returned. If none of the three parses succeeds, `Err(())` is returned.
+impl FromStr for NumericalValue {
+    type Err = ();
+
+    fn from_str(s: &str) -> Result<Self, ()> {
+        // `i64` is tried first, so any string that fits in an `i64` yields that type.
+        if let Ok(val_i64) = s.parse::<i64>() {
+            return Ok(val_i64.into());
+        }
+        // Reached only when the `i64` parse failed.
+        if let Ok(val_u64) = s.parse::<u64>() {
+            return Ok(val_u64.into());
+        }
+        // Last resort: parse as a float, then let `normalize()` pick the final variant.
+        if let Ok(val_f64) = s.parse::<f64>() {
+            return Ok(NumericalValue::from(val_f64).normalize());
+        }
+        Err(())
+    }
+}
+
 impl NumericalValue {
    pub fn numerical_type(&self) -> NumericalType {
        match self {
@@ -26,7 +45,7 @@ impl NumericalValue {
                if val <= i64::MAX as u64 {
                    NumericalValue::I64(val as i64)
                } else {
-                    NumericalValue::F64(val as f64)
+                    NumericalValue::U64(val)
                }
            }
            NumericalValue::I64(val) => NumericalValue::I64(val),
@@ -141,6 +160,7 @@ impl Coerce for DateTime {
 #[cfg(test)]
 mod tests {
    use super::NumericalType;
+    use crate::NumericalValue;

    #[test]
    fn test_numerical_type_code() {
@@ -153,4 +173,58 @@ mod tests {
        }
        assert_eq!(num_numerical_type, 3);
    }
+
+    #[test]
+    fn test_parse_numerical() {
+        // A plain integer is expected to parse as `I64`.
+        assert_eq!(
+            "123".parse::<NumericalValue>().unwrap(),
+            NumericalValue::I64(123)
+        );
+        // `u64::MAX` does not fit in an `i64` and is expected to parse as `U64`.
+        assert_eq!(
+            "18446744073709551615".parse::<NumericalValue>().unwrap(),
+            NumericalValue::U64(18446744073709551615u64)
+        );
+        // A float literal holding an integral value is expected to come back as `I64`.
+        assert_eq!(
+            "1.0".parse::<NumericalValue>().unwrap(),
+            NumericalValue::I64(1i64)
+        );
+        // A float literal with a fractional part is expected to stay `F64`.
+        assert_eq!(
+            "1.1".parse::<NumericalValue>().unwrap(),
+            NumericalValue::F64(1.1f64)
+        );
+        // Same as the `1.0` case, with a negative value.
+        assert_eq!(
+            "-1.0".parse::<NumericalValue>().unwrap(),
+            NumericalValue::I64(-1i64)
+        );
+    }
+
+    #[test]
+    fn test_normalize_numerical() {
+        // A `u64` that fits in an `i64` is expected to normalize to `I64`.
+        assert_eq!(
+            NumericalValue::from(1u64).normalize(),
+            NumericalValue::I64(1i64),
+        );
+        // The first `u64` above `i64::MAX` is expected to stay `U64`.
+        let limit_val = i64::MAX as u64 + 1u64;
+        assert_eq!(
+            NumericalValue::from(limit_val).normalize(),
+            NumericalValue::U64(limit_val),
+        );
+        // An `i64` is expected to be left untouched.
+        assert_eq!(
+            NumericalValue::from(-1i64).normalize(),
+            NumericalValue::I64(-1i64),
+        );
+        // A float holding an integral value is expected to normalize to `I64`.
+        assert_eq!(
+            NumericalValue::from(-2.0f64).normalize(),
+            NumericalValue::I64(-2i64),
+        );
+        // A float with a fractional part is expected to stay `F64`.
+        assert_eq!(
+            NumericalValue::from(-2.1f64).normalize(),
+            NumericalValue::F64(-2.1f64),
+        );
+        // 2^70 is integral but exceeds the range of both `i64` and `u64`;
+        // it is expected to stay `F64`.
+        let large_float = 2.0f64.powf(70.0f64);
+        assert_eq!(
+            NumericalValue::from(large_float).normalize(),
+            NumericalValue::F64(large_float),
+        );
+    }
 }
--- a/common/Cargo.toml
+++ b/common/Cargo.toml
@@ -1,9 +1,9 @@
 [package]
 name = "tantivy-common"
-version = "0.9.0"
+version = "0.10.0"
 authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
 license = "MIT"
-edition = "2021"
+edition = "2024"
 description = "common traits and utility functions used by multiple tantivy subcrates"
 documentation = "https://docs.rs/tantivy_common/"
 homepage = "https://github.com/quickwit-oss/tantivy"
--- a/common/benches/bench.rs
+++ b/common/benches/bench.rs
@@ -1,7 +1,7 @@
-use binggan::{black_box, BenchRunner};
+use binggan::{BenchRunner, black_box};
 use rand::seq::IteratorRandom;
 use rand::thread_rng;
-use tantivy_common::{serialize_vint_u32, BitSet, TinySet};
+use tantivy_common::{BitSet, TinySet, serialize_vint_u32};

 fn bench_vint() {
    let mut runner = BenchRunner::new();
--- a/common/src/bitset.rs
+++ b/common/src/bitset.rs
@@ -183,7 +183,7 @@ pub struct BitSet {
 }

 fn num_buckets(max_val: u32) -> u32 {
-    (max_val + 63u32) / 64u32
+    // `max_val / 64`, rounded up.
+    max_val.div_ceil(64u32)
 }

 impl BitSet {
--- a/common/src/bounds.rs
+++ b/common/src/bounds.rs
@@ -65,11 +65,11 @@ pub fn transform_bound_inner_res<TFrom, TTo>(
 ) -> io::Result<Bound<TTo>> {
    use self::Bound::*;
    Ok(match bound {
-        Excluded(ref from_val) => match transform(from_val)? {
+        Excluded(from_val) => match transform(from_val)? {
            TransformBound::NewBound(new_val) => new_val,
            TransformBound::Existing(new_val) => Excluded(new_val),
        },
-        Included(ref from_val) => match transform(from_val)? {
+        Included(from_val) => match transform(from_val)? {
            TransformBound::NewBound(new_val) => new_val,
            TransformBound::Existing(new_val) => Included(new_val),
        },
@@ -85,11 +85,11 @@ pub fn transform_bound_inner<TFrom, TTo>(
 ) -> Bound<TTo> {
    use self::Bound::*;
    match bound {
-        Excluded(ref from_val) => match transform(from_val) {
+        Excluded(from_val) => match transform(from_val) {
            TransformBound::NewBound(new_val) => new_val,
            TransformBound::Existing(new_val) => Excluded(new_val),
        },
-        Included(ref from_val) => match transform(from_val) {
+        Included(from_val) => match transform(from_val) {
            TransformBound::NewBound(new_val) => new_val,
            TransformBound::Existing(new_val) => Included(new_val),
        },
@@ -111,8 +111,8 @@ pub fn map_bound<TFrom, TTo>(
 ) -> Bound<TTo> {
    use self::Bound::*;
    match bound {
-        Excluded(ref from_val) => Bound::Excluded(transform(from_val)),
-        Included(ref from_val) => Bound::Included(transform(from_val)),
+        Excluded(from_val) => Bound::Excluded(transform(from_val)),
+        Included(from_val) => Bound::Included(transform(from_val)),
        Unbounded => Unbounded,
    }
 }
@@ -123,8 +123,8 @@ pub fn map_bound_res<TFrom, TTo, Err>(
 ) -> Result<Bound<TTo>, Err> {
    use self::Bound::*;
    Ok(match bound {
-        Excluded(ref from_val) => Excluded(transform(from_val)?),
-        Included(ref from_val) => Included(transform(from_val)?),
+        Excluded(from_val) => Excluded(transform(from_val)?),
+        Included(from_val) => Included(transform(from_val)?),
        Unbounded => Unbounded,
    })
 }
--- a/common/src/file_slice.rs
+++ b/common/src/file_slice.rs
@@ -74,7 +74,7 @@ impl FileHandle for WrapFile {
        {
            use std::io::{Read, Seek};
            let mut file = self.file.try_clone()?; // Clone the file to read from it separately
-                                                   // Seek to the start position in the file
+            // Seek to the start position in the file
            file.seek(io::SeekFrom::Start(start as u64))?;
            // Read the data into the buffer
            file.read_exact(&mut buffer)?;
@@ -346,8 +346,8 @@ mod tests {
    use std::sync::Arc;

    use super::{FileHandle, FileSlice};
-    use crate::file_slice::combine_ranges;
    use crate::HasLen;
+    use crate::file_slice::combine_ranges;

    #[test]
    fn test_file_slice() -> io::Result<()> {
--- a/common/src/lib.rs
+++ b/common/src/lib.rs
@@ -22,7 +22,7 @@ pub use json_path_writer::JsonPathWriter;
 pub use ownedbytes::{OwnedBytes, StableDeref};
 pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
 pub use vint::{
-    read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt, VIntU128,
+    VInt, VIntU128, read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint,
 };
 pub use writer::{AntiCallToken, CountingWriter, TerminatingWrite};

@@ -177,8 +177,10 @@ pub(crate) mod test {

    #[test]
    fn test_f64_order() {
-        assert!(!(f64_to_u64(f64::NEG_INFINITY)..f64_to_u64(f64::INFINITY))
-            .contains(&f64_to_u64(f64::NAN))); // nan is not a number
+        assert!(
+            !(f64_to_u64(f64::NEG_INFINITY)..f64_to_u64(f64::INFINITY))
+                .contains(&f64_to_u64(f64::NAN))
+        ); // nan is not a number
        assert!(f64_to_u64(1.5) > f64_to_u64(1.0)); // same exponent, different mantissa
        assert!(f64_to_u64(2.0) > f64_to_u64(1.0)); // same mantissa, different exponent
        assert!(f64_to_u64(2.0) > f64_to_u64(1.5)); // different exponent and mantissa
--- a/common/src/vint.rs
+++ b/common/src/vint.rs
@@ -29,6 +29,7 @@ impl BinarySerializable for VIntU128 {
    }

    fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
+        #[allow(clippy::unbuffered_bytes)]
        let mut bytes = reader.bytes();
        let mut result = 0u128;
        let mut shift = 0u64;
@@ -52,7 +53,7 @@ impl BinarySerializable for VIntU128 {
    }
 }

-///   Wrapper over a `u64` that serializes as a variable int.
+/// Wrapper over a `u64` that serializes as a variable int.
 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
 pub struct VInt(pub u64);

@@ -196,6 +197,7 @@ impl BinarySerializable for VInt {
    }

    fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
+        #[allow(clippy::unbuffered_bytes)]
        let mut bytes = reader.bytes();
        let mut result = 0u64;
        let mut shift = 0u64;
@@ -222,7 +224,7 @@ impl BinarySerializable for VInt {
 #[cfg(test)]
 mod tests {

-    use super::{serialize_vint_u32, BinarySerializable, VInt};
+    use super::{BinarySerializable, VInt, serialize_vint_u32};

    fn aux_test_vint(val: u64) {
        let mut v = [14u8; 10];
--- a/doc/assets/images/paradedb.png
+++ b/doc/assets/images/paradedb.png
--- a/examples/basic_search.rs
+++ b/examples/basic_search.rs
@@ -51,7 +51,7 @@ fn main() -> tantivy::Result<()> {

    // Our second field is body.
    // We want full-text search for it, but we do not
-    // need to be able to be able to retrieve it
+    // need to be able to retrieve it
    // for our application.
    //
    // We can make our index lighter by omitting the `STORED` flag.
--- a/query-grammar/Cargo.toml
+++ b/query-grammar/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tantivy-query-grammar"
-version = "0.24.0"
+version = "0.25.0"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
@@ -9,9 +9,11 @@ homepage = "https://github.com/quickwit-oss/tantivy"
 repository = "https://github.com/quickwit-oss/tantivy"
 readme = "README.md"
 keywords = ["search", "information", "retrieval"]
-edition = "2021"
+edition = "2024"

 [dependencies]
+fnv = "1.0.7"
 nom = "7"
+ordered-float = "5.0.0"
 serde = { version = "1.0.219", features = ["derive"] }
 serde_json = "1.0.140"
--- a/query-grammar/src/infallible.rs
+++ b/query-grammar/src/infallible.rs
@@ -117,6 +117,22 @@ where F: nom::Parser<I, (O, ErrorList), Infallible> {
    }
 }

+/// Runs `first`, then `second` on the remaining input, and returns the output of `first`.
+///
+/// The output of `second` is discarded. The error lists produced by the two parsers are
+/// concatenated (errors from `first` come first) and returned alongside the output.
+pub(crate) fn terminated_infallible<I, O1, O2, F, G>(
+    mut first: F,
+    mut second: G,
+) -> impl FnMut(I) -> JResult<I, O1>
+where
+    F: nom::Parser<I, (O1, ErrorList), Infallible>,
+    G: nom::Parser<I, (O2, ErrorList), Infallible>,
+{
+    move |input: I| {
+        let (input, (o1, mut err)) = first.parse(input)?;
+        // Only the errors of `second` are kept; its output is dropped.
+        let (input, (_, mut err2)) = second.parse(input)?;
+        err.append(&mut err2);
+        Ok((input, (o1, err)))
+    }
+}
+
 pub(crate) fn delimited_infallible<I, O1, O2, O3, F, G, H>(
    mut first: F,
    mut second: G,
@@ -186,19 +202,19 @@ macro_rules! tuple_trait_impl(
 );

 macro_rules! tuple_trait_inner(
-  ($it:tt, $self:expr, $input:expr, (), $error_list:expr, $head:ident $($id:ident)+) => ({
+  ($it:tt, $self:expr_2021, $input:expr_2021, (), $error_list:expr_2021, $head:ident $($id:ident)+) => ({
    let (i, (o, mut err)) = $self.$it.parse($input.clone())?;
    $error_list.append(&mut err);

    succ!($it, tuple_trait_inner!($self, i, ( o ), $error_list, $($id)+))
  });
-  ($it:tt, $self:expr, $input:expr, ($($parsed:tt)*), $error_list:expr, $head:ident $($id:ident)+) => ({
+  ($it:tt, $self:expr_2021, $input:expr_2021, ($($parsed:tt)*), $error_list:expr_2021, $head:ident $($id:ident)+) => ({
    let (i, (o, mut err)) = $self.$it.parse($input.clone())?;
    $error_list.append(&mut err);

    succ!($it, tuple_trait_inner!($self, i, ($($parsed)* , o), $error_list, $($id)+))
  });
-  ($it:tt, $self:expr, $input:expr, ($($parsed:tt)*), $error_list:expr, $head:ident) => ({
+  ($it:tt, $self:expr_2021, $input:expr_2021, ($($parsed:tt)*), $error_list:expr_2021, $head:ident) => ({
    let (i, (o, mut err)) = $self.$it.parse($input.clone())?;
    $error_list.append(&mut err);

@@ -328,13 +344,13 @@ macro_rules! alt_trait_impl(
 );

 macro_rules! alt_trait_inner(
-  ($it:tt, $self:expr, $input:expr, $head_cond:ident $head:ident, $($id_cond:ident $id:ident),+) => (
+  ($it:tt, $self:expr_2021, $input:expr_2021, $head_cond:ident $head:ident, $($id_cond:ident $id:ident),+) => (
    match $self.$it.0.parse($input.clone()) {
      Err(_) => succ!($it, alt_trait_inner!($self, $input, $($id_cond $id),+)),
      Ok((input_left, _)) => Some($self.$it.1.parse(input_left)),
    }
  );
-  ($it:tt, $self:expr, $input:expr, $head_cond:ident $head:ident) => (
+  ($it:tt, $self:expr_2021, $input:expr_2021, $head_cond:ident $head:ident) => (
    None
  );
 );
--- a/query-grammar/src/lib.rs
+++ b/query-grammar/src/lib.rs
@@ -31,7 +31,17 @@ pub fn parse_query_lenient(query: &str) -> (UserInputAst, Vec<LenientError>) {

 #[cfg(test)]
 mod tests {
-    use crate::{parse_query, parse_query_lenient};
+    use crate::{UserInputAst, parse_query, parse_query_lenient};
+
+    #[test]
+    fn test_deduplication() {
+        // The query repeats the term `a`; the serialized AST is expected to
+        // contain a single clause for it.
+        let ast: UserInputAst = parse_query("a a").unwrap();
+        let json = serde_json::to_string(&ast).unwrap();
+        assert_eq!(
+            json,
+            r#"{"type":"bool","clauses":[[null,{"type":"literal","field_name":null,"phrase":"a","delimiter":"none","slop":0,"prefix":false}]]}"#
+        );
+    }

    #[test]
    fn test_parse_query_serialization() {
--- a/query-grammar/src/query_grammar.rs
+++ b/query-grammar/src/query_grammar.rs
@@ -1,6 +1,8 @@
 use std::borrow::Cow;
 use std::iter::once;

+use fnv::FnvHashSet;
+use nom::IResult;
 use nom::branch::alt;
 use nom::bytes::complete::tag;
 use nom::character::complete::{
@@ -10,12 +12,11 @@ use nom::combinator::{eof, map, map_res, opt, peek, recognize, value, verify};
 use nom::error::{Error, ErrorKind};
 use nom::multi::{many0, many1, separated_list0};
 use nom::sequence::{delimited, preceded, separated_pair, terminated, tuple};
-use nom::IResult;

 use super::user_input_ast::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};
+use crate::Occur;
 use crate::infallible::*;
 use crate::user_input_ast::Delimiter;
-use crate::Occur;

 // Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to
 // special characters.
@@ -36,7 +37,7 @@ fn field_name(inp: &str) -> IResult<&str, String> {
                alt((first_char, escape_sequence())),
                many0(alt((simple_char, escape_sequence(), char('\\')))),
            )),
-            char(':'),
+            tuple((multispace0, char(':'), multispace0)),
        ),
        |(first_char, next)| once(first_char).chain(next).collect(),
    )(inp)
@@ -68,7 +69,7 @@ fn interpret_escape(source: &str) -> String {

 /// Consume a word outside of any context.
 // TODO should support escape sequences
-fn word(inp: &str) -> IResult<&str, Cow<str>> {
+fn word(inp: &str) -> IResult<&str, Cow<'_, str>> {
    map_res(
        recognize(tuple((
            alt((
@@ -305,15 +306,14 @@ fn term_group_infallible(inp: &str) -> JResult<&str, UserInputAst> {
    let (inp, (field_name, _, _, _)) =
        tuple((field_name, multispace0, char('('), multispace0))(inp).expect("precondition failed");

-    let res = delimited_infallible(
+    delimited_infallible(
        nothing,
        map(ast_infallible, |(mut ast, errors)| {
            ast.set_default_field(field_name.to_string());
            (ast, errors)
        }),
        opt_i_err(char(')'), "expected ')'"),
-    )(inp);
-    res
+    )(inp)
 }

 fn exists(inp: &str) -> IResult<&str, UserInputLeaf> {
@@ -367,7 +367,10 @@ fn literal(inp: &str) -> IResult<&str, UserInputAst> {
    // something (a field name) got parsed before
    alt((
        map(
-            tuple((opt(field_name), alt((range, set, exists, term_or_phrase)))),
+            tuple((
+                opt(field_name),
+                alt((range, set, exists, regex, term_or_phrase)),
+            )),
            |(field_name, leaf): (Option<String>, UserInputLeaf)| leaf.set_field(field_name).into(),
        ),
        term_group,
@@ -389,6 +392,10 @@ fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>>
                        value((), peek(one_of("{[><"))),
                        map(range_infallible, |(range, errs)| (Some(range), errs)),
                    ),
+                    (
+                        value((), peek(one_of("/"))),
+                        map(regex_infallible, |(regex, errs)| (Some(regex), errs)),
+                    ),
                ),
                delimited_infallible(space0_infallible, term_or_phrase_infallible, nothing),
            ),
@@ -689,6 +696,61 @@ fn set_infallible(mut inp: &str) -> JResult<&str, UserInputLeaf> {
    }
 }

+/// Parses a regex leaf of the form `/pattern/`.
+///
+/// The pattern must contain at least one character (`many1`). Inside the pattern, the
+/// sequence `\/` yields a literal `/`; any other character except `/` is kept as is.
+/// The closing `/` must be followed by whitespace or the end of the input; that trailing
+/// part is only peeked at, not consumed. The returned leaf has `field: None`.
+fn regex(inp: &str) -> IResult<&str, UserInputLeaf> {
+    map(
+        terminated(
+            delimited(
+                char('/'),
+                // `\/` is tried first so that an escaped delimiter does not end the pattern.
+                many1(alt((preceded(char('\\'), char('/')), none_of("/")))),
+                char('/'),
+            ),
+            peek(alt((multispace1, eof))),
+        ),
+        |elements| UserInputLeaf::Regex {
+            field: None,
+            pattern: elements.into_iter().collect::<String>(),
+        },
+    )(inp)
+}
+
+/// Lenient counterpart of `regex`: always returns a `UserInputLeaf::Regex` with `field: None`,
+/// together with a list of errors.
+///
+/// Both `/` delimiters, the pattern itself and the trailing whitespace/end-of-input check
+/// are wrapped in optional combinators. A missing pattern results in an empty `pattern`
+/// string. If the combined parser still returns `Err`, the original input is returned
+/// unconsumed with an empty pattern and a single error built from the parser error.
+fn regex_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
+    match terminated_infallible(
+        delimited_infallible(
+            opt_i_err(char('/'), "missing delimiter /"),
+            // Same pattern grammar as in `regex`: `\/` yields `/`, anything but `/` is kept.
+            opt_i(many1(alt((preceded(char('\\'), char('/')), none_of("/"))))),
+            opt_i_err(char('/'), "missing delimiter /"),
+        ),
+        opt_i_err(
+            peek(alt((multispace1, eof))),
+            "expected whitespace or end of input",
+        ),
+    )(inp)
+    {
+        Ok((rest, (elements_part, errors))) => {
+            // No pattern characters were parsed: fall back to an empty pattern.
+            let pattern = match elements_part {
+                Some(elements_part) => elements_part.into_iter().collect(),
+                None => String::new(),
+            };
+            let res = UserInputLeaf::Regex {
+                field: None,
+                pattern,
+            };
+            Ok((rest, (res, errors)))
+        }
+        Err(e) => {
+            // The error position is recorded as the length of the input handed to this parser.
+            let errs = vec![LenientErrorInternal {
+                pos: inp.len(),
+                message: e.to_string(),
+            }];
+            let res = UserInputLeaf::Regex {
+                field: None,
+                pattern: String::new(),
+            };
+            // `inp` is returned as the remaining input: nothing is consumed on this path.
+            Ok((inp, (res, errs)))
+        }
+    }
+}
+
 fn negate(expr: UserInputAst) -> UserInputAst {
    expr.unary(Occur::MustNot)
 }
@@ -753,7 +815,7 @@ fn boosted_leaf(inp: &str) -> IResult<&str, UserInputAst> {
        tuple((leaf, fallible(boost))),
        |(leaf, boost_opt)| match boost_opt {
            Some(boost) if (boost - 1.0).abs() > f64::EPSILON => {
-                UserInputAst::Boost(Box::new(leaf), boost)
+                UserInputAst::Boost(Box::new(leaf), boost.into())
            }
            _ => leaf,
        },
@@ -765,7 +827,7 @@ fn boosted_leaf_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> {
        tuple_infallible((leaf_infallible, boost)),
        |((leaf, boost_opt), error)| match boost_opt {
            Some(boost) if (boost - 1.0).abs() > f64::EPSILON => (
-                leaf.map(|leaf| UserInputAst::Boost(Box::new(leaf), boost)),
+                leaf.map(|leaf| UserInputAst::Boost(Box::new(leaf), boost.into())),
                error,
            ),
            _ => (leaf, error),
@@ -1016,12 +1078,25 @@ pub fn parse_to_ast_lenient(query_str: &str) -> (UserInputAst, Vec<LenientError>
    (rewrite_ast(res), errors)
 }

-/// Removes unnecessary children clauses in AST
-///
-/// Motivated by [issue #1433](https://github.com/quickwit-oss/tantivy/issues/1433)
 fn rewrite_ast(mut input: UserInputAst) -> UserInputAst {
-    if let UserInputAst::Clause(terms) = &mut input {
-        for term in terms {
+    if let UserInputAst::Clause(sub_clauses) = &mut input {
+        // call rewrite_ast recursively on children clauses if applicable
+        let mut new_clauses = Vec::with_capacity(sub_clauses.len());
+        for (occur, clause) in sub_clauses.drain(..) {
+            let rewritten_clause = rewrite_ast(clause);
+            new_clauses.push((occur, rewritten_clause));
+        }
+        *sub_clauses = new_clauses;
+
+        // remove duplicate child clauses
+        // e.g. (+a +b) OR (+c +d) OR (+a +b)  => (+a +b) OR (+c +d)
+        let mut seen = FnvHashSet::default();
+        sub_clauses.retain(|term| seen.insert(term.clone()));
+
+        // Removes unnecessary children clauses in AST
+        //
+        // Motivated by [issue #1433](https://github.com/quickwit-oss/tantivy/issues/1433)
+        for term in sub_clauses {
            rewrite_ast_clause(term);
        }
    }
@@ -1030,7 +1105,7 @@ fn rewrite_ast(mut input: UserInputAst) -> UserInputAst {

 fn rewrite_ast_clause(input: &mut (Option<Occur>, UserInputAst)) {
    match input {
-        (None, UserInputAst::Clause(ref mut clauses)) if clauses.len() == 1 => {
+        (None, UserInputAst::Clause(clauses)) if clauses.len() == 1 => {
            *input = clauses.pop().unwrap(); // safe because clauses.len() == 1
        }
        _ => {}
@@ -1283,6 +1358,10 @@ mod test {
            super::field_name("~my~field:a"),
            Ok(("a", "~my~field".to_string()))
        );
+        assert_eq!(
+            super::field_name(".my.field.name : a"),
+            Ok(("a", ".my.field.name".to_string()))
+        );
        for special_char in SPECIAL_CHARS.iter() {
            let query = &format!("\\{special_char}my\\{special_char}field:a");
            assert_eq!(
@@ -1376,7 +1455,7 @@ mod test {

    #[test]
    fn test_range_parser_lenient() {
-        let literal = |query| literal_infallible(query).unwrap().1 .0.unwrap();
+        let literal = |query| literal_infallible(query).unwrap().1.0.unwrap();

        // same tests as non-lenient
        let res = literal("title: <hello");
@@ -1689,4 +1768,72 @@ mod test {
    fn test_invalid_field() {
        test_is_parse_err(r#"!bc:def"#, "!bc:def");
    }
+
+    #[test]
+    fn test_regex_parser() {
+        // A field-qualified regex: the text between the slashes becomes the pattern.
+        let r = parse_to_ast(r#"a:/joh?n(ath[oa]n)/"#);
+        assert!(r.is_ok(), "Failed to parse custom query: {r:?}");
+        let (_, input) = r.unwrap();
+        match input {
+            UserInputAst::Leaf(leaf) => match leaf.as_ref() {
+                UserInputLeaf::Regex { field, pattern } => {
+                    assert_eq!(field, &Some("a".to_string()));
+                    assert_eq!(pattern, "joh?n(ath[oa]n)");
+                }
+                _ => panic!("Expected a regex leaf, got {leaf:?}"),
+            },
+            _ => panic!("Expected a leaf"),
+        }
+        // Escaped delimiters: each `\\/` in the raw query string is expected to come out
+        // as `\/` in the pattern.
+        let r = parse_to_ast(r#"a:/\\/cgi-bin\\/luci.*/"#);
+        assert!(r.is_ok(), "Failed to parse custom query: {r:?}");
+        let (_, input) = r.unwrap();
+        match input {
+            UserInputAst::Leaf(leaf) => match leaf.as_ref() {
+                UserInputLeaf::Regex { field, pattern } => {
+                    assert_eq!(field, &Some("a".to_string()));
+                    assert_eq!(pattern, "\\/cgi-bin\\/luci.*");
+                }
+                _ => panic!("Expected a regex leaf, got {leaf:?}"),
+            },
+            _ => panic!("Expected a leaf"),
+        }
+    }
+
+    #[test]
+    fn test_regex_parser_lenient() {
+        let literal = |query| literal_infallible(query).unwrap().1;
+
+        // Well-formed regex: same result as the strict parser, and no errors.
+        let (res, errs) = literal(r#"a:/joh?n(ath[oa]n)/"#);
+        let expected = UserInputLeaf::Regex {
+            field: Some("a".to_string()),
+            pattern: "joh?n(ath[oa]n)".to_string(),
+        }
+        .into();
+        assert_eq!(res.unwrap(), expected);
+        assert!(errs.is_empty(), "Expected no errors, got: {errs:?}");
+
+        // Missing closing `/`: the regex leaf is still produced, along with exactly one
+        // error reporting the missing delimiter.
+        let (res, errs) = literal("title:/joh?n(ath[oa]n)");
+        let expected = UserInputLeaf::Regex {
+            field: Some("title".to_string()),
+            pattern: "joh?n(ath[oa]n)".to_string(),
+        }
+        .into();
+        assert_eq!(res.unwrap(), expected);
+        assert_eq!(errs.len(), 1, "Expected 1 error, got: {errs:?}");
+        assert_eq!(
+            errs[0].message, "missing delimiter /",
+            "Unexpected error message",
+        );
+    }
+
+    #[test]
+    fn test_space_before_value() {
+        // Whitespace before and/or after the `:` separating a field name from its value
+        // is expected to be ignored.
+        test_parse_query_to_ast_helper("field : a", r#""field":a"#);
+        test_parse_query_to_ast_helper("field:    a", r#""field":a"#);
+        test_parse_query_to_ast_helper("field         :a", r#""field":a"#);
+        // Same behaviour inside a compound query, including a quoted value.
+        test_parse_query_to_ast_helper(
+            "field : 'happy tax payer' AND other_field  : 1",
+            r#"(+"field":'happy tax payer' +"other_field":1)"#,
+        );
+    }
 }
--- a/query-grammar/src/user_input_ast.rs
+++ b/query-grammar/src/user_input_ast.rs
@@ -5,7 +5,7 @@ use serde::Serialize;

 use crate::Occur;

-#[derive(PartialEq, Clone, Serialize)]
+#[derive(PartialEq, Eq, Hash, Clone, Serialize)]
 #[serde(tag = "type")]
 #[serde(rename_all = "snake_case")]
 pub enum UserInputLeaf {
@@ -23,6 +23,10 @@ pub enum UserInputLeaf {
    Exists {
        field: String,
    },
+    Regex {
+        field: Option<String>,
+        pattern: String,
+    },
 }

 impl UserInputLeaf {
@@ -46,12 +50,13 @@ impl UserInputLeaf {
            UserInputLeaf::Exists { field: _ } => UserInputLeaf::Exists {
                field: field.expect("Exist query without a field isn't allowed"),
            },
+            UserInputLeaf::Regex { field: _, pattern } => UserInputLeaf::Regex { field, pattern },
        }
    }

    pub(crate) fn set_default_field(&mut self, default_field: String) {
        match self {
-            UserInputLeaf::Literal(ref mut literal) if literal.field_name.is_none() => {
+            UserInputLeaf::Literal(literal) if literal.field_name.is_none() => {
                literal.field_name = Some(default_field)
            }
            UserInputLeaf::All => {
@@ -59,12 +64,8 @@ impl UserInputLeaf {
                    field: default_field,
                }
            }
-            UserInputLeaf::Range { ref mut field, .. } if field.is_none() => {
-                *field = Some(default_field)
-            }
-            UserInputLeaf::Set { ref mut field, .. } if field.is_none() => {
-                *field = Some(default_field)
-            }
+            UserInputLeaf::Range { field, .. } if field.is_none() => *field = Some(default_field),
+            UserInputLeaf::Set { field, .. } if field.is_none() => *field = Some(default_field),
            _ => (), // field was already set, do nothing
        }
    }
@@ -75,11 +76,11 @@ impl Debug for UserInputLeaf {
        match self {
            UserInputLeaf::Literal(literal) => literal.fmt(formatter),
            UserInputLeaf::Range {
-                ref field,
-                ref lower,
-                ref upper,
+                field,
+                lower,
+                upper,
            } => {
-                if let Some(ref field) = field {
+                if let Some(field) = field {
                    // TODO properly escape field (in case of \")
                    write!(formatter, "\"{field}\":")?;
                }
@@ -89,7 +90,7 @@ impl Debug for UserInputLeaf {
                Ok(())
            }
            UserInputLeaf::Set { field, elements } => {
-                if let Some(ref field) = field {
+                if let Some(field) = field {
                    // TODO properly escape field (in case of \")
                    write!(formatter, "\"{field}\": ")?;
                }
@@ -107,11 +108,19 @@ impl Debug for UserInputLeaf {
            UserInputLeaf::Exists { field } => {
                write!(formatter, "$exists(\"{field}\")")
            }
+            UserInputLeaf::Regex { field, pattern } => {
+                if let Some(field) = field {
+                    // TODO properly escape field (in case of \")
+                    write!(formatter, "\"{field}\":")?;
+                }
+                // TODO properly escape pattern (in case of /)
+                write!(formatter, "/{pattern}/")
+            }
        }
    }
 }

-#[derive(Copy, Clone, Eq, PartialEq, Debug, Serialize)]
+#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug, Serialize)]
 #[serde(rename_all = "snake_case")]
 pub enum Delimiter {
    SingleQuotes,
@@ -119,7 +128,7 @@ pub enum Delimiter {
    None,
 }

-#[derive(PartialEq, Clone, Serialize)]
+#[derive(PartialEq, Eq, Hash, Clone, Serialize)]
 #[serde(rename_all = "snake_case")]
 pub struct UserInputLiteral {
    pub field_name: Option<String>,
@@ -158,7 +167,7 @@ impl fmt::Debug for UserInputLiteral {
    }
 }

-#[derive(PartialEq, Debug, Clone, Serialize)]
+#[derive(PartialEq, Eq, Hash, Debug, Clone, Serialize)]
 #[serde(tag = "type", content = "value")]
 #[serde(rename_all = "snake_case")]
 pub enum UserInputBound {
@@ -195,11 +204,11 @@ impl UserInputBound {
    }
 }

-#[derive(PartialEq, Clone, Serialize)]
+#[derive(PartialEq, Eq, Hash, Clone, Serialize)]
 #[serde(into = "UserInputAstSerde")]
 pub enum UserInputAst {
    Clause(Vec<(Option<Occur>, UserInputAst)>),
-    Boost(Box<UserInputAst>, f64),
+    Boost(Box<UserInputAst>, ordered_float::OrderedFloat<f64>),
    Leaf(Box<UserInputLeaf>),
 }

@@ -221,9 +230,10 @@ impl From<UserInputAst> for UserInputAstSerde {
    fn from(ast: UserInputAst) -> Self {
        match ast {
            UserInputAst::Clause(clause) => UserInputAstSerde::Bool { clauses: clause },
-            UserInputAst::Boost(underlying, boost) => {
-                UserInputAstSerde::Boost { underlying, boost }
-            }
+            UserInputAst::Boost(underlying, boost) => UserInputAstSerde::Boost {
+                underlying,
+                boost: boost.into_inner(),
+            },
            UserInputAst::Leaf(leaf) => UserInputAstSerde::Leaf(leaf),
        }
    }
@@ -267,7 +277,7 @@ impl UserInputAst {
                .iter_mut()
                .for_each(|(_, ast)| ast.set_default_field(field.clone())),
            UserInputAst::Leaf(leaf) => leaf.set_default_field(field),
-            UserInputAst::Boost(ref mut ast, _) => ast.set_default_field(field),
+            UserInputAst::Boost(ast, _) => ast.set_default_field(field),
        }
    }
 }
@@ -382,7 +392,7 @@ mod tests {
    #[test]
    fn test_boost_serialization() {
        let inner_ast = UserInputAst::Leaf(Box::new(UserInputLeaf::All));
-        let boost_ast = UserInputAst::Boost(Box::new(inner_ast), 2.5);
+        let boost_ast = UserInputAst::Boost(Box::new(inner_ast), 2.5.into());
        let json = serde_json::to_string(&boost_ast).unwrap();
        assert_eq!(
            json,
@@ -409,7 +419,7 @@ mod tests {
                    }))),
                ),
            ])),
-            2.5,
+            2.5.into(),
        );
        let json = serde_json::to_string(&boost_ast).unwrap();
        assert_eq!(
--- a/src/aggregation/bucket/histogram/histogram.rs
+++ b/src/aggregation/bucket/histogram/histogram.rs
@@ -301,7 +301,7 @@ impl SegmentAggregationCollector for SegmentHistogramCollector {
        let bounds = self.bounds;
        let interval = self.interval;
        let offset = self.offset;
-        let get_bucket_pos = |val| (get_bucket_pos_f64(val, interval, offset) as i64);
+        let get_bucket_pos = |val| get_bucket_pos_f64(val, interval, offset) as i64;

        bucket_agg_accessor
            .column_block_accessor
--- a/src/aggregation/bucket/term_agg.rs
+++ b/src/aggregation/bucket/term_agg.rs
@@ -518,7 +518,7 @@ impl SegmentTermCollector {
                |term| {
                    let entry = entries[idx];
                    let intermediate_entry = into_intermediate_bucket_entry(entry.0, entry.1)
-                        .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?;
+                        .map_err(io::Error::other)?;
                    dict.insert(
                        IntermediateKey::Str(
                            String::from_utf8(term.to_vec()).expect("could not convert to String"),
--- a/src/aggregation/metric/top_hits.rs
+++ b/src/aggregation/metric/top_hits.rs
@@ -229,6 +229,7 @@ impl TopHitsAggregationReq {
        self.sort
            .iter()
            .map(|KeyOrder { field, .. }| field.as_str())
+            .chain(self.doc_value_fields.iter().map(|s| s.as_str()))
            .collect()
    }

--- a/src/collector/facet_collector.rs
+++ b/src/collector/facet_collector.rs
@@ -484,7 +484,6 @@ impl FacetCounts {
 #[cfg(test)]
 mod tests {
    use std::collections::BTreeSet;
-    use std::iter;

    use columnar::Dictionary;
    use rand::distributions::Uniform;
@@ -739,7 +738,7 @@ mod tests {
                .flat_map(|(c, count)| {
                    let facet = Facet::from(&format!("/facet/{c}"));
                    let doc = doc!(facet_field => facet);
-                    iter::repeat(doc).take(count)
+                    std::iter::repeat_n(doc, count)
                })
                .map(|mut doc| {
                    doc.add_facet(
@@ -787,7 +786,7 @@ mod tests {
            .flat_map(|(c, count)| {
                let facet = Facet::from(&format!("/facet/{c}"));
                let doc = doc!(facet_field => facet);
-                iter::repeat(doc).take(count)
+                std::iter::repeat_n(doc, count)
            })
            .collect();

--- a/src/collector/top_score_collector.rs
+++ b/src/collector/top_score_collector.rs
@@ -2,11 +2,13 @@ use std::fmt;
 use std::marker::PhantomData;
 use std::sync::Arc;

-use columnar::ColumnValues;
+use columnar::{ColumnValues, StrColumn};
 use serde::{Deserialize, Serialize};

 use super::Collector;
-use crate::collector::custom_score_top_collector::CustomScoreTopCollector;
+use crate::collector::custom_score_top_collector::{
+    CustomScoreTopCollector, CustomScoreTopSegmentCollector,
+};
 use crate::collector::top_collector::{ComparableDoc, TopCollector, TopSegmentCollector};
 use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector;
 use crate::collector::{
@@ -14,6 +16,7 @@ use crate::collector::{
 };
 use crate::fastfield::{FastFieldNotAvailableError, FastValue};
 use crate::query::Weight;
+use crate::termdict::TermOrdinal;
 use crate::{DocAddress, DocId, Order, Score, SegmentOrdinal, SegmentReader, TantivyError};

 struct FastFieldConvertCollector<
@@ -83,6 +86,163 @@ where
    }
 }

+struct StringConvertCollector {
+    pub collector: CustomScoreTopCollector<ScorerByField, u64>, // inner top-K collector, u64 keys
+    pub field: String, // name of the string fast field to sort by
+    order: Order, // requested sort direction
+    limit: usize, // page size; merge_fruits keeps `limit + offset` entries
+    offset: usize, // number of leading merged results to skip
+}
+
+impl Collector for StringConvertCollector {
+    type Fruit = Vec<(String, DocAddress)>; // (term string, doc address) pairs
+
+    type Child = StringConvertSegmentCollector;
+
+    fn for_segment(
+        &self,
+        segment_local_id: crate::SegmentOrdinal,
+        segment: &SegmentReader,
+    ) -> crate::Result<Self::Child> {
+        let schema = segment.schema();
+        let field = schema.get_field(&self.field)?;
+        let field_entry = schema.get_field_entry(field);
+        if !field_entry.is_fast() { // the field must be a fast field
+            return Err(TantivyError::SchemaError(format!(
+                "Field {:?} is not a fast field.",
+                field_entry.name()
+            )));
+        }
+        let requested_type = crate::schema::Type::Str;
+        let schema_type = field_entry.field_type().value_type();
+        if schema_type != requested_type { // the field must be of type Str
+            return Err(TantivyError::SchemaError(format!(
+                "Field {:?} is of type {schema_type:?}!={requested_type:?}",
+                field_entry.name()
+            )));
+        }
+        let ff = segment
+            .fast_fields()
+            .str(&self.field)?
+            .expect("ff should be a str field"); // NOTE(review): panics on None; confirm unreachable
+        Ok(StringConvertSegmentCollector {
+            collector: self.collector.for_segment(segment_local_id, segment)?,
+            ff,
+            order: self.order.clone(),
+        })
+    }
+
+    fn requires_scoring(&self) -> bool {
+        self.collector.requires_scoring() // defer to the wrapped collector
+    }
+
+    fn merge_fruits(
+        &self,
+        child_fruits: Vec<<Self::Child as SegmentCollector>::Fruit>,
+    ) -> crate::Result<Self::Fruit> {
+        if self.limit == 0 { // nothing requested, nothing to merge
+            return Ok(Vec::new());
+        }
+        if self.order.is_desc() {
+            let mut top_collector: TopNComputer<_, _, true> =
+                TopNComputer::new(self.limit + self.offset); // room for the skipped prefix too
+            for child_fruit in child_fruits {
+                for (feature, doc) in child_fruit {
+                    top_collector.push(feature, doc);
+                }
+            }
+            Ok(top_collector
+                .into_sorted_vec()
+                .into_iter()
+                .skip(self.offset) // drop the first `offset` merged results
+                .map(|cdoc| (cdoc.feature, cdoc.doc))
+                .collect())
+        } else { // same merge, using the `false` TopNComputer variant
+            let mut top_collector: TopNComputer<_, _, false> =
+                TopNComputer::new(self.limit + self.offset); // room for the skipped prefix too
+            for child_fruit in child_fruits {
+                for (feature, doc) in child_fruit {
+                    top_collector.push(feature, doc);
+                }
+            }
+
+            Ok(top_collector
+                .into_sorted_vec()
+                .into_iter()
+                .skip(self.offset) // drop the first `offset` merged results
+                .map(|cdoc| (cdoc.feature, cdoc.doc))
+                .collect())
+        }
+    }
+}
+
+struct StringConvertSegmentCollector {
+    pub collector: CustomScoreTopSegmentCollector<ScorerByFastFieldReader, u64>,
+    ff: StrColumn, // str fast-field column; its dictionary resolves ordinals to terms
+    order: Order, // sort direction; selects how harvested keys are decoded
+}
+
+impl SegmentCollector for StringConvertSegmentCollector {
+    type Fruit = Vec<(String, DocAddress)>;
+
+    fn collect(&mut self, doc: DocId, score: Score) {
+        self.collector.collect(doc, score); // delegate to the wrapped u64 collector
+    }
+
+    fn harvest(self) -> Vec<(String, DocAddress)> {
+        let top_ordinals: Vec<(TermOrdinal, DocAddress)> = self.collector.harvest();
+
+        // Resolve every harvested key to its term string through the column dictionary.
+        let mut terms: Vec<String> = Vec::with_capacity(top_ordinals.len());
+        let result = if self.order.is_asc() { // asc: each key is decoded as u64::MAX - key
+            self.ff.dictionary().sorted_ords_to_term_cb(
+                top_ordinals.iter().map(|(term_ord, _)| u64::MAX - term_ord),
+                |term| {
+                    terms.push(
+                        std::str::from_utf8(term)
+                            .expect("Failed to decode term as unicode")
+                            .to_owned(),
+                    );
+                    Ok(())
+                },
+            )
+        } else { // desc: keys are used as-is, visited in reverse order
+            self.ff.dictionary().sorted_ords_to_term_cb(
+                top_ordinals.iter().rev().map(|(term_ord, _)| *term_ord),
+                |term| {
+                    terms.push(
+                        std::str::from_utf8(term)
+                            .expect("Failed to decode term as unicode")
+                            .to_owned(),
+                    );
+                    Ok(())
+                },
+            )
+        };
+
+        assert!(
+            result.expect("Failed to read terms from term dictionary"),
+            "Not all terms were matched in segment."
+        );
+
+        // Zip terms back with their docs (desc: reversed again to undo the `rev()` above).
+        if self.order.is_asc() {
+            terms
+                .into_iter()
+                .zip(top_ordinals)
+                .map(|(term, (_, doc))| (term, doc))
+                .collect()
+        } else {
+            terms
+                .into_iter()
+                .rev()
+                .zip(top_ordinals)
+                .map(|(term, (_, doc))| (term, doc))
+                .collect()
+        }
+    }
+}
+
 /// The `TopDocs` collector keeps track of the top `K` documents
 /// sorted by their score.
 ///
@@ -410,6 +570,30 @@ impl TopDocs {
        }
    }

+    /// Like `order_by_fast_field`, but for a `String` fast field; yields `(term, DocAddress)` pairs.
+    pub fn order_by_string_fast_field(
+        self,
+        fast_field: impl ToString,
+        order: Order,
+    ) -> impl Collector<Fruit = Vec<(String, DocAddress)>> {
+        let limit = self.0.limit; // read now; `self.0` is handed to `into_tscore()` below
+        let offset = self.0.offset;
+        let u64_collector = CustomScoreTopCollector::new(
+            ScorerByField {
+                field: fast_field.to_string(),
+                order: order.clone(),
+            },
+            self.0.into_tscore(),
+        );
+        StringConvertCollector { // wraps the u64 collector; its keys are resolved to strings
+            collector: u64_collector,
+            field: fast_field.to_string(),
+            order,
+            limit,
+            offset,
+        }
+    }
+
    /// Ranks the documents using a custom score.
    ///
    /// This method offers a convenient way to tweak or replace
@@ -1109,6 +1293,220 @@ mod tests {
        assert_eq!(page_0, &page_2[..page_0.len()]);
    }

+    proptest! {
+        #![proptest_config(ProptestConfig::with_cases(20))]
+        /// Build multiple segments with equal-scoring docs and verify stable ordering
+        /// across pages when increasing limit or offset.
+        #[test]
+        fn proptest_stable_ordering_across_segments_with_pagination(
+            docs_per_segment in proptest::collection::vec(1usize..50, 2..5)
+        ) {
+            use crate::indexer::NoMergePolicy;
+
+            // Build an index with multiple segments; all docs will have the same score using AllQuery.
+            let mut schema_builder = Schema::builder();
+            let text = schema_builder.add_text_field("text", TEXT);
+            let schema = schema_builder.build();
+            let index = Index::create_in_ram(schema);
+            let mut writer = index.writer_for_tests().unwrap();
+            writer.set_merge_policy(Box::new(NoMergePolicy)); // presumably keeps commits unmerged
+
+            for num_docs in &docs_per_segment {
+                for _ in 0..*num_docs {
+                    writer.add_document(doc!(text => "x")).unwrap();
+                }
+                writer.commit().unwrap(); // one commit per entry of docs_per_segment
+            }
+
+            let reader = index.reader().unwrap();
+            let searcher = reader.searcher();
+
+            let total_docs: usize = docs_per_segment.iter().sum();
+            // Full result set, first assert all scores are identical.
+            let full_with_scores: Vec<(Score, DocAddress)> = searcher
+                .search(&AllQuery, &TopDocs::with_limit(total_docs))
+                .unwrap();
+            // Sanity: at least one document was returned.
+            prop_assert!(!full_with_scores.is_empty());
+            let first_score = full_with_scores[0].0;
+            prop_assert!(full_with_scores.iter().all(|(score, _)| *score == first_score));
+
+            // Keep only the addresses for the remaining checks.
+            let full: Vec<DocAddress> = full_with_scores
+                .into_iter()
+                .map(|(_score, addr)| addr)
+                .collect();
+
+            // Sanity: we actually created multiple segments and have documents.
+            prop_assert!(docs_per_segment.len() >= 2);
+            prop_assert!(total_docs >= 2);
+
+            // 1) Increasing limit should preserve prefix ordering.
+            for k in 1..=total_docs {
+                let page: Vec<DocAddress> = searcher
+                    .search(&AllQuery, &TopDocs::with_limit(k))
+                    .unwrap()
+                    .into_iter()
+                    .map(|(_score, addr)| addr)
+                    .collect();
+                prop_assert_eq!(page, full[..k].to_vec());
+            }
+
+            // 2) Offset + limit pages should always match the corresponding slice.
+            //    For each offset, check three representative page sizes:
+            //    - first page (size 1)
+            //    - a middle page (roughly half of remaining)
+            //    - the last page (size = remaining)
+            for offset in 0..total_docs {
+                let remaining = total_docs - offset;
+
+                let assert_page_eq = |limit: usize| -> proptest::test_runner::TestCaseResult {
+                    let page: Vec<DocAddress> = searcher
+                        .search(&AllQuery, &TopDocs::with_limit(limit).and_offset(offset))
+                        .unwrap()
+                        .into_iter()
+                        .map(|(_score, addr)| addr)
+                        .collect();
+                    prop_assert_eq!(page, full[offset..offset + limit].to_vec());
+                    Ok(())
+                };
+
+                // Smallest page.
+                assert_page_eq(1)?;
+                // A middle-sized page (clamped to 1 when remaining == 1).
+                assert_page_eq((remaining / 2).max(1))?;
+                // Largest page for this offset.
+                assert_page_eq(remaining)?;
+            }
+
+            // 3) Concatenating fixed-size pages by offset reproduces the full order.
+            for page_size in 1..=total_docs.min(5) {
+                let mut concat: Vec<DocAddress> = Vec::new();
+                let mut offset = 0;
+                while offset < total_docs {
+                    let size = page_size.min(total_docs - offset); // last page may be shorter
+                    let page: Vec<DocAddress> = searcher
+                        .search(&AllQuery, &TopDocs::with_limit(size).and_offset(offset))
+                        .unwrap()
+                        .into_iter()
+                        .map(|(_score, addr)| addr)
+                        .collect();
+                    concat.extend(page);
+                    offset += size;
+                }
+                // Clone so `full` is not moved and stays usable in later loop iterations.
+                prop_assert_eq!(concat, full.clone());
+            }
+        }
+    }
+
+    proptest! {
+        #![proptest_config(ProptestConfig::with_cases(20))]
+        /// Build multiple segments with same-scoring term matches and verify stable ordering
+        /// across pages for a real scoring query (TermQuery with identical TF and fieldnorm).
+        #[test]
+        fn proptest_stable_ordering_across_segments_with_term_query_and_pagination(
+            docs_per_segment in proptest::collection::vec(1usize..50, 2..5)
+        ) {
+            use crate::indexer::NoMergePolicy;
+            use crate::schema::IndexRecordOption;
+            use crate::query::TermQuery;
+            use crate::Term;
+
+            // Build an index with multiple segments; each doc has exactly one token "x",
+            // ensuring equal BM25 scores across all matching docs (same TF=1 and fieldnorm=1).
+            let mut schema_builder = Schema::builder();
+            let text = schema_builder.add_text_field("text", TEXT);
+            let schema = schema_builder.build();
+            let index = Index::create_in_ram(schema);
+            let mut writer = index.writer_for_tests().unwrap();
+            writer.set_merge_policy(Box::new(NoMergePolicy)); // presumably keeps commits unmerged
+
+            for num_docs in &docs_per_segment {
+                for _ in 0..*num_docs {
+                    writer.add_document(doc!(text => "x")).unwrap();
+                }
+                writer.commit().unwrap(); // one commit per entry of docs_per_segment
+            }
+
+            let reader = index.reader().unwrap();
+            let searcher = reader.searcher();
+
+            let total_docs: usize = docs_per_segment.iter().sum();
+            let term = Term::from_field_text(text, "x");
+            let tq = TermQuery::new(term, IndexRecordOption::WithFreqs);
+
+            // Full result set, first assert all scores are identical across docs.
+            let full_with_scores: Vec<(Score, DocAddress)> = searcher
+                .search(&tq, &TopDocs::with_limit(total_docs))
+                .unwrap();
+            // Sanity: at least one document was returned.
+            prop_assert!(!full_with_scores.is_empty());
+            let first_score = full_with_scores[0].0;
+            prop_assert!(full_with_scores.iter().all(|(score, _)| *score == first_score));
+
+            // Keep only the addresses for the remaining checks.
+            let full: Vec<DocAddress> = full_with_scores
+                .into_iter()
+                .map(|(_score, addr)| addr)
+                .collect();
+
+            // Sanity: we actually created multiple segments and have documents.
+            prop_assert!(docs_per_segment.len() >= 2);
+            prop_assert!(total_docs >= 2);
+
+            // 1) Increasing limit should preserve prefix ordering.
+            for k in 1..=total_docs {
+                let page: Vec<DocAddress> = searcher
+                    .search(&tq, &TopDocs::with_limit(k))
+                    .unwrap()
+                    .into_iter()
+                    .map(|(_score, addr)| addr)
+                    .collect();
+                prop_assert_eq!(page, full[..k].to_vec());
+            }
+
+            // 2) Offset + limit pages should always match the corresponding slice.
+            //    Check three representative page sizes for each offset: 1, ~half, and remaining.
+            for offset in 0..total_docs {
+                let remaining = total_docs - offset;
+
+                let assert_page_eq = |limit: usize| -> proptest::test_runner::TestCaseResult {
+                    let page: Vec<DocAddress> = searcher
+                        .search(&tq, &TopDocs::with_limit(limit).and_offset(offset))
+                        .unwrap()
+                        .into_iter()
+                        .map(|(_score, addr)| addr)
+                        .collect();
+                    prop_assert_eq!(page, full[offset..offset + limit].to_vec());
+                    Ok(())
+                };
+
+                assert_page_eq(1)?; // smallest page
+                assert_page_eq((remaining / 2).max(1))?; // middle page, at least 1
+                assert_page_eq(remaining)?; // largest page for this offset
+            }
+
+            // 3) Concatenating fixed-size pages by offset reproduces the full order.
+            for page_size in 1..=total_docs.min(5) {
+                let mut concat: Vec<DocAddress> = Vec::new();
+                let mut offset = 0;
+                while offset < total_docs {
+                    let size = page_size.min(total_docs - offset); // last page may be shorter
+                    let page: Vec<DocAddress> = searcher
+                        .search(&tq, &TopDocs::with_limit(size).and_offset(offset))
+                        .unwrap()
+                        .into_iter()
+                        .map(|(_score, addr)| addr)
+                        .collect();
+                    concat.extend(page);
+                    offset += size;
+                }
+                prop_assert_eq!(concat, full.clone());
+            }
+        }
+    }
+
    #[test]
    #[should_panic]
    fn test_top_0() {
@@ -1257,6 +1655,160 @@ mod tests {
        Ok(())
    }

+    #[test]
+    fn test_top_field_collector_string() -> crate::Result<()> {
+        let mut schema_builder = Schema::builder();
+        let city = schema_builder.add_text_field("city", TEXT | FAST);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        let mut index_writer = index.writer_for_tests()?;
+        index_writer.add_document(doc!(
+                city => "austin",
+        ))?;
+        index_writer.add_document(doc!(
+                city => "greenville",
+        ))?;
+        index_writer.add_document(doc!(
+            city => "tokyo",
+        ))?;
+        index_writer.commit()?; // single commit: the expectations below all use segment 0
+
+        fn query(
+            index: &Index,
+            order: Order,
+            limit: usize,
+            offset: usize,
+        ) -> crate::Result<Vec<(String, DocAddress)>> {
+            let searcher = index.reader()?.searcher(); // fresh reader on every call
+            let top_collector = TopDocs::with_limit(limit)
+                .and_offset(offset)
+                .order_by_string_fast_field("city", order);
+            searcher.search(&AllQuery, &top_collector)
+        }
+
+        assert_eq!(
+            &query(&index, Order::Desc, 3, 0)?,
+            &[
+                ("tokyo".to_owned(), DocAddress::new(0, 2)),
+                ("greenville".to_owned(), DocAddress::new(0, 1)),
+                ("austin".to_owned(), DocAddress::new(0, 0)),
+            ]
+        );
+
+        assert_eq!(
+            &query(&index, Order::Desc, 2, 0)?,
+            &[
+                ("tokyo".to_owned(), DocAddress::new(0, 2)),
+                ("greenville".to_owned(), DocAddress::new(0, 1)),
+            ]
+        );
+
+        assert_eq!(&query(&index, Order::Desc, 3, 3)?, &[]); // offset == doc count: empty page
+
+        assert_eq!(
+            &query(&index, Order::Desc, 2, 1)?,
+            &[
+                ("greenville".to_owned(), DocAddress::new(0, 1)),
+                ("austin".to_owned(), DocAddress::new(0, 0)),
+            ]
+        );
+
+        assert_eq!(
+            &query(&index, Order::Asc, 3, 0)?,
+            &[
+                ("austin".to_owned(), DocAddress::new(0, 0)),
+                ("greenville".to_owned(), DocAddress::new(0, 1)),
+                ("tokyo".to_owned(), DocAddress::new(0, 2)),
+            ]
+        );
+
+        assert_eq!(
+            &query(&index, Order::Asc, 2, 1)?,
+            &[
+                ("greenville".to_owned(), DocAddress::new(0, 1)),
+                ("tokyo".to_owned(), DocAddress::new(0, 2)),
+            ]
+        );
+
+        assert_eq!(
+            &query(&index, Order::Asc, 2, 0)?,
+            &[
+                ("austin".to_owned(), DocAddress::new(0, 0)),
+                ("greenville".to_owned(), DocAddress::new(0, 1)),
+            ]
+        );
+
+        assert_eq!(&query(&index, Order::Asc, 3, 3)?, &[]); // offset == doc count: empty page
+
+        Ok(())
+    }
+
+    proptest! {
+        #[test]
+        fn test_top_field_collect_string_prop(
+          order in prop_oneof!(Just(Order::Desc), Just(Order::Asc)),
+          limit in 1..256_usize,
+          offset in 0..256_usize,
+          segments_terms in
+            proptest::collection::vec(
+                proptest::collection::vec(0..32_u8, 1..32_usize),
+                0..8_usize,
+            )
+        ) {
+            let mut schema_builder = Schema::builder();
+            let city = schema_builder.add_text_field("city", TEXT | FAST);
+            let schema = schema_builder.build();
+            let index = Index::create_in_ram(schema);
+            let mut index_writer = index.writer_for_tests()?;
+
+            // A Vec<Vec<u8>>, where the outer Vec represents segments, and the inner Vec
+            // represents terms.
+            for segment_terms in segments_terms.into_iter() {
+                for term in segment_terms.into_iter() {
+                    let term = format!("{term:0>3}"); // zero-pad to width 3
+                    index_writer.add_document(doc!(
+                        city => term,
+                    ))?;
+                }
+                index_writer.commit()?; // one commit per inner Vec
+            }
+
+            let searcher = index.reader()?.searcher();
+            let top_n_results = searcher.search(&AllQuery, &TopDocs::with_limit(limit)
+                .and_offset(offset)
+                .order_by_string_fast_field("city", order.clone()))?;
+            let all_results = searcher.search(&AllQuery, &DocSetCollector)?.into_iter().map(|doc_address| {
+                // Get the term for this address.
+                // NOTE: We can't determine the SegmentIds that will be generated for Segments
+                // ahead of time, so we can't pre-compute the expected `DocAddress`es.
+                let column = searcher.segment_readers()[doc_address.segment_ord as usize].fast_fields().str("city").unwrap().unwrap();
+                let term_ord = column.term_ords(doc_address.doc_id).next().unwrap();
+                let mut city = Vec::new();
+                column.dictionary().ord_to_term(term_ord, &mut city).unwrap();
+                (String::try_from(city).unwrap(), doc_address)
+            });
+
+            // Using the TopDocs collector should always be equivalent to sorting, skipping the
+            // offset, and then taking the limit.
+            let sorted_docs: Vec<_> = if order.is_desc() {
+                let mut comparable_docs: Vec<ComparableDoc<_, _, true>> =
+                    all_results.into_iter().map(|(feature, doc)| ComparableDoc { feature, doc}).collect();
+                comparable_docs.sort();
+                comparable_docs.into_iter().map(|cd| (cd.feature, cd.doc)).collect()
+            } else {
+                let mut comparable_docs: Vec<ComparableDoc<_, _, false>> =
+                    all_results.into_iter().map(|(feature, doc)| ComparableDoc { feature, doc}).collect();
+                comparable_docs.sort();
+                comparable_docs.into_iter().map(|cd| (cd.feature, cd.doc)).collect()
+            };
+            let expected_docs = sorted_docs.into_iter().skip(offset).take(limit).collect::<Vec<_>>();
+            prop_assert_eq!(
+                expected_docs,
+                top_n_results
+            );
+        }
+    }
+
    #[test]
    #[should_panic]
    fn test_field_does_not_exist() {
--- a/src/compat_tests.rs
+++ b/src/compat_tests.rs
@@ -30,7 +30,7 @@ fn create_format() {
 }

 fn path_for_version(version: &str) -> String {
-    format!("./tests/compat_tests_data/index_v{}/", version)
+    format!("./tests/compat_tests_data/index_v{version}/")
 }

 /// feature flag quickwit uses a different dictionary type
--- a/src/core/executor.rs
+++ b/src/core/executor.rs
@@ -65,8 +65,7 @@ impl Executor {
                                if let Err(err) = fruit_sender_ref.send((idx, fruit)) {
                                    error!(
                                        "Failed to send search task. It probably means all search \
-                                         threads have panicked. {:?}",
-                                        err
+                                         threads have panicked. {err:?}"
                                    );
                                }
                            });
--- a/src/core/json_utils.rs
+++ b/src/core/json_utils.rs
@@ -1,3 +1,4 @@
+use columnar::NumericalValue;
 use common::json_path_writer::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP};
 use common::{replace_in_place, JsonPathWriter};
 use rustc_hash::FxHashMap;
@@ -152,7 +153,7 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
                if let Ok(i64_val) = val.try_into() {
                    term_buffer.append_type_and_fast_value::<i64>(i64_val);
                } else {
-                    term_buffer.append_type_and_fast_value(val);
+                    term_buffer.append_type_and_fast_value::<u64>(val);
                }
                postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
            }
@@ -166,12 +167,30 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
                postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
            }
            ReferenceValueLeaf::F64(val) => {
+                if !val.is_finite() {
+                    return;
+                };
                set_path_id(
                    term_buffer,
                    ctx.path_to_unordered_id
                        .get_or_allocate_unordered_id(json_path_writer.as_str()),
                );
-                term_buffer.append_type_and_fast_value(val);
+                // Normalizing here is important.
+                // In the inverted index, we coerce all numerical values to their canonical
+                // representation.
+                //
+                // (We do the same thing on the query side)
+                match NumericalValue::F64(val).normalize() {
+                    NumericalValue::I64(val_i64) => {
+                        term_buffer.append_type_and_fast_value::<i64>(val_i64);
+                    }
+                    NumericalValue::U64(val_u64) => {
+                        term_buffer.append_type_and_fast_value::<u64>(val_u64);
+                    }
+                    NumericalValue::F64(val_f64) => {
+                        term_buffer.append_type_and_fast_value::<f64>(val_f64);
+                    }
+                }
                postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
            }
            ReferenceValueLeaf::Bool(val) => {
@@ -241,8 +260,8 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
 ///
 /// The term must be json + JSON path.
 pub fn convert_to_fast_value_and_append_to_json_term(
-    mut term: Term,
-    phrase: &str,
+    term: &Term,
+    text: &str,
    truncate_date_for_search: bool,
 ) -> Option<Term> {
    assert_eq!(
@@ -254,31 +273,50 @@ pub fn convert_to_fast_value_and_append_to_json_term(
        0,
        "JSON value bytes should be empty"
    );
-    if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) {
-        let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC));
-        if truncate_date_for_search {
-            dt = dt.truncate(DATE_TIME_PRECISION_INDEXED);
+    try_convert_to_datetime_and_append_to_json_term(term, text, truncate_date_for_search)
+        .or_else(|| try_convert_to_number_and_append_to_json_term(term, text))
+        .or_else(|| try_convert_to_bool_and_append_to_json_term_typed(term, text))
+}
+
+fn try_convert_to_datetime_and_append_to_json_term(
+    term: &Term,
+    text: &str,
+    truncate_date_for_search: bool,
+) -> Option<Term> {
+    let dt = OffsetDateTime::parse(text, &Rfc3339).ok()?;
+    let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC));
+    if truncate_date_for_search {
+        dt = dt.truncate(DATE_TIME_PRECISION_INDEXED);
+    }
+    let mut term_clone = term.clone();
+    term_clone.append_type_and_fast_value(dt);
+    Some(term_clone)
+}
+
+fn try_convert_to_number_and_append_to_json_term(term: &Term, text: &str) -> Option<Term> {
+    let numerical_value: NumericalValue = str::parse::<NumericalValue>(text).ok()?;
+    let mut term_clone = term.clone();
+    // Parse is actually returning normalized values already today, but let's not
+    // rely on that hidden contract.
+    match numerical_value.normalize() {
+        NumericalValue::I64(i64_value) => {
+            term_clone.append_type_and_fast_value::<i64>(i64_value);
+        }
+        NumericalValue::U64(u64_value) => {
+            term_clone.append_type_and_fast_value::<u64>(u64_value);
+        }
+        NumericalValue::F64(f64_value) => {
+            term_clone.append_type_and_fast_value::<f64>(f64_value);
        }
-        term.append_type_and_fast_value(dt);
-        return Some(term);
    }
-    if let Ok(i64_val) = str::parse::<i64>(phrase) {
-        term.append_type_and_fast_value(i64_val);
-        return Some(term);
-    }
-    if let Ok(u64_val) = str::parse::<u64>(phrase) {
-        term.append_type_and_fast_value(u64_val);
-        return Some(term);
-    }
-    if let Ok(f64_val) = str::parse::<f64>(phrase) {
-        term.append_type_and_fast_value(f64_val);
-        return Some(term);
-    }
-    if let Ok(bool_val) = str::parse::<bool>(phrase) {
-        term.append_type_and_fast_value(bool_val);
-        return Some(term);
-    }
-    None
+    Some(term_clone)
+}
+
+fn try_convert_to_bool_and_append_to_json_term_typed(term: &Term, text: &str) -> Option<Term> {
+    let val = str::parse::<bool>(text).ok()?;
+    let mut term_clone = term.clone();
+    term_clone.append_type_and_fast_value(val);
+    Some(term_clone)
 }

 /// Splits a json path supplied to the query parser in such a way that
--- a/src/core/searcher.rs
+++ b/src/core/searcher.rs
@@ -214,7 +214,7 @@ impl Searcher {
    /// It is powerless at making search faster if your index consists in
    /// one large segment.
    ///
-    /// Also, keep in my multithreading a single query on several
+    /// Also, keep in mind multithreading a single query on several
    /// threads will not improve your throughput. It can actually
    /// hurt it. It will however, decrease the average response time.
    pub fn search_with_executor<C: Collector>(
--- a/src/directory/directory.rs
+++ b/src/directory/directory.rs
@@ -56,7 +56,7 @@ impl<T: Send + Sync + 'static> From<Box<T>> for DirectoryLock {
 impl Drop for DirectoryLockGuard {
    fn drop(&mut self) {
        if let Err(e) = self.directory.delete(&self.path) {
-            error!("Failed to remove the lock file. {:?}", e);
+            error!("Failed to remove the lock file. {e:?}");
        }
    }
 }
--- a/src/directory/file_watcher.rs
+++ b/src/directory/file_watcher.rs
@@ -51,7 +51,7 @@ impl FileWatcher {
                            .map(|current_checksum| current_checksum != checksum)
                            .unwrap_or(true);
                        if metafile_has_changed {
-                            info!("Meta file {:?} was modified", path);
+                            info!("Meta file {path:?} was modified");
                            current_checksum_opt = Some(checksum);
                            // We actually ignore callbacks failing here.
                            // We just wait for the end of their execution.
@@ -75,7 +75,7 @@ impl FileWatcher {
        let reader = match fs::File::open(path) {
            Ok(f) => io::BufReader::new(f),
            Err(e) => {
-                warn!("Failed to open meta file {:?}: {:?}", path, e);
+                warn!("Failed to open meta file {path:?}: {e:?}");
                return Err(e);
            }
        };
--- a/src/directory/managed_directory.rs
+++ b/src/directory/managed_directory.rs
@@ -157,7 +157,7 @@ impl ManagedDirectory {
        for file_to_delete in files_to_delete {
            match self.delete(&file_to_delete) {
                Ok(_) => {
-                    info!("Deleted {:?}", file_to_delete);
+                    info!("Deleted {file_to_delete:?}");
                    deleted_files.push(file_to_delete);
                }
                Err(file_error) => {
@@ -170,7 +170,7 @@ impl ManagedDirectory {
                            if !cfg!(target_os = "windows") {
                                // On windows, delete is expected to fail if the file
                                // is mmapped.
-                                error!("Failed to delete {:?}", file_to_delete);
+                                error!("Failed to delete {file_to_delete:?}");
                            }
                        }
                    }
--- a/src/directory/mmap_directory.rs
+++ b/src/directory/mmap_directory.rs
@@ -7,7 +7,7 @@ use std::path::{Path, PathBuf};
 use std::sync::{Arc, RwLock, Weak};

 use common::StableDeref;
-use fs4::FileExt;
+use fs4::fs_std::FileExt;
 #[cfg(all(feature = "mmap", unix))]
 pub use memmap2::Advice;
 use memmap2::Mmap;
@@ -29,7 +29,7 @@ pub type WeakArcBytes = Weak<dyn Deref<Target = [u8]> + Send + Sync + 'static>;

 /// Create a default io error given a string.
 pub(crate) fn make_io_err(msg: String) -> io::Error {
-    io::Error::new(io::ErrorKind::Other, msg)
+    io::Error::other(msg)
 }

 /// Returns `None` iff the file exists, can be read, but is empty (and hence
@@ -369,7 +369,7 @@ pub(crate) fn atomic_write(path: &Path, content: &[u8]) -> io::Result<()> {

 impl Directory for MmapDirectory {
    fn get_file_handle(&self, path: &Path) -> Result<Arc<dyn FileHandle>, OpenReadError> {
-        debug!("Open Read {:?}", path);
+        debug!("Open Read {path:?}");
        let full_path = self.resolve_path(path);

        let mut mmap_cache = self.inner.mmap_cache.write().map_err(|_| {
@@ -414,7 +414,7 @@ impl Directory for MmapDirectory {
    }

    fn open_write(&self, path: &Path) -> Result<WritePtr, OpenWriteError> {
-        debug!("Open Write {:?}", path);
+        debug!("Open Write {path:?}");
        let full_path = self.resolve_path(path);

        let open_res = OpenOptions::new()
@@ -467,7 +467,7 @@ impl Directory for MmapDirectory {
    }

    fn atomic_write(&self, path: &Path, content: &[u8]) -> io::Result<()> {
-        debug!("Atomic Write {:?}", path);
+        debug!("Atomic Write {path:?}");
        let full_path = self.resolve_path(path);
        atomic_write(&full_path, content)?;
        Ok(())
@@ -484,8 +484,8 @@ impl Directory for MmapDirectory {
            .map_err(LockError::wrap_io_error)?;
        if lock.is_blocking {
            file.lock_exclusive().map_err(LockError::wrap_io_error)?;
-        } else {
-            file.try_lock_exclusive().map_err(|_| LockError::LockBusy)?
+        } else if !file.try_lock_exclusive().map_err(|_| LockError::LockBusy)? {
+            return Err(LockError::LockBusy);
        }
        // dropping the file handle will release the lock.
        Ok(DirectoryLock::from(Box::new(ReleaseLockFile {
--- a/src/directory/ram_directory.rs
+++ b/src/directory/ram_directory.rs
@@ -191,7 +191,7 @@ impl Directory for RamDirectory {
            .fs
            .read()
            .map_err(|e| OpenReadError::IoError {
-                io_error: Arc::new(io::Error::new(io::ErrorKind::Other, e.to_string())),
+                io_error: Arc::new(io::Error::other(e.to_string())),
                filepath: path.to_path_buf(),
            })?
            .exists(path))
--- a/src/directory/watch_event_router.rs
+++ b/src/directory/watch_event_router.rs
@@ -90,10 +90,7 @@ impl WatchCallbackList {
                let _ = sender.send(Ok(()));
            });
        if let Err(err) = spawn_res {
-            error!(
-                "Failed to spawn thread to call watch callbacks. Cause: {:?}",
-                err
-            );
+            error!("Failed to spawn thread to call watch callbacks. Cause: {err:?}");
        }
        result
    }
--- a/src/index/index.rs
+++ b/src/index/index.rs
@@ -216,7 +216,7 @@ impl IndexBuilder {

    /// Opens or creates a new index in the provided directory
    pub fn open_or_create<T: Into<Box<dyn Directory>>>(self, dir: T) -> crate::Result<Index> {
-        let dir = dir.into();
+        let dir: Box<dyn Directory> = dir.into();
        if !Index::exists(&*dir)? {
            return self.create(dir);
        }
@@ -494,7 +494,7 @@ impl Index {
            .into_iter()
            .map(|segment| SegmentReader::open(&segment)?.fields_metadata())
            .collect::<Result<_, _>>()?;
-        Ok(merge_field_meta_data(fields_metadata, &self.schema()))
+        Ok(merge_field_meta_data(fields_metadata))
    }

    /// Creates a new segment_meta (Advanced user only).
--- a/src/index/inverted_index_reader.rs
+++ b/src/index/inverted_index_reader.rs
@@ -1,8 +1,7 @@
 use std::io;

 use common::json_path_writer::JSON_END_OF_PATH;
-use common::BinarySerializable;
-use fnv::FnvHashSet;
+use common::{BinarySerializable, ByteCount};
 #[cfg(feature = "quickwit")]
 use futures_util::{FutureExt, StreamExt, TryStreamExt};
 #[cfg(feature = "quickwit")]
@@ -36,6 +35,33 @@ pub struct InvertedIndexReader {
    total_num_tokens: u64,
 }

+/// Object that records the amount of space used by a field in an inverted index.
+pub(crate) struct InvertedIndexFieldSpace {
+    pub field_name: String,
+    pub field_type: Type,
+    pub postings_size: ByteCount,
+    pub positions_size: ByteCount,
+    pub num_terms: u64,
+}
+
+/// Returns None if the term is not a valid JSON path.
+fn extract_field_name_and_field_type_from_json_path(term: &[u8]) -> Option<(String, Type)> {
+    let index = term.iter().position(|&byte| byte == JSON_END_OF_PATH)?;
+    let field_type_code = term.get(index + 1).copied()?;
+    let field_type = Type::from_code(field_type_code)?;
+    // The field name is made of the bytes preceding the JSON_END_OF_PATH marker.
+    let field_name = String::from_utf8_lossy(&term[..index]).to_string();
+    Some((field_name, field_type))
+}
+
+impl InvertedIndexFieldSpace {
+    fn record(&mut self, term_info: &TermInfo) {
+        self.postings_size += ByteCount::from(term_info.posting_num_bytes() as u64);
+        self.positions_size += ByteCount::from(term_info.positions_num_bytes() as u64);
+        self.num_terms += 1;
+    }
+}
+
 impl InvertedIndexReader {
    pub(crate) fn new(
        termdict: TermDictionary,
@@ -81,20 +107,56 @@ impl InvertedIndexReader {
    ///
    /// Notice: This requires a full scan and therefore **very expensive**.
    /// TODO: Move to sstable to use the index.
-    pub fn list_encoded_fields(&self) -> io::Result<Vec<(String, Type)>> {
+    pub(crate) fn list_encoded_json_fields(&self) -> io::Result<Vec<InvertedIndexFieldSpace>> {
        let mut stream = self.termdict.stream()?;
-        let mut fields = Vec::new();
-        let mut fields_set = FnvHashSet::default();
-        while let Some((term, _term_info)) = stream.next() {
-            if let Some(index) = term.iter().position(|&byte| byte == JSON_END_OF_PATH) {
-                if !fields_set.contains(&term[..index + 2]) {
-                    fields_set.insert(term[..index + 2].to_vec());
-                    let typ = Type::from_code(term[index + 1]).unwrap();
-                    fields.push((String::from_utf8_lossy(&term[..index]).to_string(), typ));
+        let mut fields: Vec<InvertedIndexFieldSpace> = Vec::new();
+
+        let mut current_field_opt: Option<InvertedIndexFieldSpace> = None;
+        // Current field bytes, including the JSON_END_OF_PATH.
+        let mut current_field_bytes: Vec<u8> = Vec::new();
+
+        while let Some((term, term_info)) = stream.next() {
+            if let Some(current_field) = &mut current_field_opt {
+                if term.starts_with(&current_field_bytes) {
+                    // We are still in the same field.
+                    current_field.record(term_info);
+                    continue;
                }
            }
+
+            // This is a new field!
+            // Let's flush the current field.
+            fields.extend(current_field_opt.take());
+            current_field_bytes.clear();
+
+            // And create a new one.
+            let Some((field_name, field_type)) =
+                extract_field_name_and_field_type_from_json_path(term)
+            else {
+                error!(
+                    "invalid term bytes encountered {term:?}. this only happens if the term \
+                     dictionary is corrupted. please report"
+                );
+                continue;
+            };
+            let mut field_space = InvertedIndexFieldSpace {
+                field_name,
+                field_type,
+                postings_size: ByteCount::default(),
+                positions_size: ByteCount::default(),
+                num_terms: 0u64,
+            };
+            field_space.record(term_info);
+
+            // We include the json type and the json end of path to make sure the prefix check
+            // is meaningful.
+            current_field_bytes.extend_from_slice(&term[..field_space.field_name.len() + 2]);
+            current_field_opt = Some(field_space);
        }

+        // We need to flush the last field as well.
+        fields.extend(current_field_opt.take());
+
        Ok(fields)
    }

--- a/src/index/segment_reader.rs
+++ b/src/index/segment_reader.rs
@@ -1,8 +1,8 @@
 use std::collections::HashMap;
-use std::ops::BitOrAssign;
 use std::sync::{Arc, RwLock};
 use std::{fmt, io};

+use common::{ByteCount, HasLen};
 use fnv::FnvHashMap;
 use itertools::Itertools;

@@ -304,12 +304,16 @@ impl SegmentReader {
        for (field, field_entry) in self.schema().fields() {
            let field_name = field_entry.name().to_string();
            let is_indexed = field_entry.is_indexed();
-
            if is_indexed {
                let is_json = field_entry.field_type().value_type() == Type::Json;
                if is_json {
+                    let term_dictionary_json_field_num_bytes: u64 = self
+                        .termdict_composite
+                        .open_read(field)
+                        .map(|file_slice| file_slice.len() as u64)
+                        .unwrap_or(0u64);
                    let inv_index = self.inverted_index(field)?;
-                    let encoded_fields_in_index = inv_index.list_encoded_fields()?;
+                    let encoded_fields_in_index = inv_index.list_encoded_json_fields()?;
                    let mut build_path = |field_name: &str, mut json_path: String| {
                        // In this case we need to map the potential fast field to the field name
                        // accepted by the query parser.
@@ -328,30 +332,65 @@ impl SegmentReader {
                            format!("{field_name}.{json_path}")
                        }
                    };
-                    indexed_fields.extend(
-                        encoded_fields_in_index
-                            .into_iter()
-                            .map(|(name, typ)| (build_path(&field_name, name), typ))
-                            .map(|(field_name, typ)| FieldMetadata {
-                                indexed: true,
-                                stored: false,
-                                field_name,
-                                fast: false,
-                                typ,
-                            }),
-                    );
+                    let total_num_terms = encoded_fields_in_index
+                        .iter()
+                        .map(|field_space| field_space.num_terms)
+                        .sum();
+                    indexed_fields.extend(encoded_fields_in_index.into_iter().map(|field_space| {
+                        let field_name = build_path(&field_name, field_space.field_name);
+                        // It is complex to attribute the exact amount of bytes required by a
+                        // specific field in the json field. Instead, as a proxy, we
+                        // attribute the total amount of bytes for the entire json field,
+                        // proportionally to the number of terms in each
+                        // field.
+                        let term_dictionary_size = (term_dictionary_json_field_num_bytes
+                            * field_space.num_terms)
+                            .checked_div(total_num_terms)
+                            .unwrap_or(0);
+                        FieldMetadata {
+                            postings_size: Some(field_space.postings_size),
+                            positions_size: Some(field_space.positions_size),
+                            term_dictionary_size: Some(ByteCount::from(term_dictionary_size)),
+                            fast_size: None,
+                            // The stored flag is taken from the schema entry of the json field.
+                            stored: field_entry.is_stored(),
+                            field_name,
+                            typ: field_space.field_type,
+                        }
+                    }));
                } else {
+                    let postings_size: ByteCount = self
+                        .postings_composite
+                        .open_read(field)
+                        .map(|posting_fileslice| posting_fileslice.len())
+                        .unwrap_or(0)
+                        .into();
+                    let positions_size: ByteCount = self
+                        .positions_composite
+                        .open_read(field)
+                        .map(|positions_fileslice| positions_fileslice.len())
+                        .unwrap_or(0)
+                        .into();
+                    let term_dictionary_size: ByteCount = self
+                        .termdict_composite
+                        .open_read(field)
+                        .map(|term_dictionary_fileslice| term_dictionary_fileslice.len())
+                        .unwrap_or(0)
+                        .into();
                    indexed_fields.push(FieldMetadata {
-                        indexed: true,
-                        stored: false,
                        field_name: field_name.to_string(),
-                        fast: false,
                        typ: field_entry.field_type().value_type(),
+                        // The stored flag is taken from the schema entry of this field.
+                        stored: field_entry.is_stored(),
+                        fast_size: None,
+                        term_dictionary_size: Some(term_dictionary_size),
+                        postings_size: Some(postings_size),
+                        positions_size: Some(positions_size),
                    });
                }
            }
        }
-        let mut fast_fields: Vec<FieldMetadata> = self
+        let fast_fields: Vec<FieldMetadata> = self
            .fast_fields()
            .columnar()
            .iter_columns()?
@@ -363,23 +402,21 @@ impl SegmentReader {
                    .get(&field_name)
                    .unwrap_or(&field_name)
                    .to_string();
+                let stored = is_field_stored(&field_name, &self.schema);
                FieldMetadata {
-                    indexed: false,
-                    stored: false,
                    field_name,
-                    fast: true,
                    typ: Type::from(handle.column_type()),
+                    stored,
+                    fast_size: Some(handle.num_bytes()),
+                    term_dictionary_size: None,
+                    postings_size: None,
+                    positions_size: None,
                }
            })
            .collect();
-        // Since the type is encoded differently in the fast field and in the inverted index,
-        // the order of the fields is not guaranteed to be the same. Therefore, we sort the fields.
-        // If we are sure that the order is the same, we can remove this sort.
-        indexed_fields.sort_unstable();
-        fast_fields.sort_unstable();
-        let merged = merge_field_meta_data(vec![indexed_fields, fast_fields], &self.schema);
-
-        Ok(merged)
+        let merged_field_metadatas: Vec<FieldMetadata> =
+            merge_field_meta_data(vec![indexed_fields, fast_fields]);
+        Ok(merged_field_metadatas)
    }

    /// Returns the segment id
@@ -443,20 +480,47 @@ pub struct FieldMetadata {
    // Notice: Don't reorder the declaration of 1.field_name 2.typ, as it is used for ordering by
    // field_name then typ.
    pub typ: Type,
-    /// Is the field indexed for search
-    pub indexed: bool,
    /// Is the field stored in the doc store
    pub stored: bool,
-    /// Is the field stored in the columnar storage
-    pub fast: bool,
+    /// Size occupied in the columnar storage (None if not fast)
+    pub fast_size: Option<ByteCount>,
+    /// Size occupied in the term dictionary (None if not indexed)
+    pub term_dictionary_size: Option<ByteCount>,
+    /// Size occupied in the index postings storage (None if not indexed)
+    pub postings_size: Option<ByteCount>,
+    /// Size occupied in the index positions storage (None if positions are not recorded)
+    pub positions_size: Option<ByteCount>,
 }
-impl BitOrAssign for FieldMetadata {
-    fn bitor_assign(&mut self, rhs: Self) {
-        assert!(self.field_name == rhs.field_name);
-        assert!(self.typ == rhs.typ);
-        self.indexed |= rhs.indexed;
+
+fn merge_options(left: Option<ByteCount>, right: Option<ByteCount>) -> Option<ByteCount> {
+    match (left, right) {
+        (Some(l), Some(r)) => Some(l + r),
+        (None, right) => right,
+        (left, None) => left,
+    }
+}
+
+impl FieldMetadata {
+    /// Returns true if and only if the field is indexed.
+    pub fn is_indexed(&self) -> bool {
+        self.postings_size.is_some()
+    }
+
+    /// Returns true if and only if the field is a fast field (i.e.: recorded in columnar format).
+    pub fn is_fast(&self) -> bool {
+        self.fast_size.is_some()
+    }
+
+    /// Merges two field metadata.
+    pub fn merge(&mut self, rhs: Self) {
+        assert_eq!(self.field_name, rhs.field_name);
+        assert_eq!(self.typ, rhs.typ);
        self.stored |= rhs.stored;
-        self.fast |= rhs.fast;
+        self.fast_size = merge_options(self.fast_size, rhs.fast_size);
+        self.term_dictionary_size =
+            merge_options(self.term_dictionary_size, rhs.term_dictionary_size);
+        self.postings_size = merge_options(self.postings_size, rhs.postings_size);
+        self.positions_size = merge_options(self.positions_size, rhs.positions_size);
    }
 }

@@ -469,23 +533,29 @@ fn is_field_stored(field_name: &str, schema: &Schema) -> bool {
 }

 /// Helper to merge the field metadata from multiple segments.
-pub fn merge_field_meta_data(
-    field_metadatas: Vec<Vec<FieldMetadata>>,
-    schema: &Schema,
-) -> Vec<FieldMetadata> {
+pub fn merge_field_meta_data(mut field_metadatas: Vec<Vec<FieldMetadata>>) -> Vec<FieldMetadata> {
+    // READ BEFORE REMOVING THIS!
+    //
+    // Because we replace field sep by `.`, fields are not always sorted.
+    // Also, to enforce such an implicit contract, we would have to add
+    // an assert here.
+    //
+    // Sorting is linear time on pre-sorted data, so we are simply better off sorting data here.
+    for field_metadatas in &mut field_metadatas {
+        field_metadatas.sort_unstable();
+    }
    let mut merged_field_metadata = Vec::new();
    for (_key, mut group) in &field_metadatas
        .into_iter()
-        .kmerge_by(|left, right| left < right)
+        .kmerge()
        // TODO: Remove allocation
        .chunk_by(|el| (el.field_name.to_string(), el.typ))
    {
        let mut merged: FieldMetadata = group.next().unwrap();
        for el in group {
-            merged |= el;
+            merged.merge(el);
        }
        // Currently is_field_stored is maybe too slow for the high cardinality case
-        merged.stored = is_field_stored(&merged.field_name, schema);
        merged_field_metadata.push(merged);
    }
    merged_field_metadata
@@ -507,7 +577,7 @@ fn intersect_alive_bitset(
 }

 impl fmt::Debug for SegmentReader {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "SegmentReader({:?})", self.segment_id)
    }
 }
@@ -516,122 +586,168 @@ impl fmt::Debug for SegmentReader {
 mod test {
    use super::*;
    use crate::index::Index;
-    use crate::schema::{SchemaBuilder, Term, STORED, TEXT};
+    use crate::schema::{Term, STORED, TEXT};
    use crate::IndexWriter;

+    #[track_caller]
+    fn assert_merge(fields_metadatas: &[Vec<FieldMetadata>], expected: &[FieldMetadata]) {
+        use itertools::Itertools;
+        let num_els = fields_metadatas.len();
+        for permutation in fields_metadatas.iter().cloned().permutations(num_els) {
+            let res = merge_field_meta_data(permutation);
+            assert_eq!(&res, &expected);
+        }
+    }
+
    #[test]
-    fn test_merge_field_meta_data_same() {
-        let schema = SchemaBuilder::new().build();
+    fn test_merge_field_meta_data_same_field() {
        let field_metadata1 = FieldMetadata {
            field_name: "a".to_string(),
            typ: crate::schema::Type::Str,
-            indexed: true,
            stored: false,
-            fast: true,
+            term_dictionary_size: Some(ByteCount::from(100u64)),
+            postings_size: Some(ByteCount::from(1_000u64)),
+            positions_size: Some(ByteCount::from(2_000u64)),
+            fast_size: Some(ByteCount::from(1_000u64).into()),
        };
        let field_metadata2 = FieldMetadata {
            field_name: "a".to_string(),
            typ: crate::schema::Type::Str,
-            indexed: true,
            stored: false,
-            fast: true,
+            term_dictionary_size: Some(ByteCount::from(80u64)),
+            postings_size: Some(ByteCount::from(1_500u64)),
+            positions_size: Some(ByteCount::from(2_500u64)),
+            fast_size: Some(ByteCount::from(3_000u64).into()),
        };
-        let res = merge_field_meta_data(
-            vec![vec![field_metadata1.clone()], vec![field_metadata2]],
-            &schema,
+        let expected = FieldMetadata {
+            field_name: "a".to_string(),
+            typ: crate::schema::Type::Str,
+            stored: false,
+            term_dictionary_size: Some(ByteCount::from(180u64)),
+            postings_size: Some(ByteCount::from(2_500u64)),
+            positions_size: Some(ByteCount::from(4_500u64)),
+            fast_size: Some(ByteCount::from(4_000u64).into()),
+        };
+        assert_merge(
+            &[vec![field_metadata1.clone()], vec![field_metadata2]],
+            &[expected],
        );
-        assert_eq!(res, vec![field_metadata1]);
    }
+
+    #[track_caller]
    #[test]
    fn test_merge_field_meta_data_different() {
-        let schema = SchemaBuilder::new().build();
        let field_metadata1 = FieldMetadata {
            field_name: "a".to_string(),
            typ: crate::schema::Type::Str,
-            indexed: false,
            stored: false,
-            fast: true,
+            fast_size: Some(1_000u64.into()),
+            term_dictionary_size: Some(100u64.into()),
+            postings_size: Some(2_000u64.into()),
+            positions_size: Some(4_000u64.into()),
        };
        let field_metadata2 = FieldMetadata {
            field_name: "b".to_string(),
            typ: crate::schema::Type::Str,
-            indexed: false,
            stored: false,
-            fast: true,
+            fast_size: Some(1_002u64.into()),
+            term_dictionary_size: None,
+            postings_size: None,
+            positions_size: None,
        };
        let field_metadata3 = FieldMetadata {
            field_name: "a".to_string(),
            typ: crate::schema::Type::Str,
-            indexed: true,
+            term_dictionary_size: Some(101u64.into()),
+            postings_size: Some(2_001u64.into()),
+            positions_size: Some(4_001u64.into()),
            stored: false,
-            fast: false,
+            fast_size: None,
        };
-        let res = merge_field_meta_data(
-            vec![
+        let expected = vec![
+            FieldMetadata {
+                field_name: "a".to_string(),
+                typ: crate::schema::Type::Str,
+                stored: false,
+                term_dictionary_size: Some(201u64.into()),
+                postings_size: Some(4_001u64.into()),
+                positions_size: Some(8_001u64.into()),
+                fast_size: Some(1_000u64.into()),
+            },
+            FieldMetadata {
+                field_name: "b".to_string(),
+                typ: crate::schema::Type::Str,
+                stored: false,
+                term_dictionary_size: None,
+                postings_size: None,
+                positions_size: None,
+                fast_size: Some(1_002u64.into()),
+            },
+        ];
+        assert_merge(
+            &[
                vec![field_metadata1.clone(), field_metadata2.clone()],
                vec![field_metadata3],
            ],
-            &schema,
+            &expected,
        );
-        let field_metadata_expected1 = FieldMetadata {
-            field_name: "a".to_string(),
-            typ: crate::schema::Type::Str,
-            indexed: true,
-            stored: false,
-            fast: true,
-        };
-        assert_eq!(res, vec![field_metadata_expected1, field_metadata2.clone()]);
    }
+
    #[test]
    fn test_merge_field_meta_data_merge() {
-        use pretty_assertions::assert_eq;
        let get_meta_data = |name: &str, typ: Type| FieldMetadata {
            field_name: name.to_string(),
            typ,
-            indexed: false,
+            term_dictionary_size: None,
+            postings_size: None,
+            positions_size: None,
            stored: false,
-            fast: true,
+            fast_size: Some(1u64.into()),
        };
-        let schema = SchemaBuilder::new().build();
-        let mut metas = vec![get_meta_data("d", Type::Str), get_meta_data("e", Type::U64)];
-        metas.sort();
-        let res = merge_field_meta_data(vec![vec![get_meta_data("e", Type::Str)], metas], &schema);
-        assert_eq!(
-            res,
-            vec![
+        let metas = vec![get_meta_data("d", Type::Str), get_meta_data("e", Type::U64)];
+        assert_merge(
+            &[vec![get_meta_data("e", Type::Str)], metas],
+            &[
                get_meta_data("d", Type::Str),
                get_meta_data("e", Type::Str),
                get_meta_data("e", Type::U64),
-            ]
+            ],
        );
    }
+
    #[test]
    fn test_merge_field_meta_data_bitxor() {
        let field_metadata1 = FieldMetadata {
            field_name: "a".to_string(),
            typ: crate::schema::Type::Str,
-            indexed: false,
+            term_dictionary_size: None,
+            postings_size: None,
+            positions_size: None,
            stored: false,
-            fast: true,
+            fast_size: Some(10u64.into()),
        };
        let field_metadata2 = FieldMetadata {
            field_name: "a".to_string(),
            typ: crate::schema::Type::Str,
-            indexed: true,
+            term_dictionary_size: Some(10u64.into()),
+            postings_size: Some(11u64.into()),
+            positions_size: Some(12u64.into()),
            stored: false,
-            fast: false,
+            fast_size: None,
        };
        let field_metadata_expected = FieldMetadata {
            field_name: "a".to_string(),
            typ: crate::schema::Type::Str,
-            indexed: true,
+            term_dictionary_size: Some(10u64.into()),
+            postings_size: Some(11u64.into()),
+            positions_size: Some(12u64.into()),
            stored: false,
-            fast: true,
+            fast_size: Some(10u64.into()),
        };
        let mut res1 = field_metadata1.clone();
-        res1 |= field_metadata2.clone();
+        res1.merge(field_metadata2.clone());
        let mut res2 = field_metadata2.clone();
-        res2 |= field_metadata1;
+        res2.merge(field_metadata1);
        assert_eq!(res1, field_metadata_expected);
        assert_eq!(res2, field_metadata_expected);
    }
@@ -662,6 +778,7 @@ mod test {
        assert_eq!(4, searcher.segment_reader(0).max_doc());
        Ok(())
    }
+
    #[test]
    fn test_alive_docs_iterator() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
--- a/src/indexer/index_writer.rs
+++ b/src/indexer/index_writer.rs
@@ -370,7 +370,7 @@ impl<D: Document> IndexWriter<D> {
            .map_err(|_| error_in_index_worker_thread("Failed to join merging thread."));

        if let Err(ref e) = result {
-            error!("Some merging thread failed {:?}", e);
+            error!("Some merging thread failed {e:?}");
        }

        result
@@ -615,7 +615,7 @@ impl<D: Document> IndexWriter<D> {
    /// It is also possible to add a payload to the `commit`
    /// using this API.
    /// See [`PreparedCommit::set_payload()`].
-    pub fn prepare_commit(&mut self) -> crate::Result<PreparedCommit<D>> {
+    pub fn prepare_commit(&mut self) -> crate::Result<PreparedCommit<'_, D>> {
        // Here, because we join all of the worker threads,
        // all of the segment update for this commit have been
        // sent.
@@ -644,7 +644,7 @@ impl<D: Document> IndexWriter<D> {

        let commit_opstamp = self.stamper.stamp();
        let prepared_commit = PreparedCommit::new(self, commit_opstamp);
-        info!("Prepared commit {}", commit_opstamp);
+        info!("Prepared commit {commit_opstamp}");
        Ok(prepared_commit)
    }

--- a/src/indexer/mod.rs
+++ b/src/indexer/mod.rs
@@ -61,6 +61,8 @@ type AddBatchReceiver<D> = channel::Receiver<AddBatch<D>>;
 #[cfg(test)]
 mod tests_mmap {

+    use common::ByteCount;
+
    use crate::aggregation::agg_req::Aggregations;
    use crate::aggregation::agg_result::AggregationResults;
    use crate::aggregation::AggregationCollector;
@@ -280,11 +282,14 @@ mod tests_mmap {
            field_name_out
        };

-        let mut fields = reader.searcher().segment_readers()[0]
+        let mut fields: Vec<(String, Type)> = reader.searcher().segment_readers()[0]
            .inverted_index(field)
            .unwrap()
-            .list_encoded_fields()
-            .unwrap();
+            .list_encoded_json_fields()
+            .unwrap()
+            .into_iter()
+            .map(|field_space| (field_space.field_name, field_space.field_type))
+            .collect();
        assert_eq!(fields.len(), 8);
        fields.sort();
        let mut expected_fields = vec![
@@ -385,7 +390,12 @@ mod tests_mmap {
        let reader = &searcher.segment_readers()[0];
        let inverted_index = reader.inverted_index(json_field).unwrap();
        assert_eq!(
-            inverted_index.list_encoded_fields().unwrap(),
+            inverted_index
+                .list_encoded_json_fields()
+                .unwrap()
+                .into_iter()
+                .map(|field_space| (field_space.field_name, field_space.field_type))
+                .collect::<Vec<_>>(),
            [
                ("k8s.container.name".to_string(), Type::Str),
                ("sub\u{1}a".to_string(), Type::I64),
@@ -402,19 +412,41 @@ mod tests_mmap {
    fn test_json_fields_metadata_expanded_dots_one_segment() {
        test_json_fields_metadata(true, true);
    }
+
    #[test]
    fn test_json_fields_metadata_expanded_dots_multi_segment() {
        test_json_fields_metadata(true, false);
    }
+
    #[test]
    fn test_json_fields_metadata_no_expanded_dots_one_segment() {
        test_json_fields_metadata(false, true);
    }
+
    #[test]
    fn test_json_fields_metadata_no_expanded_dots_multi_segment() {
        test_json_fields_metadata(false, false);
    }

+    #[track_caller]
+    fn assert_size_eq(lhs: Option<ByteCount>, rhs: Option<ByteCount>) {
+        let ignore_actual_values = |size_opt: Option<ByteCount>| size_opt.map(|val| val > 0);
+        assert_eq!(ignore_actual_values(lhs), ignore_actual_values(rhs));
+    }
+
+    #[track_caller]
+    fn assert_field_metadata_eq_but_ignore_field_size(
+        expected: &FieldMetadata,
+        actual: &FieldMetadata,
+    ) {
+        assert_eq!(&expected.field_name, &actual.field_name);
+        assert_eq!(&expected.typ, &actual.typ);
+        assert_eq!(&expected.stored, &actual.stored);
+        assert_size_eq(expected.postings_size, actual.postings_size);
+        assert_size_eq(expected.positions_size, actual.positions_size);
+        assert_size_eq(expected.fast_size, actual.fast_size);
+    }
+
    fn test_json_fields_metadata(expanded_dots: bool, one_segment: bool) {
        use pretty_assertions::assert_eq;
        let mut schema_builder = Schema::builder();
@@ -453,81 +485,101 @@ mod tests_mmap {
        assert_eq!(searcher.num_docs(), 3);

        let fields_metadata = index.fields_metadata().unwrap();
-        assert_eq!(
-            fields_metadata,
-            [
-                FieldMetadata {
-                    field_name: "empty".to_string(),
-                    indexed: true,
-                    stored: true,
-                    fast: true,
-                    typ: Type::U64
+
+        let expected_fields = &[
+            FieldMetadata {
+                field_name: "empty".to_string(),
+                stored: true,
+                typ: Type::U64,
+                term_dictionary_size: Some(0u64.into()),
+                fast_size: Some(1u64.into()),
+                postings_size: Some(0u64.into()),
+                positions_size: Some(0u64.into()),
+            },
+            FieldMetadata {
+                field_name: if expanded_dots {
+                    "json.shadow.k8s.container.name".to_string()
+                } else {
+                    "json.shadow.k8s\\.container\\.name".to_string()
                },
-                FieldMetadata {
-                    field_name: if expanded_dots {
-                        "json.shadow.k8s.container.name".to_string()
-                    } else {
-                        "json.shadow.k8s\\.container\\.name".to_string()
-                    },
-                    indexed: true,
-                    stored: true,
-                    fast: true,
-                    typ: Type::Str
-                },
-                FieldMetadata {
-                    field_name: "json.shadow.sub.a".to_string(),
-                    indexed: true,
-                    stored: true,
-                    fast: true,
-                    typ: Type::I64
-                },
-                FieldMetadata {
-                    field_name: "json.shadow.sub.b".to_string(),
-                    indexed: true,
-                    stored: true,
-                    fast: true,
-                    typ: Type::I64
-                },
-                FieldMetadata {
-                    field_name: "json.shadow.suber.a".to_string(),
-                    indexed: true,
-                    stored: true,
-                    fast: true,
-                    typ: Type::I64
-                },
-                FieldMetadata {
-                    field_name: "json.shadow.suber.a".to_string(),
-                    indexed: true,
-                    stored: true,
-                    fast: true,
-                    typ: Type::Str
-                },
-                FieldMetadata {
-                    field_name: "json.shadow.suber.b".to_string(),
-                    indexed: true,
-                    stored: true,
-                    fast: true,
-                    typ: Type::I64
-                },
-                FieldMetadata {
-                    field_name: "json.shadow.val".to_string(),
-                    indexed: true,
-                    stored: true,
-                    fast: true,
-                    typ: Type::Str
-                },
-                FieldMetadata {
-                    field_name: "numbers".to_string(),
-                    indexed: false,
-                    stored: false,
-                    fast: true,
-                    typ: Type::U64
-                }
-            ]
-        );
+                stored: true,
+                typ: Type::Str,
+                term_dictionary_size: Some(1u64.into()),
+                fast_size: Some(1u64.into()),
+                postings_size: Some(1u64.into()),
+                positions_size: Some(1u64.into()),
+            },
+            FieldMetadata {
+                field_name: "json.shadow.sub.a".to_string(),
+                typ: Type::I64,
+                stored: true,
+                fast_size: Some(1u64.into()),
+                term_dictionary_size: Some(1u64.into()),
+                postings_size: Some(1u64.into()),
+                positions_size: Some(1u64.into()),
+            },
+            FieldMetadata {
+                field_name: "json.shadow.sub.b".to_string(),
+                typ: Type::I64,
+                stored: true,
+                fast_size: Some(1u64.into()),
+                term_dictionary_size: Some(1u64.into()),
+                postings_size: Some(1u64.into()),
+                positions_size: Some(1u64.into()),
+            },
+            FieldMetadata {
+                field_name: "json.shadow.suber.a".to_string(),
+                stored: true,
+                typ: Type::I64,
+                fast_size: Some(1u64.into()),
+                term_dictionary_size: Some(1u64.into()),
+                postings_size: Some(1u64.into()),
+                positions_size: Some(1u64.into()),
+            },
+            FieldMetadata {
+                field_name: "json.shadow.suber.a".to_string(),
+                typ: Type::Str,
+                stored: true,
+                fast_size: Some(1u64.into()),
+                term_dictionary_size: Some(1u64.into()),
+                postings_size: Some(1u64.into()),
+                positions_size: Some(1u64.into()),
+            },
+            FieldMetadata {
+                field_name: "json.shadow.suber.b".to_string(),
+                typ: Type::I64,
+                stored: true,
+                fast_size: Some(1u64.into()),
+                term_dictionary_size: Some(1u64.into()),
+                postings_size: Some(1u64.into()),
+                positions_size: Some(1u64.into()),
+            },
+            FieldMetadata {
+                field_name: "json.shadow.val".to_string(),
+                typ: Type::Str,
+                stored: true,
+                fast_size: Some(1u64.into()),
+                term_dictionary_size: Some(1u64.into()),
+                postings_size: Some(1u64.into()),
+                positions_size: Some(1u64.into()),
+            },
+            FieldMetadata {
+                field_name: "numbers".to_string(),
+                stored: false,
+                typ: Type::U64,
+                fast_size: Some(1u64.into()),
+                term_dictionary_size: None,
+                postings_size: None,
+                positions_size: None,
+            },
+        ];
+        assert_eq!(fields_metadata.len(), expected_fields.len());
+        for (expected, value) in expected_fields.iter().zip(fields_metadata.iter()) {
+            assert_field_metadata_eq_but_ignore_field_size(expected, value);
+        }
        let query_parser = QueryParser::for_index(&index, vec![]);
        // Test if returned field name can be queried
-        for indexed_field in fields_metadata.iter().filter(|meta| meta.indexed) {
+        for indexed_field in fields_metadata.iter().filter(|meta| meta.is_indexed()) {
            let val = if indexed_field.typ == Type::Str {
                "a"
            } else {
@@ -543,7 +595,10 @@ mod tests_mmap {
            }
        }
        // Test if returned field name can be used for aggregation
-        for fast_field in fields_metadata.iter().filter(|meta| meta.fast) {
+        for fast_field in fields_metadata
+            .iter()
+            .filter(|field_metadata| field_metadata.is_fast())
+        {
            let agg_req_str = json!(
            {
              "termagg": {
--- a/src/indexer/segment_updater.rs
+++ b/src/indexer/segment_updater.rs
@@ -501,8 +501,7 @@ impl SegmentUpdater {
            Ok(segment_entries) => segment_entries,
            Err(err) => {
                warn!(
-                    "Starting the merge failed for the following reason. This is not fatal. {}",
-                    err
+                    "Starting the merge failed for the following reason. This is not fatal. {err}"
                );
                return err.into();
            }
--- a/src/indexer/segment_writer.rs
+++ b/src/indexer/segment_writer.rs
@@ -873,8 +873,8 @@ mod tests {

        fn assert_type(reader: &SegmentReader, field: &str, typ: ColumnType) {
            let cols = reader.fast_fields().dynamic_column_handles(field).unwrap();
-            assert_eq!(cols.len(), 1, "{}", field);
-            assert_eq!(cols[0].column_type(), typ, "{}", field);
+            assert_eq!(cols.len(), 1, "{field}");
+            assert_eq!(cols[0].column_type(), typ, "{field}");
        }
        assert_type(segment_reader, "json.toto", ColumnType::Str);
        assert_type(segment_reader, "json.float", ColumnType::F64);
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -55,7 +55,7 @@
 //! // between indexing threads.
 //! let mut index_writer: IndexWriter = index.writer(100_000_000)?;
 //!
-//! // Let's index one documents!
+//! // Let's index a document!
 //! index_writer.add_document(doc!(
 //!     title => "The Old Man and the Sea",
 //!     body => "He was an old man who fished alone in a skiff in \
@@ -165,7 +165,7 @@ mod macros;
 mod future_result;

 // Re-exports
-pub use common::DateTime;
+pub use common::{ByteCount, DateTime};
 pub use {columnar, query_grammar, time};

 pub use crate::error::TantivyError;
@@ -370,6 +370,8 @@ macro_rules! fail_point {
 /// Common test utilities.
 #[cfg(test)]
 pub mod tests {
+    use std::collections::BTreeMap;
+
    use common::{BinarySerializable, FixedSize};
    use query_grammar::{UserInputAst, UserInputLeaf, UserInputLiteral};
    use rand::distributions::{Bernoulli, Uniform};
@@ -382,7 +384,7 @@ pub mod tests {
    use crate::index::SegmentReader;
    use crate::merge_policy::NoMergePolicy;
    use crate::postings::Postings;
-    use crate::query::BooleanQuery;
+    use crate::query::{BooleanQuery, QueryParser};
    use crate::schema::*;
    use crate::{DateTime, DocAddress, Index, IndexWriter, ReloadPolicy};

@@ -1223,4 +1225,49 @@ pub mod tests {
        );
        assert_eq!(dt_from_ts_nanos.to_hms_micro(), offset_dt.to_hms_micro());
    }
+
+    #[test]
+    fn test_json_number_ambiguity() {
+        let mut schema_builder = Schema::builder();
+        let json_field = schema_builder.add_json_field("number", crate::schema::TEXT);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        let mut index_writer = index.writer_for_tests().unwrap();
+        {
+            let mut doc = TantivyDocument::new();
+            let mut obj = BTreeMap::default();
+            obj.insert("key".to_string(), OwnedValue::I64(1i64));
+            doc.add_object(json_field, obj);
+            index_writer.add_document(doc).unwrap();
+        }
+        {
+            let mut doc = TantivyDocument::new();
+            let mut obj = BTreeMap::default();
+            obj.insert("key".to_string(), OwnedValue::U64(1u64));
+            doc.add_object(json_field, obj);
+            index_writer.add_document(doc).unwrap();
+        }
+        {
+            let mut doc = TantivyDocument::new();
+            let mut obj = BTreeMap::default();
+            obj.insert("key".to_string(), OwnedValue::F64(1.0f64));
+            doc.add_object(json_field, obj);
+            index_writer.add_document(doc).unwrap();
+        }
+        index_writer.commit().unwrap();
+        let searcher = index.reader().unwrap().searcher();
+        assert_eq!(searcher.num_docs(), 3);
+        {
+            let parser = QueryParser::for_index(&index, vec![]);
+            let query = parser.parse_query("number.key:1").unwrap();
+            let count = searcher.search(&query, &crate::collector::Count).unwrap();
+            assert_eq!(count, 3);
+        }
+        {
+            let parser = QueryParser::for_index(&index, vec![]);
+            let query = parser.parse_query("number.key:1.0").unwrap();
+            let count = searcher.search(&query, &crate::collector::Count).unwrap();
+            assert_eq!(count, 3);
+        }
+    }
 }
--- a/src/positions/mod.rs
+++ b/src/positions/mod.rs
@@ -206,7 +206,7 @@ pub(crate) mod tests {
    #[test]
    fn test_position() -> crate::Result<()> {
        const CONST_VAL: u32 = 9u32;
-        let positions_delta: Vec<u32> = iter::repeat(CONST_VAL).take(2_000_000).collect();
+        let positions_delta: Vec<u32> = std::iter::repeat_n(CONST_VAL, 2_000_000).collect();
        let positions_data = create_positions_data(&positions_delta[..])?;
        assert_eq!(positions_data.len(), 1_015_627);
        let mut position_reader = PositionReader::open(positions_data)?;
--- a/src/postings/block_segment_postings.rs
+++ b/src/postings/block_segment_postings.rs
@@ -227,19 +227,6 @@ impl BlockSegmentPostings {
        self.doc_decoder.output_array()
    }

-    /// Returns a full block, regardless of whether the block is complete or incomplete (
-    /// as it happens for the last block of the posting list).
-    ///
-    /// In the latter case, the block is guaranteed to be padded with the sentinel value:
-    /// `TERMINATED`. The array is also guaranteed to be aligned on 16 bytes = 128 bits.
-    ///
-    /// This method is useful to run SSE2 linear search.
-    #[inline]
-    pub(crate) fn full_block(&self) -> &[DocId; COMPRESSION_BLOCK_SIZE] {
-        debug_assert!(self.block_is_loaded());
-        self.doc_decoder.full_output()
-    }
-
    /// Return the document at index `idx` of the block.
    #[inline]
    pub fn doc(&self, idx: usize) -> u32 {
@@ -275,22 +262,36 @@ impl BlockSegmentPostings {
    ///
    /// If all docs are smaller than target, the block loaded may be empty,
    /// or be the last an incomplete VInt block.
-    pub fn seek(&mut self, target_doc: DocId) {
-        self.shallow_seek(target_doc);
+    pub fn seek(&mut self, target_doc: DocId) -> usize {
+        // Move to the block that might contain our document.
+        self.seek_block(target_doc);
        self.load_block();
+
+        // At this point we are on the block that might contain our document.
+        let doc = self.doc_decoder.seek_within_block(target_doc);
+
+        // The last block is not full and padded with TERMINATED,
+        // so we are guaranteed to have at least one value (real or padding)
+        // that is >= target_doc.
+        debug_assert!(doc < COMPRESSION_BLOCK_SIZE);
+
+        // `doc` is now the in-block index of the first element >= `target_doc`.
+        // If all docs are smaller than target, the current block is incomplete and padded
+        // with TERMINATED. After the search, the cursor points to the first TERMINATED.
+        doc
    }

    pub(crate) fn position_offset(&self) -> u64 {
        self.skip_reader.position_offset()
    }

-    /// Dangerous API! This calls seek on the skip list,
+    /// Dangerous API! This seeks the next block on the skip list,
    /// but does not `.load_block()` afterwards.
    ///
    /// `.load_block()` needs to be called manually afterwards.
    /// If all docs are smaller than target, the block loaded may be empty,
    /// or be the last an incomplete VInt block.
-    pub(crate) fn shallow_seek(&mut self, target_doc: DocId) {
+    pub(crate) fn seek_block(&mut self, target_doc: DocId) {
        if self.skip_reader.seek(target_doc) {
            self.block_max_score_cache = None;
            self.block_loaded = false;
--- a/src/postings/compression/mod.rs
+++ b/src/postings/compression/mod.rs
@@ -151,9 +151,11 @@ impl BlockDecoder {
        &self.output[..self.output_len]
    }

+    /// Return in-block index of first value >= `target`.
+    /// Uses the padded buffer to enable branchless search.
    #[inline]
-    pub(crate) fn full_output(&self) -> &[u32; COMPRESSION_BLOCK_SIZE] {
-        &self.output
+    pub(crate) fn seek_within_block(&self, target: u32) -> usize {
+        crate::postings::branchless_binary_search(&self.output, target)
    }

    #[inline]
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Paul Masurel	ebb82dc549	clippy	2025-10-08 17:07:07 +02:00
PSeitz	270ca5123c	refactor postings (#2709 ) rename shallow_seek to seek_block remove full_block from public postings API This is as preparation to optionally handle Bitsets in the postings	2025-10-08 16:55:25 +02:00
Mustafa S. Moiz	714366d3b9	docs: correct grammar (#2704 ) Correct phrasing for a single line in the docs (`one documents` -> `a document`).	2025-10-08 16:47:09 +02:00
PSeitz-dd	40659d4d07	improve naming in buffered_union (#2705 )	2025-09-24 10:58:46 +02:00
PSeitz	e1e131a804	add and/or queries benchmark (#2701 )	2025-09-22 16:32:49 +02:00
PSeitz-dd	70da310b2d	perf: deduplicate queries (#2698 ) * deduplicate queries Deduplicate queries in the UserInputAst after parsing queries * add return type	2025-09-22 12:16:58 +02:00
PSeitz	85010b589a	clippy (#2700 ) * clippy * clippy * clippy * clippy + fmt --------- Co-authored-by: Pascal Seitz <pascal.seitz@datadoghq.com>	2025-09-19 18:04:25 +02:00
PSeitz-dd	2340dca628	fix compiler warnings (#2699 ) * fix compiler warnings * fix import	2025-09-19 15:55:04 +02:00
Remi	71a26d5b24	Fix CI with rust 1.90 (#2696 ) * Empty commit * Fix dead code lint error	2025-09-18 23:06:33 +02:00
PSeitz-dd	203751f2fe	Optimize ExistsQuery for a high number of dynamic columns (#2694 ) * Optimize ExistsQuery for a high number of dynamic columns The previous algorithm checked _each_ doc in _each_ column for existence. This causes huge cost on JSON fields with e.g. 100k columns. Compute a bitset instead if we have more than one column. add `iter_docs` to the multivalued_index * add benchmark subfields=1 exists_json_union Memory: 89.3 KB (+2.01%) Avg: 0.4865ms (-26.03%) Median: 0.4865ms (-26.03%) [0.4865ms .. 0.4865ms] subfields=2 exists_json_union Memory: 68.1 KB Avg: 1.7048ms (-0.46%) Median: 1.7048ms (-0.46%) [1.7048ms .. 1.7048ms] subfields=3 exists_json_union Memory: 61.8 KB Avg: 2.0742ms (-2.22%) Median: 2.0742ms (-2.22%) [2.0742ms .. 2.0742ms] subfields=4 exists_json_union Memory: 119.8 KB (+103.44%) Avg: 3.9500ms (+42.62%) Median: 3.9500ms (+42.62%) [3.9500ms .. 3.9500ms] subfields=5 exists_json_union Memory: 120.4 KB (+107.65%) Avg: 3.9610ms (+20.65%) Median: 3.9610ms (+20.65%) [3.9610ms .. 3.9610ms] subfields=6 exists_json_union Memory: 120.6 KB (+107.49%) Avg: 3.8903ms (+3.11%) Median: 3.8903ms (+3.11%) [3.8903ms .. 3.8903ms] subfields=7 exists_json_union Memory: 120.9 KB (+106.93%) Avg: 3.6220ms (-16.22%) Median: 3.6220ms (-16.22%) [3.6220ms .. 3.6220ms] subfields=8 exists_json_union Memory: 121.3 KB (+106.23%) Avg: 4.0981ms (-15.97%) Median: 4.0981ms (-15.97%) [4.0981ms .. 4.0981ms] subfields=16 exists_json_union Memory: 123.1 KB (+103.09%) Avg: 4.3483ms (-92.26%) Median: 4.3483ms (-92.26%) [4.3483ms .. 4.3483ms] subfields=256 exists_json_union Memory: 204.6 KB (+19.85%) Avg: 3.8874ms (-99.01%) Median: 3.8874ms (-99.01%) [3.8874ms .. 3.8874ms] subfields=4096 exists_json_union Memory: 2.0 MB Avg: 3.5571ms (-99.90%) Median: 3.5571ms (-99.90%) [3.5571ms .. 3.5571ms] subfields=65536 exists_json_union Memory: 28.3 MB Avg: 14.4417ms (-99.97%) Median: 14.4417ms (-99.97%) [14.4417ms .. 
14.4417ms] subfields=262144 exists_json_union Memory: 113.3 MB Avg: 66.2860ms (-99.95%) Median: 66.2860ms (-99.95%) [66.2860ms .. 66.2860ms] * rename methods	2025-09-16 18:21:03 +02:00
PSeitz-dd	7963b0b4aa	Add fast field fallback for term query if not indexed (#2693 ) * Add fast field fallback for term query if not indexed * only fallback without scores	2025-09-12 14:58:21 +02:00
Paul Masurel	d5eefca11d	Merge pull request #2692 from quickwit-oss/paul.masurel/coerce-floats-too-in-search-too This PR changes the logic used on the ingestion of floats.	2025-09-10 09:46:54 +02:00
Paul Masurel	5d6c8de23e	Align search float search logic to the columnar coercion rules It applies the same logic on floats as for u64 or i64. In all case, the idea is (for the inverted index) to coerce number to their canonical representation, before indexing and before searching. That way a document with the float 1.0 will be searchable when the user searches for 1. Note that contrary to the columnar, we do not attempt to coerce all of the terms associated to a given json path to a single numerical type. We simply rely on this "point-wise" canonicalization.	2025-09-09 19:28:17 +02:00
PSeitz	a06365f39f	Update CHANGELOG.md for bugfixes (#2674 ) * Update CHANGELOG.md * Update CHANGELOG.md	2025-09-04 11:51:00 +02:00
Raphaël Cohen	f4b374110f	feat: Regex query grammar (#2677 ) * feat: Regex query grammar * feat: Disable regexes by default * chore: Apply formatting	2025-09-03 10:07:04 +02:00
PSeitz-dd	c37af9c1ff	update release instructions (#2687 )	2025-08-22 07:57:48 +08:00
PSeitz	33794a114c	chore: Release (#2686 ) Co-authored-by: Pascal Seitz <pascal.seitz@datadoghq.com>	2025-08-20 18:29:37 +08:00
PSeitz-dd	8676a1f57b	prepare release: update Changelog (#2685 )	2025-08-20 16:07:53 +08:00
PSeitz-dd	021ff2ad63	move bench to binggan (#2684 )	2025-08-14 17:02:44 +08:00
Paul Masurel	39e027667b	per field size details (#2679 ) * Added per-field size details. This also does a bunch of refactoring. merging field metadata does not silently asserts that arguments should be sorted. merging does not set `stored`. We do not rely on a hashmap to group fields, but instead rely on the fact that the term dictionary is sorted. The inverted level method that exposes field metadata is not exposed as public anymore. * CR comment --------- Co-authored-by: Paul Masurel <paul.masurel@datadoghq.com>	2025-08-13 13:12:22 +02:00
PSeitz-dd	a1d65c3df3	test stable ordering with pagination (#2683 )	2025-08-13 15:36:28 +08:00
trinity-1686a	2e4615c2d3	Merge pull request #2678 from Darkheir/feat/query_grammar_space_between_field_and_value feat: Support spaces between field name and value	2025-08-11 09:57:23 +02:00
Darkheir	610091e2c4	feat: Applies PR review suggestion	2025-08-04 10:12:51 +02:00
trinity-1686a	c301e7b1c4	Merge pull request #2673 from paradedb/stuhood.fix-order-by-dup-string Fix `TopDocs::order_by_string_fast_field` for duplicates	2025-07-30 18:25:03 +02:00
Stu Hood	d9eb093368	Attempt to clarify `sorted_ords_to_term_cb`.	2025-07-29 21:56:31 -07:00
Darkheir	d4b090124c	feat: Support spaces between field name and value	2025-07-23 11:12:13 +02:00
PSeitz-dd	811c68cdb2	fix field_names in top_hits aggregation (#2675 )	2025-07-21 12:19:30 +08:00
trinity-1686a	bc1c789897	Merge pull request #2676 from quickwit-oss/trinity.pointard/allow-partial-default-field-success ignore failure to parse query when other default field suceeded	2025-07-18 14:20:41 +02:00
trinity Pointard	e7c8c331bd	ignore failure to parse query when other default field suceeded	2025-07-17 14:47:28 +02:00
Eric Ridge	2f01152a3c	adjust `Dictionary::sorted_ords_to_term_cb()` to allow duplicates	2025-07-16 13:38:43 -07:00
PSeitz	4e84c70387	Fix TopNComputer for reverse order (#2672 ) Co-authored-by: Pascal Seitz <pascal.seitz@datadoghq.com>	2025-07-16 21:44:04 +08:00
Paul M.	f2c77f06c5	Update fs4 to latest (0.13.1) (#2654 ) - One change was needed to handle the `Result<bool>` that now returns from `try_lock_exclusive` Co-authored-by: Paul M. <prov223@tutanota.com>	2025-07-14 11:26:19 +08:00
MassimilianoBaglioni	74334f9c9a	Fixed typo in documentation (#2629 ) Co-authored-by: Massimiliano Baglioni <massimilianobaglioni@MacBook-Air-di-Massimiliano.local>	2025-07-11 14:45:59 +08:00
Parth	cc4beb61ba	update CHANGELOG (#2670 ) * update CHANGELOG * Update CHANGELOG.md Co-authored-by: PSeitz <PSeitz@users.noreply.github.com> * Update CHANGELOG.md --------- Co-authored-by: PSeitz <PSeitz@users.noreply.github.com>	2025-07-11 11:33:11 +08:00
Dale Seo	6742e5981b	fix a typo in the comment (#2668 )	2025-07-10 07:14:57 +02:00
Philippe Noël	b128299976	Update ParadeDB logo (#2669 )	2025-07-10 07:14:35 +02:00
PSeitz	945af922d1	clippy (#2661 ) * clippy * use readable version --------- Co-authored-by: Pascal Seitz <pascal.seitz@datadoghq.com>	2025-07-02 11:25:03 +02:00
PSeitz-dd	295d07e55c	fix union performance regression (#2663 ) closes https://github.com/quickwit-oss/tantivy/issues/2656	2025-07-01 20:32:25 +02:00
PSeitz	080fa4d1f4	add docs/example and Vec<u32> values to sstable (#2660 )	2025-07-01 15:40:02 +02:00
PSeitz-dd	988c2b35e7	fix import in test (#2657 )	2025-06-24 12:55:34 +02:00
PSeitz	bf3cc12610	update CHANGELOG (#2621 ) Co-authored-by: Pascal Seitz <pascal.seitz@datadoghq.com>	2025-06-24 11:58:44 +02:00
Stu Hood	a2400f4e73	Add string fast field support to `TopDocs`. (#2642 ) * Add string fast field support to `TopDocs`. * Remove unnecessary generics, and review feedback. * Use actual/less-ambiguous cities. * Review feedback	2025-06-20 10:27:14 +02:00
Zhang.Jinrui	436ec6caea	fix typo for the comments of search_with_executor() (#2653 ) Co-authored-by: Zhang Jinrui <zhangjinrui@microsoft.com>	2025-06-19 09:53:21 +02:00
PSeitz	4a6123d3ff	release tantivy: bump versions (#2625 ) * chore: Release * chore: Release --------- Co-authored-by: Pascal Seitz <pascal.seitz@datadoghq.com>	2025-06-10 15:34:39 +02:00
Parth	5a2fe42c24	make zstd optional in sstable (#2633 ) * make zstd truly optional * changelog notes * make sure we write * resolve comments * make this a default feature * remove changelog notes	2025-05-14 17:16:41 +02:00
PSeitz	5379c99ea2	update edition to 2024 (#2620 ) * update common to edition 2024 * update bitpacker to edition 2024 * update stacker to edition 2024 * update query-grammar to edition 2024 * update sstable to edition 2024 + fmt * fmt * update columnar to edition 2024 * cargo fmt * use None instead of _	2025-04-18 04:56:31 +02:00