Compare commits


15 Commits

Author SHA1 Message Date
Paul Masurel
ebb82dc549 clippy 2025-10-08 17:07:07 +02:00
PSeitz
270ca5123c refactor postings (#2709)
rename shallow_seek to seek_block
remove full_block from public postings API

This is in preparation for optionally handling bitsets in the postings
2025-10-08 16:55:25 +02:00
Mustafa S. Moiz
714366d3b9 docs: correct grammar (#2704)
Correct phrasing for a single line in the docs (`one documents` -> `a document`).
2025-10-08 16:47:09 +02:00
PSeitz-dd
40659d4d07 improve naming in buffered_union (#2705) 2025-09-24 10:58:46 +02:00
PSeitz
e1e131a804 add and/or queries benchmark (#2701) 2025-09-22 16:32:49 +02:00
PSeitz-dd
70da310b2d perf: deduplicate queries (#2698)
* deduplicate queries

Deduplicate queries in the UserInputAst after parsing (a sketch follows this entry)

* add return type
2025-09-22 12:16:58 +02:00
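
A minimal sketch of the deduplication step, generic over the clause type; the real code in the rewrite_ast diff further down this page operates on (Option<Occur>, UserInputAst) pairs, which this PR makes Hash + Eq:

use fnv::FnvHashSet;

fn dedup_clauses<T: std::hash::Hash + Eq + Clone>(sub_clauses: &mut Vec<T>) {
    // Keep only the first occurrence of each clause. FnvHashSet::insert
    // returns false for values it has already seen, so retain drops duplicates.
    let mut seen = FnvHashSet::default();
    sub_clauses.retain(|clause| seen.insert(clause.clone()));
}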
PSeitz
85010b589a clippy (#2700)
* clippy

* clippy

* clippy

* clippy + fmt

---------

Co-authored-by: Pascal Seitz <pascal.seitz@datadoghq.com>
2025-09-19 18:04:25 +02:00
PSeitz-dd
2340dca628 fix compiler warnings (#2699)
* fix compiler warnings

* fix import
2025-09-19 15:55:04 +02:00
Remi
71a26d5b24 Fix CI with rust 1.90 (#2696)
* Empty commit

* Fix dead code lint error
2025-09-18 23:06:33 +02:00
PSeitz-dd
203751f2fe Optimize ExistsQuery for a high number of dynamic columns (#2694)
* Optimize ExistsQuery for a high number of dynamic columns

The previous algorithm checked _each_ doc in _each_ column for
existence. This incurs a huge cost on JSON fields with e.g. 100k columns.
Compute a bitset instead if we have more than one column (a sketch follows this entry).

add `iter_docs` to the multivalued_index

* add benchmark

subfields=1
exists_json_union    Memory: 89.3 KB (+2.01%)    Avg: 0.4865ms (-26.03%)    Median: 0.4865ms (-26.03%)    [0.4865ms .. 0.4865ms]
subfields=2
exists_json_union    Memory: 68.1 KB     Avg: 1.7048ms (-0.46%)    Median: 1.7048ms (-0.46%)    [1.7048ms .. 1.7048ms]
subfields=3
exists_json_union    Memory: 61.8 KB     Avg: 2.0742ms (-2.22%)    Median: 2.0742ms (-2.22%)    [2.0742ms .. 2.0742ms]
subfields=4
exists_json_union    Memory: 119.8 KB (+103.44%)    Avg: 3.9500ms (+42.62%)    Median: 3.9500ms (+42.62%)    [3.9500ms .. 3.9500ms]
subfields=5
exists_json_union    Memory: 120.4 KB (+107.65%)    Avg: 3.9610ms (+20.65%)    Median: 3.9610ms (+20.65%)    [3.9610ms .. 3.9610ms]
subfields=6
exists_json_union    Memory: 120.6 KB (+107.49%)    Avg: 3.8903ms (+3.11%)    Median: 3.8903ms (+3.11%)    [3.8903ms .. 3.8903ms]
subfields=7
exists_json_union    Memory: 120.9 KB (+106.93%)    Avg: 3.6220ms (-16.22%)    Median: 3.6220ms (-16.22%)    [3.6220ms .. 3.6220ms]
subfields=8
exists_json_union    Memory: 121.3 KB (+106.23%)    Avg: 4.0981ms (-15.97%)    Median: 4.0981ms (-15.97%)    [4.0981ms .. 4.0981ms]
subfields=16
exists_json_union    Memory: 123.1 KB (+103.09%)    Avg: 4.3483ms (-92.26%)    Median: 4.3483ms (-92.26%)    [4.3483ms .. 4.3483ms]
subfields=256
exists_json_union    Memory: 204.6 KB (+19.85%)    Avg: 3.8874ms (-99.01%)    Median: 3.8874ms (-99.01%)    [3.8874ms .. 3.8874ms]
subfields=4096
exists_json_union    Memory: 2.0 MB     Avg: 3.5571ms (-99.90%)    Median: 3.5571ms (-99.90%)    [3.5571ms .. 3.5571ms]
subfields=65536
exists_json_union    Memory: 28.3 MB     Avg: 14.4417ms (-99.97%)    Median: 14.4417ms (-99.97%)    [14.4417ms .. 14.4417ms]
subfields=262144
exists_json_union    Memory: 113.3 MB     Avg: 66.2860ms (-99.95%)    Median: 66.2860ms (-99.95%)    [66.2860ms .. 66.2860ms]

* rename methods
2025-09-16 18:21:03 +02:00
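
A sketch of the bitset construction described above, condensed from the ExistsWeight diff further down this page; `non_empty_columns` and `max_doc` come from the surrounding weight code:

// Union the non-null docs of every dynamic column into one bitset, so each
// doc is visited once per column instead of being probed per (doc, column).
let mut doc_bitset = BitSet::with_max_value(max_doc);
for column in &non_empty_columns {
    match column.column_index() {
        ColumnIndex::Optional(optional_index) => {
            for doc in optional_index.iter_non_null_docs() {
                doc_bitset.insert(doc);
            }
        }
        ColumnIndex::Multivalued(multi_idx) => {
            for doc in multi_idx.iter_non_null_docs() {
                doc_bitset.insert(doc);
            }
        }
        // Empty columns contribute nothing; Full columns are handled
        // earlier by returning an AllScorer.
        _ => {}
    }
}
let docset = BitSetDocSet::from(doc_bitset);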
PSeitz-dd
7963b0b4aa Add fast field fallback for term query if not indexed (#2693)
* Add fast field fallback for term query if not indexed

* only fallback without scores
2025-09-12 14:58:21 +02:00
Paul Masurel
d5eefca11d Merge pull request #2692 from quickwit-oss/paul.masurel/coerce-floats-too-in-search-too
This PR changes the logic used for the ingestion of floats.
2025-09-10 09:46:54 +02:00
Paul Masurel
5d6c8de23e Align float search logic with the columnar coercion rules
It applies the same logic to floats as to u64 or i64.
In all cases, the idea (for the inverted index) is to coerce numbers
to their canonical representation before indexing and before searching.

That way a document with the float 1.0 will be searchable when the user
searches for 1.

Note that contrary to the columnar, we do not attempt to coerce all of the
terms associated with a given json path to a single numerical type.
We simply rely on this "point-wise" canonicalization (illustrated after this entry).
2025-09-09 19:28:17 +02:00
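
A small illustration of this point-wise canonicalization, based on the `NumericalValue::normalize` tests in the columnar diff further down this page:

// 1.0 has an exact i64 representation, so it is indexed and searched as the integer term 1.
assert_eq!(NumericalValue::from(1.0f64).normalize(), NumericalValue::I64(1));
// 1.1 has no exact integer representation, so it stays an f64 term.
assert_eq!(NumericalValue::from(1.1f64).normalize(), NumericalValue::F64(1.1));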
PSeitz
a06365f39f Update CHANGELOG.md for bugfixes (#2674)
* Update CHANGELOG.md

* Update CHANGELOG.md
2025-09-04 11:51:00 +02:00
Raphaël Cohen
f4b374110f feat: Regex query grammar (#2677)
* feat: Regex query grammar

* feat: Disable regexes by default

* chore: Apply formatting
2025-09-03 10:07:04 +02:00
53 changed files with 1386 additions and 302 deletions

View File

@@ -14,6 +14,18 @@ Tantivy 0.25
- Support mixed field types in query parser [#2676](https://github.com/quickwit-oss/tantivy/pull/2676)(@trinity-1686a)
- Add per-field size details [#2679](https://github.com/quickwit-oss/tantivy/pull/2679)(@fulmicoton)
Tantivy 0.24.2
================================
- Fix TopNComputer for reverse order. [#2672](https://github.com/quickwit-oss/tantivy/pull/2672)(@stuhood @PSeitz)
Affected queries are [order_by_fast_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_fast_field) and
[order_by_u64_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_u64_field)
for `Order::Asc`
Tantivy 0.24.1
================================
- Fix: bump required rust version to 1.81
Tantivy 0.24
================================
Tantivy 0.24 will be backwards compatible with indices created with v0.22 and v0.21. The new minimum rust version will be 1.75. Tantivy 0.23 will be skipped.
@@ -96,6 +108,14 @@ This will slightly increase space and access time. [#2439](https://github.com/qu
- Fix trait bound of StoreReader::iter [#2360](https://github.com/quickwit-oss/tantivy/pull/2360)(@adamreichold)
- remove read_postings_no_deletes [#2526](https://github.com/quickwit-oss/tantivy/pull/2526)(@PSeitz)
Tantivy 0.22.1
================================
- Fix TopNComputer for reverse order. [#2672](https://github.com/quickwit-oss/tantivy/pull/2672)(@stuhood @PSeitz)
Affected queries are [order_by_fast_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_fast_field) and
[order_by_u64_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_u64_field)
for `Order::Asc`
Tantivy 0.22
================================

View File

@@ -167,3 +167,12 @@ harness = false
[[bench]]
name = "agg_bench"
harness = false
[[bench]]
name = "exists_json"
harness = false
[[bench]]
name = "and_or_queries"
harness = false

benches/and_or_queries.rs Normal file (224 additions)
View File

@@ -0,0 +1,224 @@
// Benchmarks boolean AND/OR queries using binggan.
//
// What's measured:
// - OR and AND queries with varying selectivity (only `Term` queries on the leaves for now)
// - Nested AND/OR combinations (on multiple fields)
// - No-scoring path using the Count collector (focus on iterator/skip performance)
// - Top-K retrieval (k=10) using the TopDocs collector
//
// Corpus model:
// - Synthetic docs; each token a/b/c is independently included per doc
// - If none of a/b/c are included, emit a neutral filler token to keep doc length similar
//
// Notes:
// - After optimization, when scoring is disabled Tantivy reads doc-only postings
// (IndexRecordOption::Basic), avoiding frequency decoding overhead.
// - This bench isolates boolean iteration speed and intersection/union cost.
// - Use `cargo bench --bench and_or_queries` to run.
use binggan::{black_box, BenchRunner};
use rand::prelude::*;
use rand::rngs::StdRng;
use rand::SeedableRng;
use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index, ReloadPolicy, Searcher};
#[derive(Clone)]
struct BenchIndex {
#[allow(dead_code)]
index: Index,
searcher: Searcher,
query_parser: QueryParser,
}
impl BenchIndex {
#[inline(always)]
fn count_query(&self, query_str: &str) -> usize {
let query = self.query_parser.parse_query(query_str).unwrap();
self.searcher.search(&query, &Count).unwrap()
}
#[inline(always)]
fn topk_len(&self, query_str: &str, k: usize) -> usize {
let query = self.query_parser.parse_query(query_str).unwrap();
self.searcher
.search(&query, &TopDocs::with_limit(k))
.unwrap()
.len()
}
}
/// Build a single index containing both fields (title, body) and
/// return two BenchIndex views:
/// - single_field: QueryParser defaults to only "body"
/// - multi_field: QueryParser defaults to ["title", "body"]
fn build_shared_indices(num_docs: usize, p_a: f32, p_b: f32, p_c: f32) -> (BenchIndex, BenchIndex) {
// Unified schema (two text fields)
let mut schema_builder = Schema::builder();
let f_title = schema_builder.add_text_field("title", TEXT);
let f_body = schema_builder.add_text_field("body", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
// Populate index with stable RNG for reproducibility.
let mut rng = StdRng::from_seed([7u8; 32]);
// Populate: spread each present token 90/10 to body/title
{
let mut writer = index.writer(500_000_000).unwrap();
for _ in 0..num_docs {
let has_a = rng.gen_bool(p_a as f64);
let has_b = rng.gen_bool(p_b as f64);
let has_c = rng.gen_bool(p_c as f64);
let mut title_tokens: Vec<&str> = Vec::new();
let mut body_tokens: Vec<&str> = Vec::new();
if has_a {
if rng.gen_bool(0.1) {
title_tokens.push("a");
} else {
body_tokens.push("a");
}
}
if has_b {
if rng.gen_bool(0.1) {
title_tokens.push("b");
} else {
body_tokens.push("b");
}
}
if has_c {
if rng.gen_bool(0.1) {
title_tokens.push("c");
} else {
body_tokens.push("c");
}
}
if title_tokens.is_empty() && body_tokens.is_empty() {
body_tokens.push("z");
}
writer
.add_document(doc!(
f_title=>title_tokens.join(" "),
f_body=>body_tokens.join(" ")
))
.unwrap();
}
writer.commit().unwrap();
}
// Prepare reader/searcher once.
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
let searcher = reader.searcher();
// Build two query parsers with different default fields.
let qp_single = QueryParser::for_index(&index, vec![f_body]);
let qp_multi = QueryParser::for_index(&index, vec![f_title, f_body]);
let single_view = BenchIndex {
index: index.clone(),
searcher: searcher.clone(),
query_parser: qp_single,
};
let multi_view = BenchIndex {
index,
searcher,
query_parser: qp_multi,
};
(single_view, multi_view)
}
fn main() {
// Prepare corpora with varying selectivity. Build one index per corpus
// and derive two views (single-field vs multi-field) from it.
let scenarios = vec![
(
"N=1M, p(a)=5%, p(b)=1%, p(c)=15%".to_string(),
1_000_000,
0.05,
0.01,
0.15,
),
(
"N=1M, p(a)=1%, p(b)=1%, p(c)=15%".to_string(),
1_000_000,
0.01,
0.01,
0.15,
),
];
let mut runner = BenchRunner::new();
for (label, n, pa, pb, pc) in scenarios {
let (single_view, multi_view) = build_shared_indices(n, pa, pb, pc);
// Single-field group: default field is body only
{
let mut group = runner.new_group();
group.set_name(format!("single_field — {}", label));
group.register_with_input("+a_+b_count", &single_view, |benv: &BenchIndex| {
black_box(benv.count_query("+a +b"))
});
group.register_with_input("+a_+b_+c_count", &single_view, |benv: &BenchIndex| {
black_box(benv.count_query("+a +b +c"))
});
group.register_with_input("+a_+b_top10", &single_view, |benv: &BenchIndex| {
black_box(benv.topk_len("+a +b", 10))
});
group.register_with_input("+a_+b_+c_top10", &single_view, |benv: &BenchIndex| {
black_box(benv.topk_len("+a +b +c", 10))
});
// OR queries
group.register_with_input("a_OR_b_count", &single_view, |benv: &BenchIndex| {
black_box(benv.count_query("a OR b"))
});
group.register_with_input("a_OR_b_OR_c_count", &single_view, |benv: &BenchIndex| {
black_box(benv.count_query("a OR b OR c"))
});
group.register_with_input("a_OR_b_top10", &single_view, |benv: &BenchIndex| {
black_box(benv.topk_len("a OR b", 10))
});
group.register_with_input("a_OR_b_OR_c_top10", &single_view, |benv: &BenchIndex| {
black_box(benv.topk_len("a OR b OR c", 10))
});
group.run();
}
// Multi-field group: default fields are [title, body]
{
let mut group = runner.new_group();
group.set_name(format!("multi_field — {}", label));
group.register_with_input("+a_+b_count", &multi_view, |benv: &BenchIndex| {
black_box(benv.count_query("+a +b"))
});
group.register_with_input("+a_+b_+c_count", &multi_view, |benv: &BenchIndex| {
black_box(benv.count_query("+a +b +c"))
});
group.register_with_input("+a_+b_top10", &multi_view, |benv: &BenchIndex| {
black_box(benv.topk_len("+a +b", 10))
});
group.register_with_input("+a_+b_+c_top10", &multi_view, |benv: &BenchIndex| {
black_box(benv.topk_len("+a +b +c", 10))
});
// OR queries
group.register_with_input("a_OR_b_count", &multi_view, |benv: &BenchIndex| {
black_box(benv.count_query("a OR b"))
});
group.register_with_input("a_OR_b_OR_c_count", &multi_view, |benv: &BenchIndex| {
black_box(benv.count_query("a OR b OR c"))
});
group.register_with_input("a_OR_b_top10", &multi_view, |benv: &BenchIndex| {
black_box(benv.topk_len("a OR b", 10))
});
group.register_with_input("a_OR_b_OR_c_top10", &multi_view, |benv: &BenchIndex| {
black_box(benv.topk_len("a OR b OR c", 10))
});
group.run();
}
}
}

benches/exists_json.rs Normal file (69 additions)
View File

@@ -0,0 +1,69 @@
use binggan::plugins::PeakMemAllocPlugin;
use binggan::{black_box, InputGroup, PeakMemAlloc, INSTRUMENTED_SYSTEM};
use serde_json::json;
use tantivy::collector::Count;
use tantivy::query::ExistsQuery;
use tantivy::schema::{Schema, FAST, TEXT};
use tantivy::{doc, Index};
#[global_allocator]
pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;
fn main() {
let doc_count: usize = 500_000;
let subfield_counts: &[usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 16, 256, 4096, 65536, 262144];
let indices: Vec<(String, Index)> = subfield_counts
.iter()
.map(|&sub_fields| {
(
format!("subfields={sub_fields}"),
build_index_with_json_subfields(doc_count, sub_fields),
)
})
.collect();
let mut group = InputGroup::new_with_inputs(indices);
group.add_plugin(PeakMemAllocPlugin::new(GLOBAL));
group.config().num_iter_group = Some(1);
group.config().num_iter_bench = Some(1);
group.register("exists_json", exists_json_union);
group.run();
}
fn exists_json_union(index: &Index) {
let reader = index.reader().expect("reader");
let searcher = reader.searcher();
let query = ExistsQuery::new("json".to_string(), true);
let count = searcher.search(&query, &Count).expect("exists search");
// Prevents optimizer from eliding the search
black_box(count);
}
fn build_index_with_json_subfields(num_docs: usize, num_subfields: usize) -> Index {
// Schema: single JSON field stored as FAST to support ExistsQuery.
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", TEXT | FAST);
let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema).expect("create index");
{
let mut index_writer = index
.writer_with_num_threads(1, 200_000_000)
.expect("writer");
for i in 0..num_docs {
let sub = i % num_subfields;
// Only one subpath set per document; rotate subpaths so that
// no single subpath is full, but the union covers all docs.
let v = json!({ format!("field_{sub}"): i as u64 });
index_writer
.add_document(doc!(json_field => v))
.expect("add_document");
}
index_writer.commit().expect("commit");
}
index
}

View File

@@ -48,7 +48,7 @@ impl BitPacker {
pub fn flush<TWrite: io::Write + ?Sized>(&mut self, output: &mut TWrite) -> io::Result<()> {
if self.mini_buffer_written > 0 {
let num_bytes = (self.mini_buffer_written + 7) / 8;
let num_bytes = self.mini_buffer_written.div_ceil(8);
let bytes = self.mini_buffer.to_le_bytes();
output.write_all(&bytes[..num_bytes])?;
self.mini_buffer_written = 0;
@@ -138,7 +138,7 @@ impl BitUnpacker {
// We use `usize` here to avoid overflow issues.
let end_bit_read = (end_idx as usize) * self.num_bits;
let end_byte_read = (end_bit_read + 7) / 8;
let end_byte_read = end_bit_read.div_ceil(8);
assert!(
end_byte_read <= data.len(),
"Requested index is out of bounds."

View File

@@ -140,10 +140,10 @@ impl BlockedBitpacker {
pub fn iter(&self) -> impl Iterator<Item = u64> + '_ {
// todo performance: we could decompress a whole block and cache it instead
let bitpacked_elems = self.offset_and_bits.len() * BLOCK_SIZE;
let iter = (0..bitpacked_elems)
(0..bitpacked_elems)
.map(move |idx| self.get(idx))
.chain(self.buffer.iter().cloned());
iter
.chain(self.buffer.iter().cloned())
}
}

View File

@@ -56,7 +56,7 @@ fn get_doc_ids_with_values<'a>(
ColumnIndex::Full => Box::new(doc_range),
ColumnIndex::Optional(optional_index) => Box::new(
optional_index
.iter_docs()
.iter_non_null_docs()
.map(move |row| row + doc_range.start),
),
ColumnIndex::Multivalued(multivalued_index) => match multivalued_index {
@@ -73,7 +73,7 @@ fn get_doc_ids_with_values<'a>(
MultiValueIndex::MultiValueIndexV2(multivalued_index) => Box::new(
multivalued_index
.optional_index
.iter_docs()
.iter_non_null_docs()
.map(move |row| row + doc_range.start),
),
},
@@ -105,10 +105,11 @@ fn get_num_values_iterator<'a>(
) -> Box<dyn Iterator<Item = u32> + 'a> {
match column_index {
ColumnIndex::Empty { .. } => Box::new(std::iter::empty()),
ColumnIndex::Full => Box::new(std::iter::repeat(1u32).take(num_docs as usize)),
ColumnIndex::Optional(optional_index) => {
Box::new(std::iter::repeat(1u32).take(optional_index.num_non_nulls() as usize))
}
ColumnIndex::Full => Box::new(std::iter::repeat_n(1u32, num_docs as usize)),
ColumnIndex::Optional(optional_index) => Box::new(std::iter::repeat_n(
1u32,
optional_index.num_non_nulls() as usize,
)),
ColumnIndex::Multivalued(multivalued_index) => Box::new(
multivalued_index
.get_start_index_column()
@@ -177,7 +178,7 @@ impl<'a> Iterable<RowId> for StackedOptionalIndex<'a> {
ColumnIndex::Full => Box::new(columnar_row_range),
ColumnIndex::Optional(optional_index) => Box::new(
optional_index
.iter_docs()
.iter_non_null_docs()
.map(move |row_id: RowId| columnar_row_range.start + row_id),
),
ColumnIndex::Multivalued(_) => {

View File

@@ -215,6 +215,32 @@ impl MultiValueIndex {
}
}
/// Returns an iterator over document ids that have at least one value.
pub fn iter_non_null_docs(&self) -> Box<dyn Iterator<Item = DocId> + '_> {
match self {
MultiValueIndex::MultiValueIndexV1(idx) => {
let mut doc: DocId = 0u32;
let num_docs = idx.num_docs();
Box::new(std::iter::from_fn(move || {
// This is not the most efficient way to do this, but it's legacy code.
while doc < num_docs {
let cur = doc;
doc += 1;
let start = idx.start_index_column.get_val(cur);
let end = idx.start_index_column.get_val(cur + 1);
if end > start {
return Some(cur);
}
}
None
}))
}
MultiValueIndex::MultiValueIndexV2(idx) => {
Box::new(idx.optional_index.iter_non_null_docs())
}
}
}
/// Converts a list of ranks (row ids of values) in a 1:n index to the corresponding list of
/// docids. Positions are converted inplace to docids.
///

View File

@@ -1,4 +1,4 @@
use std::io::{self, Write};
use std::io;
use std::sync::Arc;
mod set;
@@ -11,7 +11,7 @@ use set_block::{
};
use crate::iterable::Iterable;
use crate::{DocId, InvalidData, RowId};
use crate::{DocId, RowId};
/// The threshold for the number of elements after which we switch to dense block encoding.
///
@@ -88,7 +88,7 @@ pub struct OptionalIndex {
impl Iterable<u32> for &OptionalIndex {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
Box::new(self.iter_docs())
Box::new(self.iter_non_null_docs())
}
}
@@ -280,8 +280,9 @@ impl OptionalIndex {
self.num_non_null_docs
}
pub fn iter_docs(&self) -> impl Iterator<Item = RowId> + '_ {
// TODO optimize
pub fn iter_non_null_docs(&self) -> impl Iterator<Item = RowId> + '_ {
// TODO optimize. We could iterate over the blocks directly.
// We use the dense value ids and retrieve the doc ids via select.
let mut select_batch = self.select_cursor();
(0..self.num_non_null_docs).map(move |rank| select_batch.select(rank))
}
@@ -334,38 +335,6 @@ enum Block<'a> {
Sparse(SparseBlock<'a>),
}
#[derive(Debug, Copy, Clone)]
enum OptionalIndexCodec {
Dense = 0,
Sparse = 1,
}
impl OptionalIndexCodec {
fn to_code(self) -> u8 {
self as u8
}
fn try_from_code(code: u8) -> Result<Self, InvalidData> {
match code {
0 => Ok(Self::Dense),
1 => Ok(Self::Sparse),
_ => Err(InvalidData),
}
}
}
impl BinarySerializable for OptionalIndexCodec {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
writer.write_all(&[self.to_code()])
}
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let optional_codec_code = u8::deserialize(reader)?;
let optional_codec = Self::try_from_code(optional_codec_code)?;
Ok(optional_codec)
}
}
fn serialize_optional_index_block(block_els: &[u16], out: &mut impl io::Write) -> io::Result<()> {
let is_sparse = is_sparse(block_els.len() as u32);
if is_sparse {

View File

@@ -164,7 +164,11 @@ fn test_optional_index_large() {
fn test_optional_index_iter_aux(row_ids: &[RowId], num_rows: RowId) {
let optional_index = OptionalIndex::for_test(num_rows, row_ids);
assert_eq!(optional_index.num_docs(), num_rows);
assert!(optional_index.iter_docs().eq(row_ids.iter().copied()));
assert!(
optional_index
.iter_non_null_docs()
.eq(row_ids.iter().copied())
);
}
#[test]

View File

@@ -185,10 +185,10 @@ impl CompactSpaceBuilder {
let mut covered_space = Vec::with_capacity(self.blanks.len());
// beginning of the blanks
if let Some(first_blank_start) = self.blanks.first().map(RangeInclusive::start) {
if *first_blank_start != 0 {
covered_space.push(0..=first_blank_start - 1);
}
if let Some(first_blank_start) = self.blanks.first().map(RangeInclusive::start)
&& *first_blank_start != 0
{
covered_space.push(0..=first_blank_start - 1);
}
// Between the blanks
@@ -202,10 +202,10 @@ impl CompactSpaceBuilder {
covered_space.extend(between_blanks);
// end of the blanks
if let Some(last_blank_end) = self.blanks.last().map(RangeInclusive::end) {
if *last_blank_end != u128::MAX {
covered_space.push(last_blank_end + 1..=u128::MAX);
}
if let Some(last_blank_end) = self.blanks.last().map(RangeInclusive::end)
&& *last_blank_end != u128::MAX
{
covered_space.push(last_blank_end + 1..=u128::MAX);
}
if covered_space.is_empty() {

View File

@@ -105,7 +105,7 @@ impl ColumnCodecEstimator for BitpackedCodecEstimator {
fn estimate(&self, stats: &ColumnStats) -> Option<u64> {
let num_bits_per_value = num_bits(stats);
Some(stats.num_bytes() + (stats.num_rows as u64 * (num_bits_per_value as u64) + 7) / 8)
Some(stats.num_bytes() + (stats.num_rows as u64 * (num_bits_per_value as u64)).div_ceil(8))
}
fn serialize(

View File

@@ -117,7 +117,7 @@ impl ColumnCodecEstimator for LinearCodecEstimator {
Some(
stats.num_bytes()
+ linear_params.num_bytes()
+ (num_bits as u64 * stats.num_rows as u64 + 7) / 8,
+ (num_bits as u64 * stats.num_rows as u64).div_ceil(8),
)
}

View File

@@ -367,7 +367,7 @@ fn is_empty_after_merge(
ColumnIndex::Empty { .. } => true,
ColumnIndex::Full => alive_bitset.len() == 0,
ColumnIndex::Optional(optional_index) => {
for doc in optional_index.iter_docs() {
for doc in optional_index.iter_non_null_docs() {
if alive_bitset.contains(doc) {
return false;
}

View File

@@ -244,7 +244,7 @@ impl SymbolValue for UnorderedId {
fn compute_num_bytes_for_u64(val: u64) -> usize {
let msb = (64u32 - val.leading_zeros()) as usize;
(msb + 7) / 8
msb.div_ceil(8)
}
fn encode_zig_zag(n: i64) -> u64 {

View File

@@ -1,3 +1,5 @@
use std::str::FromStr;
use common::DateTime;
use crate::InvalidData;
@@ -9,6 +11,23 @@ pub enum NumericalValue {
F64(f64),
}
impl FromStr for NumericalValue {
type Err = ();
fn from_str(s: &str) -> Result<Self, ()> {
if let Ok(val_i64) = s.parse::<i64>() {
return Ok(val_i64.into());
}
if let Ok(val_u64) = s.parse::<u64>() {
return Ok(val_u64.into());
}
if let Ok(val_f64) = s.parse::<f64>() {
return Ok(NumericalValue::from(val_f64).normalize());
}
Err(())
}
}
impl NumericalValue {
pub fn numerical_type(&self) -> NumericalType {
match self {
@@ -26,7 +45,7 @@ impl NumericalValue {
if val <= i64::MAX as u64 {
NumericalValue::I64(val as i64)
} else {
NumericalValue::F64(val as f64)
NumericalValue::U64(val)
}
}
NumericalValue::I64(val) => NumericalValue::I64(val),
@@ -141,6 +160,7 @@ impl Coerce for DateTime {
#[cfg(test)]
mod tests {
use super::NumericalType;
use crate::NumericalValue;
#[test]
fn test_numerical_type_code() {
@@ -153,4 +173,58 @@ mod tests {
}
assert_eq!(num_numerical_type, 3);
}
#[test]
fn test_parse_numerical() {
assert_eq!(
"123".parse::<NumericalValue>().unwrap(),
NumericalValue::I64(123)
);
assert_eq!(
"18446744073709551615".parse::<NumericalValue>().unwrap(),
NumericalValue::U64(18446744073709551615u64)
);
assert_eq!(
"1.0".parse::<NumericalValue>().unwrap(),
NumericalValue::I64(1i64)
);
assert_eq!(
"1.1".parse::<NumericalValue>().unwrap(),
NumericalValue::F64(1.1f64)
);
assert_eq!(
"-1.0".parse::<NumericalValue>().unwrap(),
NumericalValue::I64(-1i64)
);
}
#[test]
fn test_normalize_numerical() {
assert_eq!(
NumericalValue::from(1u64).normalize(),
NumericalValue::I64(1i64),
);
let limit_val = i64::MAX as u64 + 1u64;
assert_eq!(
NumericalValue::from(limit_val).normalize(),
NumericalValue::U64(limit_val),
);
assert_eq!(
NumericalValue::from(-1i64).normalize(),
NumericalValue::I64(-1i64),
);
assert_eq!(
NumericalValue::from(-2.0f64).normalize(),
NumericalValue::I64(-2i64),
);
assert_eq!(
NumericalValue::from(-2.1f64).normalize(),
NumericalValue::F64(-2.1f64),
);
let large_float = 2.0f64.powf(70.0f64);
assert_eq!(
NumericalValue::from(large_float).normalize(),
NumericalValue::F64(large_float),
);
}
}

View File

@@ -183,7 +183,7 @@ pub struct BitSet {
}
fn num_buckets(max_val: u32) -> u32 {
(max_val + 63u32) / 64u32
max_val.div_ceil(64u32)
}
impl BitSet {

View File

@@ -29,6 +29,7 @@ impl BinarySerializable for VIntU128 {
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
#[allow(clippy::unbuffered_bytes)]
let mut bytes = reader.bytes();
let mut result = 0u128;
let mut shift = 0u64;
@@ -52,7 +53,7 @@ impl BinarySerializable for VIntU128 {
}
}
/// Wrapper over a `u64` that serializes as a variable int.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct VInt(pub u64);
@@ -196,6 +197,7 @@ impl BinarySerializable for VInt {
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
#[allow(clippy::unbuffered_bytes)]
let mut bytes = reader.bytes();
let mut result = 0u64;
let mut shift = 0u64;

View File

@@ -15,3 +15,5 @@ edition = "2024"
nom = "7"
serde = { version = "1.0.219", features = ["derive"] }
serde_json = "1.0.140"
ordered-float = "5.0.0"
fnv = "1.0.7"

View File

@@ -117,6 +117,22 @@ where F: nom::Parser<I, (O, ErrorList), Infallible> {
}
}
pub(crate) fn terminated_infallible<I, O1, O2, F, G>(
mut first: F,
mut second: G,
) -> impl FnMut(I) -> JResult<I, O1>
where
F: nom::Parser<I, (O1, ErrorList), Infallible>,
G: nom::Parser<I, (O2, ErrorList), Infallible>,
{
move |input: I| {
let (input, (o1, mut err)) = first.parse(input)?;
let (input, (_, mut err2)) = second.parse(input)?;
err.append(&mut err2);
Ok((input, (o1, err)))
}
}
pub(crate) fn delimited_infallible<I, O1, O2, O3, F, G, H>(
mut first: F,
mut second: G,

View File

@@ -31,7 +31,17 @@ pub fn parse_query_lenient(query: &str) -> (UserInputAst, Vec<LenientError>) {
#[cfg(test)]
mod tests {
use crate::{parse_query, parse_query_lenient};
use crate::{UserInputAst, parse_query, parse_query_lenient};
#[test]
fn test_deduplication() {
let ast: UserInputAst = parse_query("a a").unwrap();
let json = serde_json::to_string(&ast).unwrap();
assert_eq!(
json,
r#"{"type":"bool","clauses":[[null,{"type":"literal","field_name":null,"phrase":"a","delimiter":"none","slop":0,"prefix":false}]]}"#
);
}
#[test]
fn test_parse_query_serialization() {

View File

@@ -1,6 +1,7 @@
use std::borrow::Cow;
use std::iter::once;
use fnv::FnvHashSet;
use nom::IResult;
use nom::branch::alt;
use nom::bytes::complete::tag;
@@ -68,7 +69,7 @@ fn interpret_escape(source: &str) -> String {
/// Consume a word outside of any context.
// TODO should support escape sequences
fn word(inp: &str) -> IResult<&str, Cow<str>> {
fn word(inp: &str) -> IResult<&str, Cow<'_, str>> {
map_res(
recognize(tuple((
alt((
@@ -305,15 +306,14 @@ fn term_group_infallible(inp: &str) -> JResult<&str, UserInputAst> {
let (inp, (field_name, _, _, _)) =
tuple((field_name, multispace0, char('('), multispace0))(inp).expect("precondition failed");
let res = delimited_infallible(
delimited_infallible(
nothing,
map(ast_infallible, |(mut ast, errors)| {
ast.set_default_field(field_name.to_string());
(ast, errors)
}),
opt_i_err(char(')'), "expected ')'"),
)(inp);
res
)(inp)
}
fn exists(inp: &str) -> IResult<&str, UserInputLeaf> {
@@ -367,7 +367,10 @@ fn literal(inp: &str) -> IResult<&str, UserInputAst> {
// something (a field name) got parsed before
alt((
map(
tuple((opt(field_name), alt((range, set, exists, term_or_phrase)))),
tuple((
opt(field_name),
alt((range, set, exists, regex, term_or_phrase)),
)),
|(field_name, leaf): (Option<String>, UserInputLeaf)| leaf.set_field(field_name).into(),
),
term_group,
@@ -389,6 +392,10 @@ fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>>
value((), peek(one_of("{[><"))),
map(range_infallible, |(range, errs)| (Some(range), errs)),
),
(
value((), peek(one_of("/"))),
map(regex_infallible, |(regex, errs)| (Some(regex), errs)),
),
),
delimited_infallible(space0_infallible, term_or_phrase_infallible, nothing),
),
@@ -689,6 +696,61 @@ fn set_infallible(mut inp: &str) -> JResult<&str, UserInputLeaf> {
}
}
fn regex(inp: &str) -> IResult<&str, UserInputLeaf> {
map(
terminated(
delimited(
char('/'),
many1(alt((preceded(char('\\'), char('/')), none_of("/")))),
char('/'),
),
peek(alt((multispace1, eof))),
),
|elements| UserInputLeaf::Regex {
field: None,
pattern: elements.into_iter().collect::<String>(),
},
)(inp)
}
fn regex_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
match terminated_infallible(
delimited_infallible(
opt_i_err(char('/'), "missing delimiter /"),
opt_i(many1(alt((preceded(char('\\'), char('/')), none_of("/"))))),
opt_i_err(char('/'), "missing delimiter /"),
),
opt_i_err(
peek(alt((multispace1, eof))),
"expected whitespace or end of input",
),
)(inp)
{
Ok((rest, (elements_part, errors))) => {
let pattern = match elements_part {
Some(elements_part) => elements_part.into_iter().collect(),
None => String::new(),
};
let res = UserInputLeaf::Regex {
field: None,
pattern,
};
Ok((rest, (res, errors)))
}
Err(e) => {
let errs = vec![LenientErrorInternal {
pos: inp.len(),
message: e.to_string(),
}];
let res = UserInputLeaf::Regex {
field: None,
pattern: String::new(),
};
Ok((inp, (res, errs)))
}
}
}
fn negate(expr: UserInputAst) -> UserInputAst {
expr.unary(Occur::MustNot)
}
@@ -753,7 +815,7 @@ fn boosted_leaf(inp: &str) -> IResult<&str, UserInputAst> {
tuple((leaf, fallible(boost))),
|(leaf, boost_opt)| match boost_opt {
Some(boost) if (boost - 1.0).abs() > f64::EPSILON => {
UserInputAst::Boost(Box::new(leaf), boost)
UserInputAst::Boost(Box::new(leaf), boost.into())
}
_ => leaf,
},
@@ -765,7 +827,7 @@ fn boosted_leaf_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> {
tuple_infallible((leaf_infallible, boost)),
|((leaf, boost_opt), error)| match boost_opt {
Some(boost) if (boost - 1.0).abs() > f64::EPSILON => (
leaf.map(|leaf| UserInputAst::Boost(Box::new(leaf), boost)),
leaf.map(|leaf| UserInputAst::Boost(Box::new(leaf), boost.into())),
error,
),
_ => (leaf, error),
@@ -1016,12 +1078,25 @@ pub fn parse_to_ast_lenient(query_str: &str) -> (UserInputAst, Vec<LenientError>
(rewrite_ast(res), errors)
}
/// Removes unnecessary children clauses in AST
///
/// Motivated by [issue #1433](https://github.com/quickwit-oss/tantivy/issues/1433)
fn rewrite_ast(mut input: UserInputAst) -> UserInputAst {
if let UserInputAst::Clause(terms) = &mut input {
for term in terms {
if let UserInputAst::Clause(sub_clauses) = &mut input {
// call rewrite_ast recursively on children clauses if applicable
let mut new_clauses = Vec::with_capacity(sub_clauses.len());
for (occur, clause) in sub_clauses.drain(..) {
let rewritten_clause = rewrite_ast(clause);
new_clauses.push((occur, rewritten_clause));
}
*sub_clauses = new_clauses;
// remove duplicate child clauses
// e.g. (+a +b) OR (+c +d) OR (+a +b) => (+a +b) OR (+c +d)
let mut seen = FnvHashSet::default();
sub_clauses.retain(|term| seen.insert(term.clone()));
// Removes unnecessary children clauses in AST
//
// Motivated by [issue #1433](https://github.com/quickwit-oss/tantivy/issues/1433)
for term in sub_clauses {
rewrite_ast_clause(term);
}
}
@@ -1694,6 +1769,63 @@ mod test {
test_is_parse_err(r#"!bc:def"#, "!bc:def");
}
#[test]
fn test_regex_parser() {
let r = parse_to_ast(r#"a:/joh?n(ath[oa]n)/"#);
assert!(r.is_ok(), "Failed to parse custom query: {r:?}");
let (_, input) = r.unwrap();
match input {
UserInputAst::Leaf(leaf) => match leaf.as_ref() {
UserInputLeaf::Regex { field, pattern } => {
assert_eq!(field, &Some("a".to_string()));
assert_eq!(pattern, "joh?n(ath[oa]n)");
}
_ => panic!("Expected a regex leaf, got {leaf:?}"),
},
_ => panic!("Expected a leaf"),
}
let r = parse_to_ast(r#"a:/\\/cgi-bin\\/luci.*/"#);
assert!(r.is_ok(), "Failed to parse custom query: {r:?}");
let (_, input) = r.unwrap();
match input {
UserInputAst::Leaf(leaf) => match leaf.as_ref() {
UserInputLeaf::Regex { field, pattern } => {
assert_eq!(field, &Some("a".to_string()));
assert_eq!(pattern, "\\/cgi-bin\\/luci.*");
}
_ => panic!("Expected a regex leaf, got {leaf:?}"),
},
_ => panic!("Expected a leaf"),
}
}
#[test]
fn test_regex_parser_lenient() {
let literal = |query| literal_infallible(query).unwrap().1;
let (res, errs) = literal(r#"a:/joh?n(ath[oa]n)/"#);
let expected = UserInputLeaf::Regex {
field: Some("a".to_string()),
pattern: "joh?n(ath[oa]n)".to_string(),
}
.into();
assert_eq!(res.unwrap(), expected);
assert!(errs.is_empty(), "Expected no errors, got: {errs:?}");
let (res, errs) = literal("title:/joh?n(ath[oa]n)");
let expected = UserInputLeaf::Regex {
field: Some("title".to_string()),
pattern: "joh?n(ath[oa]n)".to_string(),
}
.into();
assert_eq!(res.unwrap(), expected);
assert_eq!(errs.len(), 1, "Expected 1 error, got: {errs:?}");
assert_eq!(
errs[0].message, "missing delimiter /",
"Unexpected error message",
);
}
#[test]
fn test_space_before_value() {
test_parse_query_to_ast_helper("field : a", r#""field":a"#);

View File

@@ -5,7 +5,7 @@ use serde::Serialize;
use crate::Occur;
#[derive(PartialEq, Clone, Serialize)]
#[derive(PartialEq, Eq, Hash, Clone, Serialize)]
#[serde(tag = "type")]
#[serde(rename_all = "snake_case")]
pub enum UserInputLeaf {
@@ -23,6 +23,10 @@ pub enum UserInputLeaf {
Exists {
field: String,
},
Regex {
field: Option<String>,
pattern: String,
},
}
impl UserInputLeaf {
@@ -46,6 +50,7 @@ impl UserInputLeaf {
UserInputLeaf::Exists { field: _ } => UserInputLeaf::Exists {
field: field.expect("Exist query without a field isn't allowed"),
},
UserInputLeaf::Regex { field: _, pattern } => UserInputLeaf::Regex { field, pattern },
}
}
@@ -103,11 +108,19 @@ impl Debug for UserInputLeaf {
UserInputLeaf::Exists { field } => {
write!(formatter, "$exists(\"{field}\")")
}
UserInputLeaf::Regex { field, pattern } => {
if let Some(field) = field {
// TODO properly escape field (in case of \")
write!(formatter, "\"{field}\":")?;
}
// TODO properly escape pattern (in case of \")
write!(formatter, "/{pattern}/")
}
}
}
}
#[derive(Copy, Clone, Eq, PartialEq, Debug, Serialize)]
#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum Delimiter {
SingleQuotes,
@@ -115,7 +128,7 @@ pub enum Delimiter {
None,
}
#[derive(PartialEq, Clone, Serialize)]
#[derive(PartialEq, Eq, Hash, Clone, Serialize)]
#[serde(rename_all = "snake_case")]
pub struct UserInputLiteral {
pub field_name: Option<String>,
@@ -154,7 +167,7 @@ impl fmt::Debug for UserInputLiteral {
}
}
#[derive(PartialEq, Debug, Clone, Serialize)]
#[derive(PartialEq, Eq, Hash, Debug, Clone, Serialize)]
#[serde(tag = "type", content = "value")]
#[serde(rename_all = "snake_case")]
pub enum UserInputBound {
@@ -191,11 +204,11 @@ impl UserInputBound {
}
}
#[derive(PartialEq, Clone, Serialize)]
#[derive(PartialEq, Eq, Hash, Clone, Serialize)]
#[serde(into = "UserInputAstSerde")]
pub enum UserInputAst {
Clause(Vec<(Option<Occur>, UserInputAst)>),
Boost(Box<UserInputAst>, f64),
Boost(Box<UserInputAst>, ordered_float::OrderedFloat<f64>),
Leaf(Box<UserInputLeaf>),
}
@@ -217,9 +230,10 @@ impl From<UserInputAst> for UserInputAstSerde {
fn from(ast: UserInputAst) -> Self {
match ast {
UserInputAst::Clause(clause) => UserInputAstSerde::Bool { clauses: clause },
UserInputAst::Boost(underlying, boost) => {
UserInputAstSerde::Boost { underlying, boost }
}
UserInputAst::Boost(underlying, boost) => UserInputAstSerde::Boost {
underlying,
boost: boost.into_inner(),
},
UserInputAst::Leaf(leaf) => UserInputAstSerde::Leaf(leaf),
}
}
@@ -378,7 +392,7 @@ mod tests {
#[test]
fn test_boost_serialization() {
let inner_ast = UserInputAst::Leaf(Box::new(UserInputLeaf::All));
let boost_ast = UserInputAst::Boost(Box::new(inner_ast), 2.5);
let boost_ast = UserInputAst::Boost(Box::new(inner_ast), 2.5.into());
let json = serde_json::to_string(&boost_ast).unwrap();
assert_eq!(
json,
@@ -405,7 +419,7 @@ mod tests {
}))),
),
])),
2.5,
2.5.into(),
);
let json = serde_json::to_string(&boost_ast).unwrap();
assert_eq!(

View File

@@ -155,7 +155,7 @@ fn test_aggregation_flushing(
searcher.search(&AllQuery, &collector).unwrap()
};
let res: Value = serde_json::to_value(&agg_res)?;
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
assert_eq!(res["bucketsL1"]["buckets"][0]["doc_count"], 3);
assert_eq!(
@@ -270,7 +270,7 @@ fn test_aggregation_level1_simple() -> crate::Result<()> {
let searcher = reader.searcher();
let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();
let res: Value = serde_json::to_value(&agg_res)?;
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
assert_eq!(res["average"]["value"], 12.142857142857142);
assert_eq!(
res["range"]["buckets"],
@@ -304,29 +304,6 @@ fn test_aggregation_level1_simple() -> crate::Result<()> {
Ok(())
}
#[test]
fn test_aggregation_term_truncate_sum_other_doc_count() {
let index = get_test_index_2_segments(true).unwrap();
let reader = index.reader().unwrap();
let count_per_text: Aggregation = serde_json::from_value(json!({ "terms": { "field": "text", "size": 1 } })).unwrap();
let aggs: Aggregations = vec![("group_by_term_truncate".to_string(), count_per_text)]
.into_iter()
.collect();
let collector = get_collector(aggs);
let searcher = reader.searcher();
let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
let res: Value = serde_json::to_value(&agg_res).unwrap();
assert_eq!(res, serde_json::json!({
"group_by_term_truncate": {
"buckets": [{ "doc_count": 7, "key": "cool" }],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 2,
},
}));
}
#[test]
fn test_aggregation_level1() -> crate::Result<()> {
let index = get_test_index_2_segments(true)?;
@@ -365,7 +342,7 @@ fn test_aggregation_level1() -> crate::Result<()> {
let searcher = reader.searcher();
let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();
let res: Value = serde_json::to_value(&agg_res)?;
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
assert_eq!(res["average"]["value"], 12.142857142857142);
assert_eq!(res["average_f64"]["value"], 12.214285714285714);
assert_eq!(res["average_i64"]["value"], 12.142857142857142);
@@ -420,7 +397,7 @@ fn test_aggregation_level2(
IndexRecordOption::Basic,
);
let elasticsearch_compatible_json_req = serde_json::json!(
let elasticsearch_compatible_json_req = r#"
{
"rangef64": {
"range": {
@@ -473,8 +450,9 @@ fn test_aggregation_level2(
"term_agg": { "terms": { "field": "text" } }
}
}
});
let agg_req: Aggregations = serde_json::from_value(elasticsearch_compatible_json_req).unwrap();
}
"#;
let agg_req: Aggregations = serde_json::from_str(elasticsearch_compatible_json_req).unwrap();
let agg_res: AggregationResults = if use_distributed_collector {
let collector =
@@ -491,7 +469,7 @@ fn test_aggregation_level2(
searcher.search(&term_query, &collector).unwrap()
};
let res: Value = serde_json::to_value(agg_res)?;
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
assert_eq!(res["range"]["buckets"][1]["key"], "3-7");
assert_eq!(res["range"]["buckets"][1]["doc_count"], 2u64);

View File

@@ -301,7 +301,7 @@ impl SegmentAggregationCollector for SegmentHistogramCollector {
let bounds = self.bounds;
let interval = self.interval;
let offset = self.offset;
let get_bucket_pos = |val| (get_bucket_pos_f64(val, interval, offset) as i64);
let get_bucket_pos = |val| get_bucket_pos_f64(val, interval, offset) as i64;
bucket_agg_accessor
.column_block_accessor

View File

@@ -484,7 +484,6 @@ impl FacetCounts {
#[cfg(test)]
mod tests {
use std::collections::BTreeSet;
use std::iter;
use columnar::Dictionary;
use rand::distributions::Uniform;

View File

@@ -1,3 +1,4 @@
use columnar::NumericalValue;
use common::json_path_writer::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP};
use common::{replace_in_place, JsonPathWriter};
use rustc_hash::FxHashMap;
@@ -152,7 +153,7 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
if let Ok(i64_val) = val.try_into() {
term_buffer.append_type_and_fast_value::<i64>(i64_val);
} else {
term_buffer.append_type_and_fast_value(val);
term_buffer.append_type_and_fast_value::<u64>(val);
}
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
@@ -166,12 +167,30 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
ReferenceValueLeaf::F64(val) => {
if !val.is_finite() {
return;
};
set_path_id(
term_buffer,
ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()),
);
term_buffer.append_type_and_fast_value(val);
// Normalize here is important.
// In the inverted index, we coerce all numerical values to their canonical
// representation.
//
// (We do the same thing on the query side)
match NumericalValue::F64(val).normalize() {
NumericalValue::I64(val_i64) => {
term_buffer.append_type_and_fast_value::<i64>(val_i64);
}
NumericalValue::U64(val_u64) => {
term_buffer.append_type_and_fast_value::<u64>(val_u64);
}
NumericalValue::F64(val_f64) => {
term_buffer.append_type_and_fast_value::<f64>(val_f64);
}
}
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
ReferenceValueLeaf::Bool(val) => {
@@ -241,8 +260,8 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
///
/// The term must be json + JSON path.
pub fn convert_to_fast_value_and_append_to_json_term(
mut term: Term,
phrase: &str,
term: &Term,
text: &str,
truncate_date_for_search: bool,
) -> Option<Term> {
assert_eq!(
@@ -254,31 +273,50 @@ pub fn convert_to_fast_value_and_append_to_json_term(
0,
"JSON value bytes should be empty"
);
if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) {
let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC));
if truncate_date_for_search {
dt = dt.truncate(DATE_TIME_PRECISION_INDEXED);
try_convert_to_datetime_and_append_to_json_term(term, text, truncate_date_for_search)
.or_else(|| try_convert_to_number_and_append_to_json_term(term, text))
.or_else(|| try_convert_to_bool_and_append_to_json_term_typed(term, text))
}
fn try_convert_to_datetime_and_append_to_json_term(
term: &Term,
text: &str,
truncate_date_for_search: bool,
) -> Option<Term> {
let dt = OffsetDateTime::parse(text, &Rfc3339).ok()?;
let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC));
if truncate_date_for_search {
dt = dt.truncate(DATE_TIME_PRECISION_INDEXED);
}
let mut term_clone = term.clone();
term_clone.append_type_and_fast_value(dt);
Some(term_clone)
}
fn try_convert_to_number_and_append_to_json_term(term: &Term, text: &str) -> Option<Term> {
let numerical_value: NumericalValue = str::parse::<NumericalValue>(text).ok()?;
let mut term_clone = term.clone();
// Parse actually returns normalized values already today, but let's
// not rely on that hidden contract.
match numerical_value.normalize() {
NumericalValue::I64(i64_value) => {
term_clone.append_type_and_fast_value::<i64>(i64_value);
}
NumericalValue::U64(u64_value) => {
term_clone.append_type_and_fast_value::<u64>(u64_value);
}
NumericalValue::F64(f64_value) => {
term_clone.append_type_and_fast_value::<f64>(f64_value);
}
term.append_type_and_fast_value(dt);
return Some(term);
}
if let Ok(i64_val) = str::parse::<i64>(phrase) {
term.append_type_and_fast_value(i64_val);
return Some(term);
}
if let Ok(u64_val) = str::parse::<u64>(phrase) {
term.append_type_and_fast_value(u64_val);
return Some(term);
}
if let Ok(f64_val) = str::parse::<f64>(phrase) {
term.append_type_and_fast_value(f64_val);
return Some(term);
}
if let Ok(bool_val) = str::parse::<bool>(phrase) {
term.append_type_and_fast_value(bool_val);
return Some(term);
}
None
Some(term_clone)
}
fn try_convert_to_bool_and_append_to_json_term_typed(term: &Term, text: &str) -> Option<Term> {
let val = str::parse::<bool>(text).ok()?;
let mut term_clone = term.clone();
term_clone.append_type_and_fast_value(val);
Some(term_clone)
}
/// Splits a json path supplied to the query parser in such a way that

View File

@@ -484,10 +484,8 @@ impl Directory for MmapDirectory {
.map_err(LockError::wrap_io_error)?;
if lock.is_blocking {
file.lock_exclusive().map_err(LockError::wrap_io_error)?;
} else {
if !file.try_lock_exclusive().map_err(|_| LockError::LockBusy)? {
return Err(LockError::LockBusy);
}
} else if !file.try_lock_exclusive().map_err(|_| LockError::LockBusy)? {
return Err(LockError::LockBusy);
}
// dropping the file handle will release the lock.
Ok(DirectoryLock::from(Box::new(ReleaseLockFile {

View File

@@ -146,7 +146,7 @@ impl InvertedIndexReader {
positions_size: ByteCount::default(),
num_terms: 0u64,
};
field_space.record(&term_info);
field_space.record(term_info);
// We include the json type and the json end of path to make sure the prefix check
// is meaningful.

View File

@@ -615,7 +615,7 @@ impl<D: Document> IndexWriter<D> {
/// It is also possible to add a payload to the `commit`
/// using this API.
/// See [`PreparedCommit::set_payload()`].
pub fn prepare_commit(&mut self) -> crate::Result<PreparedCommit<D>> {
pub fn prepare_commit(&mut self) -> crate::Result<PreparedCommit<'_, D>> {
// Here, because we join all of the worker threads,
// all of the segment update for this commit have been
// sent.

View File

@@ -55,7 +55,7 @@
//! // between indexing threads.
//! let mut index_writer: IndexWriter = index.writer(100_000_000)?;
//!
//! // Let's index one documents!
//! // Let's index a document!
//! index_writer.add_document(doc!(
//! title => "The Old Man and the Sea",
//! body => "He was an old man who fished alone in a skiff in \
@@ -370,6 +370,8 @@ macro_rules! fail_point {
/// Common test utilities.
#[cfg(test)]
pub mod tests {
use std::collections::BTreeMap;
use common::{BinarySerializable, FixedSize};
use query_grammar::{UserInputAst, UserInputLeaf, UserInputLiteral};
use rand::distributions::{Bernoulli, Uniform};
@@ -382,7 +384,7 @@ pub mod tests {
use crate::index::SegmentReader;
use crate::merge_policy::NoMergePolicy;
use crate::postings::Postings;
use crate::query::BooleanQuery;
use crate::query::{BooleanQuery, QueryParser};
use crate::schema::*;
use crate::{DateTime, DocAddress, Index, IndexWriter, ReloadPolicy};
@@ -1223,4 +1225,49 @@ pub mod tests {
);
assert_eq!(dt_from_ts_nanos.to_hms_micro(), offset_dt.to_hms_micro());
}
#[test]
fn test_json_number_ambiguity() {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("number", crate::schema::TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
{
let mut doc = TantivyDocument::new();
let mut obj = BTreeMap::default();
obj.insert("key".to_string(), OwnedValue::I64(1i64));
doc.add_object(json_field, obj);
index_writer.add_document(doc).unwrap();
}
{
let mut doc = TantivyDocument::new();
let mut obj = BTreeMap::default();
obj.insert("key".to_string(), OwnedValue::U64(1u64));
doc.add_object(json_field, obj);
index_writer.add_document(doc).unwrap();
}
{
let mut doc = TantivyDocument::new();
let mut obj = BTreeMap::default();
obj.insert("key".to_string(), OwnedValue::F64(1.0f64));
doc.add_object(json_field, obj);
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
assert_eq!(searcher.num_docs(), 3);
{
let parser = QueryParser::for_index(&index, vec![]);
let query = parser.parse_query("number.key:1").unwrap();
let count = searcher.search(&query, &crate::collector::Count).unwrap();
assert_eq!(count, 3);
}
{
let parser = QueryParser::for_index(&index, vec![]);
let query = parser.parse_query("number.key:1.0").unwrap();
let count = searcher.search(&query, &crate::collector::Count).unwrap();
assert_eq!(count, 3);
}
}
}

View File

@@ -227,19 +227,6 @@ impl BlockSegmentPostings {
self.doc_decoder.output_array()
}
/// Returns a full block, regardless of whether the block is complete or incomplete (
/// as it happens for the last block of the posting list).
///
/// In the latter case, the block is guaranteed to be padded with the sentinel value:
/// `TERMINATED`. The array is also guaranteed to be aligned on 16 bytes = 128 bits.
///
/// This method is useful to run SSE2 linear search.
#[inline]
pub(crate) fn full_block(&self) -> &[DocId; COMPRESSION_BLOCK_SIZE] {
debug_assert!(self.block_is_loaded());
self.doc_decoder.full_output()
}
/// Return the document at index `idx` of the block.
#[inline]
pub fn doc(&self, idx: usize) -> u32 {
@@ -275,22 +262,36 @@ impl BlockSegmentPostings {
///
/// If all docs are smaller than target, the block loaded may be empty,
/// or be the last, incomplete VInt block.
pub fn seek(&mut self, target_doc: DocId) {
self.shallow_seek(target_doc);
pub fn seek(&mut self, target_doc: DocId) -> usize {
// Move to the block that might contain our document.
self.seek_block(target_doc);
self.load_block();
// At this point we are on the block that might contain our document.
let doc = self.doc_decoder.seek_within_block(target_doc);
// The last block is not full and padded with TERMINATED,
// so we are guaranteed to have at least one value (real or padding)
// that is >= target_doc.
debug_assert!(doc < COMPRESSION_BLOCK_SIZE);
// `doc` is now the first element >= `target_doc`.
// If all docs are smaller than target, the current block is incomplete and padded
// with TERMINATED. After the search, the cursor points to the first TERMINATED.
doc
}
pub(crate) fn position_offset(&self) -> u64 {
self.skip_reader.position_offset()
}
/// Dangerous API! This calls seek on the skip list,
/// Dangerous API! This seeks the next block on the skip list,
/// but does not `.load_block()` afterwards.
///
/// `.load_block()` needs to be called manually afterwards.
/// If all docs are smaller than target, the block loaded may be empty,
/// or be the last, incomplete VInt block.
pub(crate) fn shallow_seek(&mut self, target_doc: DocId) {
pub(crate) fn seek_block(&mut self, target_doc: DocId) {
if self.skip_reader.seek(target_doc) {
self.block_max_score_cache = None;
self.block_loaded = false;

View File

@@ -151,9 +151,11 @@ impl BlockDecoder {
&self.output[..self.output_len]
}
/// Return in-block index of first value >= `target`.
/// Uses the padded buffer to enable branchless search.
#[inline]
pub(crate) fn full_output(&self) -> &[u32; COMPRESSION_BLOCK_SIZE] {
&self.output
pub(crate) fn seek_within_block(&self, target: u32) -> usize {
crate::postings::branchless_binary_search(&self.output, target)
}
#[inline]

View File

@@ -4,7 +4,7 @@ use crate::docset::DocSet;
use crate::fastfield::AliveBitSet;
use crate::positions::PositionReader;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::postings::{branchless_binary_search, BlockSegmentPostings, Postings};
use crate::postings::{BlockSegmentPostings, Postings};
use crate::{DocId, TERMINATED};
/// `SegmentPostings` represents the inverted list or postings associated with
@@ -175,26 +175,11 @@ impl DocSet for SegmentPostings {
return self.doc();
}
self.block_cursor.seek(target);
// At this point we are on the block, that might contain our document.
let output = self.block_cursor.full_block();
self.cur = branchless_binary_search(output, target);
// The last block is not full and padded with the value TERMINATED,
// so that we are guaranteed to have at least one doc in the block (a real one or the padding)
// that is greater or equal to the target.
debug_assert!(self.cur < COMPRESSION_BLOCK_SIZE);
// `doc` is now the first element >= `target`
// If all docs are smaller than target, the current block should be incomplete and padded
// with the value `TERMINATED`.
//
// After the search, the cursor should point to the first value of TERMINATED.
let doc = output[self.cur];
// Delegate block-local search to BlockSegmentPostings::seek, which returns
// the in-block index of the first doc >= target.
self.cur = self.block_cursor.seek(target);
let doc = self.doc();
debug_assert!(doc >= target);
debug_assert_eq!(doc, self.doc());
doc
}

View File

@@ -75,7 +75,7 @@ impl InvertedIndexSerializer {
field: Field,
total_num_tokens: u64,
fieldnorm_reader: Option<FieldNormReader>,
) -> io::Result<FieldSerializer> {
) -> io::Result<FieldSerializer<'_>> {
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
let term_dictionary_write = self.terms_write.for_field(field);
let postings_write = self.postings_write.for_field(field);
@@ -126,7 +126,7 @@ impl<'a> FieldSerializer<'a> {
let term_dictionary_builder = TermDictionaryBuilder::create(term_dictionary_write)?;
let average_fieldnorm = fieldnorm_reader
.as_ref()
.map(|ff_reader| (total_num_tokens as Score / ff_reader.num_docs() as Score))
.map(|ff_reader| total_num_tokens as Score / ff_reader.num_docs() as Score)
.unwrap_or(0.0);
let postings_serializer = PostingsSerializer::new(
postings_write,

View File

@@ -1,5 +1,3 @@
use serde::{Deserialize, Serialize};
use crate::fieldnorm::FieldNormReader;
use crate::query::Explanation;
use crate::schema::Field;
@@ -68,12 +66,6 @@ fn compute_tf_cache(average_fieldnorm: Score) -> [Score; 256] {
cache
}
#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)]
pub struct Bm25Params {
pub idf: Score,
pub avg_fieldnorm: Score,
}
/// A struct used for computing BM25 scores.
#[derive(Clone)]
pub struct Bm25Weight {

View File

@@ -167,7 +167,7 @@ pub fn block_wand(
let block_max_score_upperbound: Score = scorers[..pivot_len]
.iter_mut()
.map(|scorer| {
scorer.shallow_seek(pivot_doc);
scorer.seek_block(pivot_doc);
scorer.block_max_score()
})
.sum();
@@ -234,7 +234,7 @@ pub fn block_wand_single_scorer(
return;
}
doc = last_doc_in_block + 1;
scorer.shallow_seek(doc);
scorer.seek_block(doc);
}
// Seek will effectively load that block.
doc = scorer.seek(doc);
@@ -256,7 +256,7 @@ pub fn block_wand_single_scorer(
}
}
doc += 1;
scorer.shallow_seek(doc);
scorer.seek_block(doc);
}
}
@@ -302,7 +302,6 @@ fn is_sorted<I: Iterator<Item = DocId>>(mut it: I) -> bool {
mod tests {
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use std::iter;
use proptest::prelude::*;

View File

@@ -1,12 +1,15 @@
use core::fmt::Debug;
use columnar::{ColumnIndex, DynamicColumn};
use common::BitSet;
use super::{ConstScorer, EmptyScorer};
use crate::docset::{DocSet, TERMINATED};
use crate::index::SegmentReader;
use crate::query::all_query::AllScorer;
use crate::query::boost_query::BoostScorer;
use crate::query::explanation::does_not_match;
use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight};
use crate::query::{BitSetDocSet, EnableScoring, Explanation, Query, Scorer, Weight};
use crate::schema::Type;
use crate::{DocId, Score, TantivyError};
@@ -113,13 +116,49 @@ impl Weight for ExistsWeight {
non_empty_columns.push(column)
}
}
// TODO: we can optimize more here since in most cases we will have only one index
if !non_empty_columns.is_empty() {
let docset = ExistsDocSet::new(non_empty_columns, reader.max_doc());
Ok(Box::new(ConstScorer::new(docset, boost)))
} else {
Ok(Box::new(EmptyScorer))
if non_empty_columns.is_empty() {
return Ok(Box::new(EmptyScorer));
}
// If any column is full, all docs match.
let max_doc = reader.max_doc();
if non_empty_columns
.iter()
.any(|col| matches!(col.column_index(), ColumnIndex::Full))
{
let all_scorer = AllScorer::new(max_doc);
return Ok(Box::new(BoostScorer::new(all_scorer, boost)));
}
// If we have only a few dynamic columns, use ExistsDocSet
// NOTE: A lower number may be better for very sparse columns
if non_empty_columns.len() < 4 {
let docset = ExistsDocSet::new(non_empty_columns, reader.max_doc());
return Ok(Box::new(ConstScorer::new(docset, boost)));
}
// If we have many dynamic columns, precompute a bitset of matching docs
let mut doc_bitset = BitSet::with_max_value(max_doc);
for column in &non_empty_columns {
match column.column_index() {
ColumnIndex::Empty { .. } => {}
ColumnIndex::Full => {
// Handled by AllScorer return above.
}
ColumnIndex::Optional(optional_index) => {
for doc in optional_index.iter_non_null_docs() {
doc_bitset.insert(doc);
}
}
ColumnIndex::Multivalued(multi_idx) => {
for doc in multi_idx.iter_non_null_docs() {
doc_bitset.insert(doc);
}
}
}
}
let docset = BitSetDocSet::from(doc_bitset);
Ok(Box::new(ConstScorer::new(docset, boost)))
}
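// Back-of-envelope for the cutoff above (assumed numbers): with 100_000
// columns over 1_000_000 docs, the per-doc scan in ExistsDocSet is on the
// order of 10^11 column probes, while the bitset path costs one 1M-bit
// BitSet (~125 KB) plus a single pass over each column's non-null docs —
// consistent with the large wins at high subfield counts in the
// commit-message benchmark.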
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
@@ -294,6 +333,43 @@ mod tests {
Ok(())
}
#[test]
fn test_exists_query_json_union_no_single_full_subpath() -> crate::Result<()> {
// Build docs where no single subpath exists for all docs, but the union does.
let mut schema_builder = Schema::builder();
let json = schema_builder.add_json_field("json", TEXT | FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
for i in 0u64..100u64 {
if i % 2 == 0 {
// only subpath `a`
index_writer.add_document(doc!(json => json!({"a": i})))?;
} else {
// only subpath `b`
index_writer.add_document(doc!(json => json!({"b": i})))?;
}
}
index_writer.commit()?;
}
let reader = index.reader()?;
let searcher = reader.searcher();
// No single subpath is full
assert_eq!(count_existing_fields(&searcher, "json.a", false)?, 50);
assert_eq!(count_existing_fields(&searcher, "json.b", false)?, 50);
// Root exists query with subpaths disabled matches nothing
assert_eq!(count_existing_fields(&searcher, "json", false)?, 0);
// Root exists with subpaths enabled should match all docs via union
assert_eq!(count_existing_fields(&searcher, "json", true)?, 100);
Ok(())
}
#[test]
fn test_exists_query_misc_supported_types() -> crate::Result<()> {
let mut schema_builder = Schema::builder();

View File

@@ -104,7 +104,7 @@ mod tests {
let query = query_parser.parse_query("a a a a a").unwrap();
let mut terms = Vec::new();
query.query_terms(&mut |term, pos| terms.push((term, pos)));
assert_eq!(vec![(&term_a, false); 5], terms);
assert_eq!(vec![(&term_a, false); 1], terms);
}
{
let query = query_parser.parse_query("a -b").unwrap();

View File

@@ -1,8 +1,11 @@
use std::fmt;
use std::ops::Bound;
use std::sync::Arc;
use tantivy_fst::Regex;
use crate::query::Occur;
use crate::schema::Term;
use crate::schema::{Field, Term};
use crate::Score;
#[derive(Clone)]
@@ -21,6 +24,10 @@ pub enum LogicalLiteral {
elements: Vec<Term>,
},
All,
Regex {
pattern: Arc<Regex>,
field: Field,
},
}
pub enum LogicalAst {
@@ -38,6 +45,7 @@ impl LogicalAst {
}
}
// TODO: Move to rewrite_ast in query_grammar
pub fn simplify(self) -> LogicalAst {
match self {
LogicalAst::Clause(clauses) => {
@@ -147,6 +155,10 @@ impl fmt::Debug for LogicalLiteral {
write!(formatter, "]")
}
LogicalLiteral::All => write!(formatter, "*"),
LogicalLiteral::Regex {
ref pattern,
ref field,
} => write!(formatter, "Regex({field:?}, {pattern:?})"),
}
}
}
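A minimal sketch of how this literal gets built (mirroring the parser change below; `field` is a `Field` assumed in scope, and `Regex` is `tantivy_fst::Regex`):

let pattern = Arc::new(Regex::new(".*b").expect("valid regex"));
let literal = LogicalLiteral::Regex { pattern, field };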

View File

@@ -2,12 +2,14 @@ use std::net::{AddrParseError, IpAddr};
use std::num::{ParseFloatError, ParseIntError};
use std::ops::Bound;
use std::str::{FromStr, ParseBoolError};
use std::sync::Arc;
use base64::engine::general_purpose::STANDARD as BASE64;
use base64::Engine;
use itertools::Itertools;
use query_grammar::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};
use rustc_hash::FxHashMap;
use tantivy_fst::Regex;
use super::logical_ast::*;
use crate::index::Index;
@@ -15,7 +17,7 @@ use crate::json_utils::convert_to_fast_value_and_append_to_json_term;
use crate::query::range_query::{is_type_valid_for_fastfield_range_query, RangeQuery};
use crate::query::{
AllQuery, BooleanQuery, BoostQuery, EmptyQuery, FuzzyTermQuery, Occur, PhrasePrefixQuery,
PhraseQuery, Query, TermQuery, TermSetQuery,
PhraseQuery, Query, RegexQuery, TermQuery, TermSetQuery,
};
use crate::schema::{
Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, JsonObjectOptions,
@@ -206,6 +208,7 @@ pub struct QueryParser {
tokenizer_manager: TokenizerManager,
boost: FxHashMap<Field, Score>,
fuzzy: FxHashMap<Field, Fuzzy>,
regexes_allowed: bool,
}
#[derive(Clone)]
@@ -260,6 +263,7 @@ impl QueryParser {
conjunction_by_default: false,
boost: Default::default(),
fuzzy: Default::default(),
regexes_allowed: false,
}
}
@@ -320,6 +324,11 @@ impl QueryParser {
);
}
/// Allow regexes in queries
pub fn allow_regexes(&mut self) {
self.regexes_allowed = true;
}
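// Usage sketch (assuming an `index` with a text field `title`):
// let mut query_parser = QueryParser::for_index(&index, vec![title]);
// query_parser.allow_regexes();
// let query = query_parser.parse_query("title:/.*b/")?;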
/// Parse a query
///
/// Note that `parse_query` returns an error if the input
@@ -486,24 +495,17 @@ impl QueryParser {
Ok(terms.into_iter().next().unwrap())
}
FieldType::JsonObject(ref json_options) => {
let get_term_with_path = || {
Term::from_field_json_path(
field,
json_path,
json_options.is_expand_dots_enabled(),
)
};
let mut term = Term::from_field_json_path(
field,
json_path,
json_options.is_expand_dots_enabled(),
);
if let Some(term) =
// Try to convert the phrase to a fast value
convert_to_fast_value_and_append_to_json_term(
get_term_with_path(),
phrase,
false,
)
convert_to_fast_value_and_append_to_json_term(&term, phrase, false)
{
Ok(term)
} else {
let mut term = get_term_with_path();
term.append_type_and_str(phrase);
Ok(term)
}
@@ -670,7 +672,7 @@ impl QueryParser {
}
UserInputAst::Boost(ast, boost) => {
let (ast, errors) = self.compute_logical_ast_with_occur_lenient(*ast);
(ast.boost(boost as Score), errors)
(ast.boost(boost.into_inner() as Score), errors)
}
UserInputAst::Leaf(leaf) => {
let (ast, errors) = self.compute_logical_ast_from_leaf_lenient(*leaf);
@@ -860,6 +862,51 @@ impl QueryParser {
"Range query need to target a specific field.".to_string(),
)],
),
UserInputLeaf::Regex { field, pattern } => {
if !self.regexes_allowed {
return (
None,
vec![QueryParserError::UnsupportedQuery(
"Regex queries are not allowed.".to_string(),
)],
);
}
let full_path = try_tuple!(field.ok_or_else(|| {
QueryParserError::UnsupportedQuery(
"Regex query need to target a specific field.".to_string(),
)
}));
let (field, json_path) = try_tuple!(self
.split_full_path(&full_path)
.ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone())));
if !json_path.is_empty() {
return (
None,
vec![QueryParserError::UnsupportedQuery(
"Regex query does not support json paths.".to_string(),
)],
);
}
if !matches!(
self.schema.get_field_entry(field).field_type(),
FieldType::Str(_)
) {
return (
None,
vec![QueryParserError::UnsupportedQuery(
"Regex query only supported on text fields".to_string(),
)],
);
}
let pattern = try_tuple!(Regex::new(&pattern).map_err(|e| {
QueryParserError::UnsupportedQuery(format!("Invalid regex: {e}"))
}));
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Regex {
pattern: Arc::new(pattern),
field,
}));
(Some(logical_ast), Vec::new())
}
}
}
}
@@ -902,6 +949,9 @@ fn convert_literal_to_query(
LogicalLiteral::Range { lower, upper } => Box::new(RangeQuery::new(lower, upper)),
LogicalLiteral::Set { elements, .. } => Box::new(TermSetQuery::new(elements)),
LogicalLiteral::All => Box::new(AllQuery),
LogicalLiteral::Regex { pattern, field } => {
Box::new(RegexQuery::from_regex(pattern, field))
}
}
}
@@ -971,7 +1021,7 @@ fn generate_literals_for_json_object(
// Try to convert the phrase to a fast value
if let Some(term) =
convert_to_fast_value_and_append_to_json_term(get_term_with_path(), phrase, true)
convert_to_fast_value_and_append_to_json_term(&get_term_with_path(), phrase, true)
{
logical_literals.push(LogicalLiteral::Term(term));
}
@@ -1100,11 +1150,15 @@ mod test {
query: &str,
default_conjunction: bool,
default_fields: &[&'static str],
allow_regexes: bool,
) -> Result<LogicalAst, QueryParserError> {
let mut query_parser = make_query_parser_with_default_fields(default_fields);
if default_conjunction {
query_parser.set_conjunction_by_default();
}
if allow_regexes {
query_parser.allow_regexes();
}
query_parser.parse_query_to_logical_ast(query)
}
@@ -1116,6 +1170,7 @@ mod test {
query,
default_conjunction,
&["title", "text"],
true,
)
}
@@ -1130,6 +1185,7 @@ mod test {
query,
default_conjunction,
default_fields,
true,
)
.unwrap();
let query_str = format!("{query:?}");
@@ -1993,4 +2049,66 @@ mod test {
Err(QueryParserError::ExpectedInt(_))
);
}
#[test]
pub fn test_deduplication() {
let query = "be be";
test_parse_query_to_logical_ast_helper(
query,
"(Term(field=0, type=Str, \"be\") Term(field=1, type=Str, \"be\"))",
false,
);
}
#[test]
pub fn test_regex() {
let expected_regex = tantivy_fst::Regex::new(r".*b").unwrap();
test_parse_query_to_logical_ast_helper(
"title:/.*b/",
format!("Regex(Field(0), {:#?})", expected_regex).as_str(),
false,
);
// Invalid field
let err = parse_query_to_logical_ast("float:/.*b/", false).unwrap_err();
assert_eq!(
err.to_string(),
"Unsupported query: Regex query only supported on text fields"
);
// No field specified
let err = parse_query_to_logical_ast("/.*b/", false).unwrap_err();
assert_eq!(
err.to_string(),
"Unsupported query: Regex query need to target a specific field."
);
// Regex on a json path
let err = parse_query_to_logical_ast("title.subpath:/.*b/", false).unwrap_err();
assert_eq!(
err.to_string(),
"Unsupported query: Regex query does not support json paths."
);
// Invalid regex
let err = parse_query_to_logical_ast("title:/[A-Z*b/", false).unwrap_err();
assert_eq!(
err.to_string(),
"Unsupported query: Invalid regex: regex parse error:\n [A-Z*b\n ^\nerror: \
unclosed character class"
);
// Regexes not allowed
let err = parse_query_to_logical_ast_with_default_fields(
"title:/.*b/",
false,
&["title", "text"],
false,
)
.unwrap_err();
assert_eq!(
err.to_string(),
"Unsupported query: Regex queries are not allowed."
);
}
}

View File

@@ -12,10 +12,14 @@ pub use self::range_query_fastfield::*;
// TODO is this correct?
pub(crate) fn is_type_valid_for_fastfield_range_query(typ: Type) -> bool {
match typ {
Type::Str | Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date | Type::Json => {
true
}
Type::IpAddr => true,
Type::Str
| Type::U64
| Type::I64
| Type::F64
| Type::Bool
| Type::Date
| Type::Json
| Type::IpAddr => true,
Type::Facet | Type::Bytes => false,
}
}

View File

@@ -258,7 +258,7 @@ fn search_on_json_numerical_field(
let bounds = match typ.numerical_type().unwrap() {
NumericalType::I64 => {
let bounds = bounds.map_bound(|term| (term.as_i64().unwrap()));
let bounds = bounds.map_bound(|term| term.as_i64().unwrap());
match actual_column_type {
NumericalType::I64 => bounds.map_bound(|&term| term.to_u64()),
NumericalType::U64 => {
@@ -282,7 +282,7 @@ fn search_on_json_numerical_field(
}
}
NumericalType::U64 => {
let bounds = bounds.map_bound(|term| (term.as_u64().unwrap()));
let bounds = bounds.map_bound(|term| term.as_u64().unwrap());
match actual_column_type {
NumericalType::U64 => bounds.map_bound(|&term| term.to_u64()),
NumericalType::I64 => {
@@ -306,7 +306,7 @@ fn search_on_json_numerical_field(
}
}
NumericalType::F64 => {
let bounds = bounds.map_bound(|term| (term.as_f64().unwrap()));
let bounds = bounds.map_bound(|term| term.as_f64().unwrap());
match actual_column_type {
NumericalType::U64 => transform_from_f64_bounds::<u64>(&bounds),
NumericalType::I64 => transform_from_f64_bounds::<i64>(&bounds),

View File

@@ -11,7 +11,7 @@ mod tests {
use crate::docset::DocSet;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::query::{EnableScoring, Query, QueryParser, Scorer, TermQuery};
use crate::schema::{Field, IndexRecordOption, Schema, STRING, TEXT};
use crate::schema::{Field, IndexRecordOption, Schema, FAST, STRING, TEXT};
use crate::{assert_nearly_equals, DocAddress, Index, IndexWriter, Term, TERMINATED};
#[test]
@@ -212,4 +212,232 @@ mod tests {
}
Ok(())
}
#[test]
fn test_term_query_fallback_to_fastfield() -> crate::Result<()> {
use crate::collector::Count;
use crate::schema::FAST;
// Create a FAST-only numeric field (not indexed)
let mut schema_builder = Schema::builder();
let num_field = schema_builder.add_u64_field("num", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(num_field => 10u64))?;
index_writer.add_document(doc!(num_field => 20u64))?;
index_writer.add_document(doc!(num_field => 10u64))?;
index_writer.commit()?;
}
let reader = index.reader()?;
let searcher = reader.searcher();
// TermQuery should fall back to a fastfield range query and match correctly.
let tq_10 = TermQuery::new(
Term::from_field_u64(num_field, 10u64),
IndexRecordOption::Basic,
);
let tq_20 = TermQuery::new(
Term::from_field_u64(num_field, 20u64),
IndexRecordOption::Basic,
);
let tq_30 = TermQuery::new(
Term::from_field_u64(num_field, 30u64),
IndexRecordOption::Basic,
);
let count_10 = searcher.search(&tq_10, &Count)?;
let count_20 = searcher.search(&tq_20, &Count)?;
let count_30 = searcher.search(&tq_30, &Count)?;
assert_eq!(count_10, 2);
assert_eq!(count_20, 1);
assert_eq!(count_30, 0);
Ok(())
}
#[test]
fn test_term_query_fallback_text_fast_only() -> crate::Result<()> {
use crate::collector::Count;
// FAST-only text field (not indexed)
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field => "hello"))?;
index_writer.add_document(doc!(text_field => "world"))?;
index_writer.add_document(doc!(text_field => "hello"))?;
index_writer.commit()?;
}
let searcher = index.reader()?.searcher();
let tq_hello = TermQuery::new(
Term::from_field_text(text_field, "hello"),
IndexRecordOption::Basic,
);
let tq_world = TermQuery::new(
Term::from_field_text(text_field, "world"),
IndexRecordOption::Basic,
);
let tq_missing = TermQuery::new(
Term::from_field_text(text_field, "nope"),
IndexRecordOption::Basic,
);
assert_eq!(searcher.search(&tq_hello, &Count)?, 2);
assert_eq!(searcher.search(&tq_world, &Count)?, 1);
assert_eq!(searcher.search(&tq_missing, &Count)?, 0);
Ok(())
}
#[test]
fn test_term_query_fallback_json_fast_only() -> crate::Result<()> {
use crate::collector::Count;
use crate::fastfield::FastValue;
use crate::schema::FAST;
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(json_field => json!({"a": 10, "b": "x"})))?;
index_writer.add_document(doc!(json_field => json!({"a": 20, "b": "y"})))?;
index_writer.add_document(doc!(json_field => json!({"a": 10, "b": "z"})))?;
index_writer.commit()?;
}
fn json_term_fast<T: FastValue>(field: Field, path: &str, v: T) -> Term {
let mut term = Term::from_field_json_path(field, path, true);
term.append_type_and_fast_value(v);
term
}
fn json_term_str(field: Field, path: &str, v: &str) -> Term {
let mut term = Term::from_field_json_path(field, path, true);
term.append_type_and_str(v);
term
}
let searcher = index.reader()?.searcher();
// numeric path match
let tq_a10 = TermQuery::new(
json_term_fast(json_field, "a", 10u64),
IndexRecordOption::Basic,
);
let tq_a20 = TermQuery::new(
json_term_fast(json_field, "a", 20u64),
IndexRecordOption::Basic,
);
let tq_a30 = TermQuery::new(
json_term_fast(json_field, "a", 30u64),
IndexRecordOption::Basic,
);
assert_eq!(searcher.search(&tq_a10, &Count)?, 2);
assert_eq!(searcher.search(&tq_a20, &Count)?, 1);
assert_eq!(searcher.search(&tq_a30, &Count)?, 0);
// string path match
let tq_bx = TermQuery::new(
json_term_str(json_field, "b", "x"),
IndexRecordOption::Basic,
);
let tq_by = TermQuery::new(
json_term_str(json_field, "b", "y"),
IndexRecordOption::Basic,
);
let tq_bm = TermQuery::new(
json_term_str(json_field, "b", "missing"),
IndexRecordOption::Basic,
);
assert_eq!(searcher.search(&tq_bx, &Count)?, 1);
assert_eq!(searcher.search(&tq_by, &Count)?, 1);
assert_eq!(searcher.search(&tq_bm, &Count)?, 0);
Ok(())
}
#[test]
fn test_term_query_fallback_ip_fast_only() -> crate::Result<()> {
use std::net::IpAddr;
use std::str::FromStr;
use crate::collector::Count;
use crate::schema::{IntoIpv6Addr, FAST};
let mut schema_builder = Schema::builder();
let ip_field = schema_builder.add_ip_addr_field("ip", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let ip1 = IpAddr::from_str("127.0.0.1").unwrap().into_ipv6_addr();
let ip2 = IpAddr::from_str("127.0.0.2").unwrap().into_ipv6_addr();
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(ip_field => ip1))?;
index_writer.add_document(doc!(ip_field => ip2))?;
index_writer.add_document(doc!(ip_field => ip1))?;
index_writer.commit()?;
}
let searcher = index.reader()?.searcher();
let tq_ip1 = TermQuery::new(
Term::from_field_ip_addr(ip_field, ip1),
IndexRecordOption::Basic,
);
let tq_ip2 = TermQuery::new(
Term::from_field_ip_addr(ip_field, ip2),
IndexRecordOption::Basic,
);
let ip3 = IpAddr::from_str("127.0.0.3").unwrap().into_ipv6_addr();
let tq_ip3 = TermQuery::new(
Term::from_field_ip_addr(ip_field, ip3),
IndexRecordOption::Basic,
);
assert_eq!(searcher.search(&tq_ip1, &Count)?, 2);
assert_eq!(searcher.search(&tq_ip2, &Count)?, 1);
assert_eq!(searcher.search(&tq_ip3, &Count)?, 0);
Ok(())
}
#[test]
fn test_term_query_fallback_fastfield_with_scores_errors() -> crate::Result<()> {
use crate::collector::TopDocs;
// FAST-only numeric field (not indexed) should error when scoring is required
let mut schema_builder = Schema::builder();
let num_field = schema_builder.add_u64_field("num", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(num_field => 10u64))?;
index_writer.add_document(doc!(num_field => 20u64))?;
index_writer.commit()?;
}
let searcher = index.reader()?.searcher();
let tq = TermQuery::new(
Term::from_field_u64(num_field, 10u64),
IndexRecordOption::Basic,
);
// Using TopDocs requires scoring; since the field is not indexed,
// TermQuery cannot score and should return a SchemaError.
let res = searcher.search(&tq, &TopDocs::with_limit(1));
assert!(matches!(res, Err(crate::TantivyError::SchemaError(_))));
Ok(())
}
}

View File

@@ -1,8 +1,10 @@
use std::fmt;
use std::ops::Bound;
use super::term_weight::TermWeight;
use crate::query::bm25::Bm25Weight;
use crate::query::{EnableScoring, Explanation, Query, Weight};
use crate::query::range_query::is_type_valid_for_fastfield_range_query;
use crate::query::{EnableScoring, Explanation, Query, RangeQuery, Weight};
use crate::schema::IndexRecordOption;
use crate::Term;
@@ -99,7 +101,7 @@ impl TermQuery {
EnableScoring::Enabled {
statistics_provider,
..
} => Bm25Weight::for_terms(statistics_provider, &[self.term.clone()])?,
} => Bm25Weight::for_terms(statistics_provider, std::slice::from_ref(&self.term))?,
EnableScoring::Disabled { .. } => {
Bm25Weight::new(Explanation::new("<no score>", 1.0f32), 1.0f32)
}
@@ -122,6 +124,24 @@ impl TermQuery {
impl Query for TermQuery {
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
// If the field is not indexed but is a suitable fast field, fall back to a range query
// on the fast field matching exactly this term.
//
// Note: This is considerably slower, since it requires scanning the entire fast field.
// TODO: The range query would gain from having a single-value optimization
let schema = enable_scoring.schema();
let field_entry = schema.get_field_entry(self.term.field());
if !field_entry.is_indexed()
&& field_entry.is_fast()
&& is_type_valid_for_fastfield_range_query(self.term.typ())
&& !enable_scoring.is_scoring_enabled()
{
let range_query = RangeQuery::new(
Bound::Included(self.term.clone()),
Bound::Included(self.term.clone()),
);
return range_query.weight(enable_scoring);
}
Ok(Box::new(self.specialized_weight(enable_scoring)?))
}
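// Conceptually, the fallback rewrites a term query on a FAST-only field into
// a degenerate inclusive range — a sketch with an assumed u64 field:
// let term = Term::from_field_u64(num_field, 10u64);
// let fallback = RangeQuery::new(Bound::Included(term.clone()), Bound::Included(term));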
fn query_terms<'a>(&'a self, visitor: &mut dyn FnMut(&'a Term, bool)) {

View File

@@ -25,8 +25,8 @@ impl TermScorer {
}
}
pub(crate) fn shallow_seek(&mut self, target_doc: DocId) {
self.postings.block_cursor.shallow_seek(target_doc);
pub(crate) fn seek_block(&mut self, target_doc: DocId) {
self.postings.block_cursor.seek_block(target_doc);
}
#[cfg(test)]
@@ -175,7 +175,7 @@ mod tests {
let fieldnorms: Vec<u32> = std::iter::repeat_n(10u32, 3_000).collect();
let mut term_scorer = TermScorer::create_for_test(&doc_and_tfs, &fieldnorms, bm25_weight);
assert_eq!(term_scorer.doc(), 0u32);
term_scorer.shallow_seek(1289);
term_scorer.seek_block(1289);
assert_eq!(term_scorer.doc(), 0u32);
term_scorer.seek(1289);
assert_eq!(term_scorer.doc(), 1290);
@@ -242,9 +242,9 @@ mod tests {
let bm25_weight = Bm25Weight::for_one_term(10, 129, 20.0);
let mut docs = TermScorer::create_for_test(&doc_tfs[..], &fieldnorms[..], bm25_weight);
assert_nearly_equals!(docs.block_max_score(), 2.5161593);
docs.shallow_seek(135);
docs.seek_block(135);
assert_nearly_equals!(docs.block_max_score(), 3.4597192);
docs.shallow_seek(256);
docs.seek_block(256);
// the block is not loaded yet.
assert_nearly_equals!(docs.block_max_score(), 5.2971773);
assert_eq!(256, docs.seek(256));
@@ -275,7 +275,7 @@ mod tests {
{
let mut term_scorer = term_weight.specialized_scorer(reader, 1.0)?;
for d in docs {
term_scorer.shallow_seek(d);
term_scorer.seek_block(d);
block_max_scores_b.push(term_scorer.block_max_score());
}
}

View File

@@ -5,8 +5,10 @@ use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
use crate::query::Scorer;
use crate::{DocId, Score};
const HORIZON_NUM_TINYBITSETS: usize = 64;
const HORIZON: u32 = 64u32 * HORIZON_NUM_TINYBITSETS as u32;
// The buffered union looks ahead within a fixed-size sliding window
// of upcoming document IDs (the "horizon").
const HORIZON_NUM_TINYBITSETS: usize = HORIZON as usize / 64;
const HORIZON: u32 = 64u32 * 64u32;
// `drain_filter` is not stable yet.
// This function is similar, except that it is not unstable, and
@@ -27,12 +29,26 @@ where P: FnMut(&mut T) -> bool {
/// Creates a `DocSet` that iterates through the union of two or more `DocSet`s.
pub struct BufferedUnionScorer<TScorer, TScoreCombiner = DoNothingCombiner> {
/// Active scorers (already filtered of `TERMINATED`).
docsets: Vec<TScorer>,
/// Sliding window presence map for upcoming docs.
///
/// There are `HORIZON_NUM_TINYBITSETS` buckets, each covering
/// a span of 64 doc IDs. Bucket `i` represents the range
/// `[window_start_doc + i*64, window_start_doc + (i+1)*64)`.
bitsets: Box<[TinySet; HORIZON_NUM_TINYBITSETS]>,
/// Index of the current TinySet bucket within the sliding window.
bucket_idx: usize,
/// Per-doc score combiners for the current window.
///
/// These accumulators merge contributions from all scorers that
/// hit the same doc within the buffered window.
scores: Box<[TScoreCombiner; HORIZON as usize]>,
cursor: usize,
offset: DocId,
/// Start doc ID (inclusive) of the current sliding window.
window_start_doc: DocId,
/// Current doc ID of the union.
doc: DocId,
/// Combined score for current `doc` as produced by `TScoreCombiner`.
score: Score,
}
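// Worked example of the window math (assumed values): with
// window_start_doc = 1_000, a hit at doc 1_130 gives delta = 130, so
// bucket_idx = 130 / 64 = 2 and bit 130 % 64 = 2 — bit 2 of bitsets[2] is
// set and the scorer's contribution is merged into scores[130].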
@@ -74,8 +90,8 @@ impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> BufferedUnionScorer<TScorer
docsets: non_empty_docsets,
bitsets: Box::new([TinySet::empty(); HORIZON_NUM_TINYBITSETS]),
scores: Box::new([score_combiner_fn(); HORIZON as usize]),
cursor: HORIZON_NUM_TINYBITSETS,
offset: 0,
bucket_idx: HORIZON_NUM_TINYBITSETS,
window_start_doc: 0,
doc: 0,
score: 0.0,
};
@@ -89,8 +105,10 @@ impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> BufferedUnionScorer<TScorer
fn refill(&mut self) -> bool {
if let Some(min_doc) = self.docsets.iter().map(DocSet::doc).min() {
self.offset = min_doc;
self.cursor = 0;
// Reset the sliding window to start at the smallest doc
// across all scorers and prebuffer within the horizon.
self.window_start_doc = min_doc;
self.bucket_idx = 0;
self.doc = min_doc;
refill(
&mut self.docsets,
@@ -105,16 +123,16 @@ impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> BufferedUnionScorer<TScorer
}
fn advance_buffered(&mut self) -> bool {
while self.cursor < HORIZON_NUM_TINYBITSETS {
if let Some(val) = self.bitsets[self.cursor].pop_lowest() {
let delta = val + (self.cursor as u32) * 64;
self.doc = self.offset + delta;
while self.bucket_idx < HORIZON_NUM_TINYBITSETS {
if let Some(val) = self.bitsets[self.bucket_idx].pop_lowest() {
let delta = val + (self.bucket_idx as u32) * 64;
self.doc = self.window_start_doc + delta;
let score_combiner = &mut self.scores[delta as usize];
self.score = score_combiner.score();
score_combiner.clear();
return true;
} else {
self.cursor += 1;
self.bucket_idx += 1;
}
}
false
@@ -144,19 +162,19 @@ where
if self.doc >= target {
return self.doc;
}
let gap = target - self.offset;
let gap = target - self.window_start_doc;
if gap < HORIZON {
// Our value is within the buffered horizon.
// Skipping to corresponding bucket.
let new_cursor = gap as usize / 64;
for obsolete_tinyset in &mut self.bitsets[self.cursor..new_cursor] {
// Skipping to corresponding bucket.
let new_bucket_idx = gap as usize / 64;
for obsolete_tinyset in &mut self.bitsets[self.bucket_idx..new_bucket_idx] {
obsolete_tinyset.clear();
}
for score_combiner in &mut self.scores[self.cursor * 64..new_cursor * 64] {
for score_combiner in &mut self.scores[self.bucket_idx * 64..new_bucket_idx * 64] {
score_combiner.clear();
}
self.cursor = new_cursor;
self.bucket_idx = new_bucket_idx;
// Advancing until we reach the end of the bucket
// or we reach a doc greater or equal to the target.
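// E.g. (assumed values): with bucket_idx = 0, window_start_doc = 1_000 and
// target = 1_200, gap = 200 < HORIZON, so new_bucket_idx = 200 / 64 = 3;
// buckets 0..3 and scores[0..192] are cleared before scanning bucket 3.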
@@ -211,7 +229,7 @@ where
if self.doc == TERMINATED {
return 0;
}
let mut count = self.bitsets[self.cursor..HORIZON_NUM_TINYBITSETS]
let mut count = self.bitsets[self.bucket_idx..HORIZON_NUM_TINYBITSETS]
.iter()
.map(|bitset| bitset.len())
.sum::<u32>()
@@ -225,7 +243,7 @@ where
bitset.clear();
}
}
self.cursor = HORIZON_NUM_TINYBITSETS;
self.bucket_idx = HORIZON_NUM_TINYBITSETS;
count
}
}

View File

@@ -80,6 +80,7 @@
//! }
//!
//! /// Our custom iterator just helps us to avoid some messy generics.
//! #[allow(dead_code)]
//! pub struct MyCustomIter<'a>(btree_map::Iter<'a, Field, serde_json::Value>);
//! impl<'a> Iterator for MyCustomIter<'a> {
//! // Here we can see our field-value pairs being produced by the iterator.

View File

@@ -1561,7 +1561,6 @@ fn to_ascii(text: &str, output: &mut String) {
#[cfg(test)]
mod tests {
use std::iter;
use super::to_ascii;
use crate::tokenizer::{AsciiFoldingFilter, RawTokenizer, SimpleTokenizer, TextAnalyzer};

View File

@@ -308,10 +308,9 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
}
}
_ => {
return Err(io::Error::new(
io::ErrorKind::Other,
format!("Unsupported sstable version, expected one of [2, 3], found {version}"),
));
return Err(io::Error::other(format!(
"Unsupported sstable version, expected one of [2, 3], found {version}"
)));
}
};
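`io::Error::other` (stable since Rust 1.74) is shorthand for `io::Error::new(io::ErrorKind::Other, err)`; a one-line sketch:

let err = std::io::Error::other("unsupported sstable version");
assert_eq!(err.kind(), std::io::ErrorKind::Other);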
@@ -609,12 +608,12 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
/// Returns a range builder, to stream all of the terms
/// within an interval.
pub fn range(&self) -> StreamerBuilder<TSSTable> {
pub fn range(&self) -> StreamerBuilder<'_, TSSTable> {
StreamerBuilder::new(self, AlwaysMatch)
}
/// Returns a range builder filtered with a prefix.
pub fn prefix_range<K: AsRef<[u8]>>(&self, prefix: K) -> StreamerBuilder<TSSTable> {
pub fn prefix_range<K: AsRef<[u8]>>(&self, prefix: K) -> StreamerBuilder<'_, TSSTable> {
let lower_bound = prefix.as_ref();
let mut upper_bound = lower_bound.to_vec();
for idx in (0..upper_bound.len()).rev() {
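// The loop computes the shortest upper bound: it increments the last
// non-0xFF byte and truncates, so e.g. (assumed prefix) b"abc" yields
// b"abd", making the streamed range [b"abc", b"abd").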
@@ -633,7 +632,7 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
}
/// A stream of all the sorted terms.
pub fn stream(&self) -> io::Result<Streamer<TSSTable>> {
pub fn stream(&self) -> io::Result<Streamer<'_, TSSTable>> {
self.range().into_stream()
}

View File

@@ -54,14 +54,14 @@ pub fn merge_sstable<SST: SSTable, W: io::Write, M: ValueMerger<SST::Value>>(
}
}
for _ in 0..len - 1 {
if let Some(mut head) = heap.peek_mut() {
if head.0.key() == writer.last_inserted_key() {
value_merger.add(head.0.value());
if !head.0.advance()? {
PeekMut::pop(head);
}
continue;
if let Some(mut head) = heap.peek_mut()
&& head.0.key() == writer.last_inserted_key()
{
value_merger.add(head.0.value());
if !head.0.advance()? {
PeekMut::pop(head);
}
continue;
}
break;
}
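The rewrite folds the nested `if` into a let-chain (`if let ... && cond`), available on the Rust 2024 edition; a standalone sketch of the shape:

let opt: Option<i32> = Some(5);
if let Some(x) = opt && x > 3 {
    println!("{x}");
}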

View File

@@ -394,7 +394,7 @@ impl SSTableIndexBuilder {
fn fst_error_to_io_error(error: tantivy_fst::Error) -> io::Error {
match error {
tantivy_fst::Error::Fst(fst_error) => io::Error::new(io::ErrorKind::Other, fst_error),
tantivy_fst::Error::Fst(fst_error) => io::Error::other(fst_error),
tantivy_fst::Error::Io(ioerror) => ioerror,
}
}
@@ -438,7 +438,7 @@ impl BlockAddrBlockMetadata {
let ordinal_addr = range_start_addr + self.range_start_nbits as usize;
let range_end_addr = range_start_addr + num_bits;
if (range_end_addr + self.range_start_nbits as usize + 7) / 8 > data.len() {
if (range_end_addr + self.range_start_nbits as usize).div_ceil(8) > data.len() {
return None;
}
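`div_ceil` is the standard-library replacement (stable since Rust 1.73) for the manual round-up idiom:

assert_eq!(13usize.div_ceil(8), 2); // same as (13 + 7) / 8
assert_eq!(16usize.div_ceil(8), 2);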

View File

@@ -274,13 +274,12 @@ impl SharedArenaHashMap {
let kv: KeyValue = self.table[bucket];
if kv.is_empty() {
return None;
} else if kv.hash == hash {
if let Some(val_addr) =
} else if kv.hash == hash
&& let Some(val_addr) =
self.get_value_addr_if_key_match(key, kv.key_value_addr, memory_arena)
{
let v = memory_arena.read(val_addr);
return Some(v);
}
{
let v = memory_arena.read(val_addr);
return Some(v);
}
}
}
@@ -334,15 +333,14 @@ impl SharedArenaHashMap {
self.set_bucket(hash, key_addr, bucket);
return val;
}
if kv.hash == hash {
if let Some(val_addr) =
if kv.hash == hash
&& let Some(val_addr) =
self.get_value_addr_if_key_match(key, kv.key_value_addr, memory_arena)
{
let v = memory_arena.read(val_addr);
let new_v = updater(Some(v));
memory_arena.write_at(val_addr, new_v);
return new_v;
}
{
let v = memory_arena.read(val_addr);
let new_v = updater(Some(v));
memory_arena.write_at(val_addr, new_v);
return new_v;
}
// This allows fetching the next bucket before the loop jmp
bucket = probe.next_probe();