mirror of https://github.com/quickwit-oss/tantivy.git
synced 2026-01-09 10:32:55 +00:00

Compare commits: binggan-0. ... dependabot
16 Commits
| Author | SHA1 | Date |
|---|---|---|
| | ec5748d795 | |
| | c71ea7b2ef | |
| | c35a782747 | |
| | c66af2c0a9 | |
| | f9ac055847 | |
| | 21d057059e | |
| | dca508b4ca | |
| | aebae9965d | |
| | e7e3e3f44c | |
| | 2f2db16ec1 | |
| | d152e29687 | |
| | 285bcc25c9 | |
| | 7b65ad922d | |
| | 99be20cedd | |
| | 5f026901b8 | |
| | 6dfa2df06f | |
.github/workflows/coverage.yml (vendored, 2 changes)

@@ -21,7 +21,7 @@ jobs:
 - name: Generate code coverage
 run: cargo +nightly-2024-07-01 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
 - name: Upload coverage to Codecov
-uses: codecov/codecov-action@v3
+uses: codecov/codecov-action@v5
 continue-on-error: true
 with:
 token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos
@@ -46,7 +46,7 @@ The file of a segment has the format

 ```segment-id . ext```

-The extension signals which data structure (or [`SegmentComponent`](src/core/segment_component.rs)) is stored in the file.
+The extension signals which data structure (or [`SegmentComponent`](src/index/segment_component.rs)) is stored in the file.

 A small `meta.json` file is in charge of keeping track of the list of segments, as well as the schema.

@@ -102,7 +102,7 @@ but users can extend tantivy with their own implementation.

 Tantivy's document follows a very strict schema, decided before building any index.

-The schema defines all of the fields that the indexes [`Document`](src/schema/document.rs) may and should contain, their types (`text`, `i64`, `u64`, `Date`, ...) as well as how it should be indexed / represented in tantivy.
+The schema defines all of the fields that the indexes [`Document`](src/schema/document/mod.rs) may and should contain, their types (`text`, `i64`, `u64`, `Date`, ...) as well as how it should be indexed / represented in tantivy.

 Depending on the type of the field, you can decide to
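The schema hunk above only adjusts a documentation link, but the API it describes can be sketched briefly. The following is a hedged example (field names and options are illustrative, not taken from this change set) of declaring a strict schema before building an index:

```rust
use tantivy::schema::{Schema, INDEXED, STORED, TEXT};

fn main() {
    // Every field, its type, and how it is indexed / stored is fixed up front.
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("title", TEXT | STORED);
    schema_builder.add_u64_field("timestamp", INDEXED | STORED);
    let schema = schema_builder.build();

    // Later lookups go through the schema, never through ad-hoc field names.
    let title = schema.get_field("title").expect("declared above");
    println!("title has field id {}", title.field_id());
}
```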
CITATION.cff (new file, 10 lines)

@@ -0,0 +1,10 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+authors:
+- alias: Quickwit Inc.
+  website: "https://quickwit.io"
+title: "tantivy"
+version: 0.22.0
+doi: 10.5281/zenodo.13942948
+date-released: 2024-10-17
+url: "https://github.com/quickwit-oss/tantivy"
@@ -11,7 +11,7 @@ repository = "https://github.com/quickwit-oss/tantivy"
 readme = "README.md"
 keywords = ["search", "information", "retrieval"]
 edition = "2021"
-rust-version = "1.66"
+rust-version = "1.75"
 exclude = ["benches/*.json", "benches/*.txt"]

 [dependencies]

@@ -43,8 +43,8 @@ bitpacking = { version = "0.9.2", default-features = false, features = [
 "bitpacker4x",
 ] }
 census = "0.4.2"
-rustc-hash = "1.1.0"
-thiserror = "1.0.30"
+rustc-hash = "2.0.0"
+thiserror = "2.0.1"
 htmlescape = "0.3.1"
 fail = { version = "0.5.0", optional = true }
 time = { version = "0.3.35", features = ["serde-well-known"] }

@@ -72,7 +72,7 @@ fnv = "1.0.7"
 winapi = "0.3.9"

 [dev-dependencies]
-binggan = "0.12.0"
+binggan = "0.14.0"
 rand = "0.8.5"
 maplit = "1.0.2"
 matches = "0.1.9"
@@ -20,7 +20,6 @@ macro_rules! register {
 ($runner:expr, $func:ident) => {
 $runner.register(stringify!($func), move |index| {
 $func(index);
-None
 })
 };
 }

@@ -35,8 +35,8 @@ const IMPLS: [FilterImplPerInstructionSet; 2] = [
 const IMPLS: [FilterImplPerInstructionSet; 1] = [FilterImplPerInstructionSet::Scalar];

 impl FilterImplPerInstructionSet {
-#[allow(unused_variables)]
 #[inline]
+#[allow(unused_variables)] // on non-x86_64, code is unused.
 fn from(code: u8) -> FilterImplPerInstructionSet {
 #[cfg(target_arch = "x86_64")]
 if code == FilterImplPerInstructionSet::AVX2 as u8 {

@@ -23,7 +23,7 @@ downcast-rs = "1.2.0"
 proptest = "1"
 more-asserts = "0.3.1"
 rand = "0.8"
-binggan = "0.12.0"
+binggan = "0.14.0"

 [[bench]]
 name = "bench_merge"
@@ -42,7 +42,6 @@ fn bench_group(mut runner: InputGroup<Column>) {
 }
 }
 black_box(sum);
-None
 });
 runner.register("access_first_vals", |column| {
 let mut sum = 0;

@@ -63,7 +62,6 @@ fn bench_group(mut runner: InputGroup<Column>) {
 }

 black_box(sum);
-None
 });
 runner.run();
 }

@@ -1,6 +1,6 @@
 pub mod common;

-use binggan::{black_box, BenchRunner};
+use binggan::BenchRunner;
 use common::{generate_columnar_with_name, Card};
 use tantivy_columnar::*;

@@ -29,7 +29,7 @@ fn main() {
 add_combo(Card::Multi, Card::Dense);
 add_combo(Card::Multi, Card::Sparse);

-let runner: BenchRunner = BenchRunner::new();
+let mut runner: BenchRunner = BenchRunner::new();
 let mut group = runner.new_group();
 for (input_name, columnar_readers) in inputs.iter() {
 group.register_with_input(

@@ -66,7 +66,7 @@ impl<T: PartialOrd + Copy + std::fmt::Debug + Send + Sync + 'static + Default>
 &'a self,
 docs: &'a [u32],
 accessor: &Column<T>,
-) -> impl Iterator<Item = (DocId, T)> + '_ {
+) -> impl Iterator<Item = (DocId, T)> + 'a {
 if accessor.index.get_cardinality().is_full() {
 docs.iter().cloned().zip(self.val_cache.iter().cloned())
 } else {
@@ -82,7 +82,7 @@ impl<'a> SparseBlock<'a> {
 }

 #[inline]
-#[allow(clippy::comparison_chain)]
+#[expect(clippy::comparison_chain)]
 // Looks for the element in the block. Returns the positions if found.
 fn binary_search(&self, target: u16) -> Result<u16, u16> {
 let data = &self.0;
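Many hunks in this change set swap `#[allow(...)]` for `#[expect(...)]`. As a rough illustration (not code from the repository, and assuming a toolchain where `#[expect]` is stable, i.e. Rust 1.81 or later): `expect` silences the lint exactly like `allow`, but additionally warns when the lint no longer fires, so stale suppressions surface on their own.

```rust
// Hypothetical example: the expectation is fulfilled as long as clippy still
// flags the indexed loop below; rewrite the loop as an iterator and the
// #[expect] attribute itself starts warning.
#[expect(clippy::needless_range_loop)]
fn sum(values: &[u32]) -> u32 {
    let mut total = 0;
    for i in 0..values.len() {
        total += values[i];
    }
    total
}

fn main() {
    assert_eq!(sum(&[1, 2, 3]), 6);
}
```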
@@ -128,7 +128,7 @@ pub fn open_u128_as_compact_u64(mut bytes: OwnedBytes) -> io::Result<Arc<dyn Col
 }

 #[cfg(test)]
-pub mod tests {
+pub(crate) mod tests {
 use super::*;
 use crate::column_values::u64_based::{
 serialize_and_load_u64_based_column_values, serialize_u64_based_column_values,

@@ -122,7 +122,6 @@ impl<T> From<T> for ColumnOperation<T> {
 // In order to limit memory usage, and in order
 // to benefit from the stacker, we do this by serialization our data
 // as "Symbols".
-#[allow(clippy::from_over_into)]
 pub(super) trait SymbolValue: Clone + Copy {
 // Serializes the symbol into the given buffer.
 // Returns the number of bytes written into the buffer.

@@ -392,7 +392,7 @@ impl ColumnarWriter {

 // Serialize [Dictionary, Column, dictionary num bytes U32::LE]
 // Column: [Column Index, Column Values, column index num bytes U32::LE]
-#[allow(clippy::too_many_arguments)]
+#[expect(clippy::too_many_arguments)]
 fn serialize_bytes_or_str_column(
 cardinality: Cardinality,
 num_docs: RowId,

@@ -19,7 +19,7 @@ time = { version = "0.3.10", features = ["serde-well-known"] }
 serde = { version = "1.0.136", features = ["derive"] }

 [dev-dependencies]
-binggan = "0.12.0"
+binggan = "0.14.0"
 proptest = "1.0.0"
 rand = "0.8.4"

@@ -15,7 +15,6 @@ fn bench_vint() {
 out += u64::from(buf[0]);
 }
 black_box(out);
-None
 });

 let vals: Vec<u32> = (0..20_000).choose_multiple(&mut thread_rng(), 100_000);

@@ -27,7 +26,6 @@ fn bench_vint() {
 out += u64::from(buf[0]);
 }
 black_box(out);
-None
 });
 }
@@ -43,24 +41,20 @@ fn bench_bitset() {
 tinyset.pop_lowest();
 tinyset.pop_lowest();
 black_box(tinyset);
-None
 });

 let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
 runner.bench_function("bench_tinyset_sum", move |_| {
 assert_eq!(black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
-None
 });

 let v = [10u32, 14u32, 21u32];
 runner.bench_function("bench_tinyarr_sum", move |_| {
 black_box(v.iter().cloned().sum::<u32>());
-None
 });

 runner.bench_function("bench_bitset_initialize", move |_| {
 black_box(BitSet::with_max_value(1_000_000));
-None
 });
 }

@@ -130,11 +130,11 @@ pub fn replace_in_place(needle: u8, replacement: u8, bytes: &mut [u8]) {
 }

 #[cfg(test)]
-pub mod test {
+pub(crate) mod test {

 use proptest::prelude::*;

-use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64, BinarySerializable, FixedSize};
+use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};

 fn test_i64_converter_helper(val: i64) {
 assert_eq!(u64_to_i64(i64_to_u64(val)), val);

@@ -144,12 +144,6 @@ pub mod test {
 assert_eq!(u64_to_f64(f64_to_u64(val)), val);
 }

-pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
-let mut buffer = Vec::new();
-O::default().serialize(&mut buffer).unwrap();
-assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
-}
-
 proptest! {
 #[test]
 fn test_f64_converter_monotonicity_proptest((left, right) in (proptest::num::f64::NORMAL, proptest::num::f64::NORMAL)) {
@@ -74,14 +74,14 @@ impl FixedSize for () {

 impl<T: BinarySerializable> BinarySerializable for Vec<T> {
 fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
-VInt(self.len() as u64).serialize(writer)?;
+BinarySerializable::serialize(&VInt(self.len() as u64), writer)?;
 for it in self {
 it.serialize(writer)?;
 }
 Ok(())
 }
 fn deserialize<R: Read>(reader: &mut R) -> io::Result<Vec<T>> {
-let num_items = VInt::deserialize(reader)?.val();
+let num_items = <VInt as BinarySerializable>::deserialize(reader)?.val();
 let mut items: Vec<T> = Vec::with_capacity(num_items as usize);
 for _ in 0..num_items {
 let item = T::deserialize(reader)?;
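The `VInt(...).serialize(writer)` to `BinarySerializable::serialize(&VInt(...), writer)` rewrites above spell out which trait's method is meant, which is the standard fix when more than one `serialize`/`deserialize` is in scope. A small, self-contained sketch of the same disambiguation technique, with hypothetical traits:

```rust
trait BinEncode {
    fn encode(&self) -> Vec<u8>;
}
trait TextEncode {
    fn encode(&self) -> Vec<u8>;
}

struct VInt(u64);

impl BinEncode for VInt {
    fn encode(&self) -> Vec<u8> {
        self.0.to_le_bytes().to_vec()
    }
}
impl TextEncode for VInt {
    fn encode(&self) -> Vec<u8> {
        self.0.to_string().into_bytes()
    }
}

fn main() {
    let v = VInt(7);
    // `v.encode()` would be rejected as ambiguous; naming the trait resolves it.
    assert_eq!(BinEncode::encode(&v).len(), 8);
    assert_eq!(<VInt as TextEncode>::encode(&v), b"7".to_vec());
}
```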
@@ -236,12 +236,12 @@ impl FixedSize for bool {
 impl BinarySerializable for String {
 fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
 let data: &[u8] = self.as_bytes();
-VInt(data.len() as u64).serialize(writer)?;
+BinarySerializable::serialize(&VInt(data.len() as u64), writer)?;
 writer.write_all(data)
 }

 fn deserialize<R: Read>(reader: &mut R) -> io::Result<String> {
-let string_length = VInt::deserialize(reader)?.val() as usize;
+let string_length = <VInt as BinarySerializable>::deserialize(reader)?.val() as usize;
 let mut result = String::with_capacity(string_length);
 reader
 .take(string_length as u64)

@@ -253,12 +253,12 @@ impl BinarySerializable for String {
 impl<'a> BinarySerializable for Cow<'a, str> {
 fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
 let data: &[u8] = self.as_bytes();
-VInt(data.len() as u64).serialize(writer)?;
+BinarySerializable::serialize(&VInt(data.len() as u64), writer)?;
 writer.write_all(data)
 }

 fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, str>> {
-let string_length = VInt::deserialize(reader)?.val() as usize;
+let string_length = <VInt as BinarySerializable>::deserialize(reader)?.val() as usize;
 let mut result = String::with_capacity(string_length);
 reader
 .take(string_length as u64)

@@ -269,18 +269,18 @@ impl<'a> BinarySerializable for Cow<'a, str> {

 impl<'a> BinarySerializable for Cow<'a, [u8]> {
 fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
-VInt(self.len() as u64).serialize(writer)?;
+BinarySerializable::serialize(&VInt(self.len() as u64), writer)?;
 for it in self.iter() {
-it.serialize(writer)?;
+BinarySerializable::serialize(it, writer)?;
 }
 Ok(())
 }

 fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, [u8]>> {
-let num_items = VInt::deserialize(reader)?.val();
+let num_items = <VInt as BinarySerializable>::deserialize(reader)?.val();
 let mut items: Vec<u8> = Vec::with_capacity(num_items as usize);
 for _ in 0..num_items {
-let item = u8::deserialize(reader)?;
+let item = <u8 as BinarySerializable>::deserialize(reader)?;
 items.push(item);
 }
 Ok(Cow::Owned(items))
@@ -2,7 +2,7 @@

 > Tantivy is a **search** engine **library** for Rust.

-If you are familiar with Lucene, it's an excellent approximation to consider tantivy as Lucene for rust. tantivy is heavily inspired by Lucene's design and
+If you are familiar with Lucene, it's an excellent approximation to consider tantivy as Lucene for Rust. Tantivy is heavily inspired by Lucene's design and
 they both have the same scope and targeted use cases.

 If you are not familiar with Lucene, let's break down our little tagline.

@@ -17,7 +17,7 @@ relevancy, collapsing, highlighting, spatial search.
 experience. But keep in mind this is just a toolbox.
 Which bring us to the second keyword...

-- **Library** means that you will have to write code. tantivy is not an *all-in-one* server solution like elastic search for instance.
+- **Library** means that you will have to write code. Tantivy is not an *all-in-one* server solution like Elasticsearch for instance.

 Sometimes a functionality will not be available in tantivy because it is too
 specific to your use case. By design, tantivy should make it possible to extend

@@ -31,4 +31,4 @@ relevancy, collapsing, highlighting, spatial search.
 index from a different format.

 Tantivy exposes a lot of low level API to do all of these things.
@@ -11,7 +11,7 @@ directory shipped with tantivy is the `MmapDirectory`.
 While this design has some downsides, this greatly simplifies the source code of
 tantivy. Caching is also entirely delegated to the OS.

-`tantivy` works entirely (or almost) by directly reading the datastructures as they are laid on disk. As a result, the act of opening an indexing does not involve loading different datastructures from the disk into random access memory : starting a process, opening an index, and performing your first query can typically be done in a matter of milliseconds.
+Tantivy works entirely (or almost) by directly reading the datastructures as they are laid on disk. As a result, the act of opening an indexing does not involve loading different datastructures from the disk into random access memory : starting a process, opening an index, and performing your first query can typically be done in a matter of milliseconds.

 This is an interesting property for a command line search engine, or for some multi-tenant log search engine : spawning a new process for each new query can be a perfectly sensible solution in some use case.
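To make the "first query in milliseconds" claim concrete, here is a hedged sketch of the workflow this paragraph describes; the directory path is hypothetical and error handling is kept minimal:

```rust
use tantivy::collector::Count;
use tantivy::query::QueryParser;
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    // Opening the index only mmaps the segment files; nothing is eagerly loaded.
    let index = Index::open_in_dir("/path/to/existing/index")?;

    // For the sake of the example, parse the query against every schema field.
    let fields = index.schema().fields().map(|(field, _)| field).collect();
    let query = QueryParser::for_index(&index, fields)
        .parse_query("hello")
        .expect("valid query");

    let searcher = index.reader()?.searcher();
    println!("{} documents match", searcher.search(&query, &Count)?);
    Ok(())
}
```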
@@ -31,13 +31,13 @@ Compression ratio is mainly affected on the fast field of the sorted property, e
 When data is presorted by a field and search queries request sorting by the same field, we can leverage the natural order of the documents.
 E.g. if the data is sorted by timestamp and want the top n newest docs containing a term, we can simply leveraging the order of the docids.

-Note: Tantivy 0.16 does not do this optimization yet.
+Note: tantivy 0.16 does not do this optimization yet.

 ### Pruning

 Let's say we want all documents and want to apply the filter `>= 2010-08-11`. When the data is sorted, we could make a lookup in the fast field to find the docid range and use this as the filter.

-Note: Tantivy 0.16 does not do this optimization yet.
+Note: tantivy 0.16 does not do this optimization yet.

 ### Other?

@@ -45,7 +45,7 @@ In principle there are many algorithms possible that exploit the monotonically i

 ## Usage

-The index sorting can be configured setting [`sort_by_field`](https://github.com/quickwit-oss/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/core/index_meta.rs#L238) on `IndexSettings` and passing it to a `IndexBuilder`. As of Tantivy 0.16 only fast fields are allowed to be used.
+The index sorting can be configured setting [`sort_by_field`](https://github.com/quickwit-oss/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/core/index_meta.rs#L238) on `IndexSettings` and passing it to a `IndexBuilder`. As of tantivy 0.16 only fast fields are allowed to be used.

 ```rust
 let settings = IndexSettings {

@@ -39,7 +39,7 @@ Its representation is done by separating segments by a unicode char `\x01`, and
 - `value`: The value representation is just the regular Value representation.

 This representation is designed to align the natural sort of Terms with the lexicographical sort
-of their binary representation (Tantivy's dictionary (whether fst or sstable) is sorted and does prefix encoding).
+of their binary representation (tantivy's dictionary (whether fst or sstable) is sorted and does prefix encoding).

 In the example above, the terms will be sorted as
@@ -151,7 +151,7 @@ impl fmt::Debug for OwnedBytes {
 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 // We truncate the bytes in order to make sure the debug string
 // is not too long.
-let bytes_truncated: &[u8] = if self.len() > 8 {
+let bytes_truncated: &[u8] = if self.len() > 10 {
 &self.as_slice()[..10]
 } else {
 self.as_slice()

@@ -252,6 +252,11 @@ mod tests {
 format!("{short_bytes:?}"),
 "OwnedBytes([97, 98, 99, 100], len=4)"
 );
+let medium_bytes = OwnedBytes::new(b"abcdefghi".as_ref());
+assert_eq!(
+format!("{medium_bytes:?}"),
+"OwnedBytes([97, 98, 99, 100, 101, 102, 103, 104, 105], len=9)"
+);
 let long_bytes = OwnedBytes::new(b"abcdefghijklmnopq".as_ref());
 assert_eq!(
 format!("{long_bytes:?}"),
@@ -111,7 +111,6 @@ where F: nom::Parser<I, (O, ErrorList), Infallible> {
 Err(Err::Incomplete(needed)) => Err(Err::Incomplete(needed)),
 // old versions don't understand this is uninhabited and need the empty match to help,
 // newer versions warn because this arm is unreachable (which it is indeed).
 #[allow(unreachable_patterns)]
 Err(Err::Error(val)) | Err(Err::Failure(val)) => match val {},
 }
 }

@@ -767,7 +767,7 @@ fn occur_leaf(inp: &str) -> IResult<&str, (Option<Occur>, UserInputAst)> {
 tuple((fallible(occur_symbol), boosted_leaf))(inp)
 }

-#[allow(clippy::type_complexity)]
+#[expect(clippy::type_complexity)]
 fn operand_occur_leaf_infallible(
 inp: &str,
 ) -> JResult<&str, (Option<BinaryOperand>, Option<Occur>, Option<UserInputAst>)> {
@@ -1,4 +1,5 @@
 //! Contains the final aggregation tree.
+//!
 //! This tree can be converted via the `into()` method from `IntermediateAggregationResults`.
 //! This conversion computes the final result. For example: The intermediate result contains
 //! intermediate average results, which is the sum and the number of values. The actual average is

@@ -187,7 +188,7 @@ pub enum BucketEntries<T> {
 }

 impl<T> BucketEntries<T> {
-fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = &T> + 'a> {
+fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = &'a T> + 'a> {
 match self {
 BucketEntries::Vec(vec) => Box::new(vec.iter()),
 BucketEntries::HashMap(map) => Box::new(map.values()),
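The `Item = &T` to `Item = &'a T` fix above ties the items yielded by the boxed iterator to the borrow of `self`. A stripped-down, hypothetical reproduction of the same pattern:

```rust
struct Entries<T>(Vec<T>);

impl<T> Entries<T> {
    // The boxed iterator borrows from `self`, so both the iterator (`+ 'a`) and
    // the items it yields (`&'a T`) are tied to the same lifetime.
    fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = &'a T> + 'a> {
        Box::new(self.0.iter())
    }
}

fn main() {
    let entries = Entries(vec![1, 2, 3]);
    assert_eq!(entries.iter().sum::<i32>(), 6);
}
```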
@@ -244,7 +244,7 @@ fn parse_into_milliseconds(input: &str) -> Result<i64, AggregationError> {
 }

 #[cfg(test)]
-pub mod tests {
+pub(crate) mod tests {
 use pretty_assertions::assert_eq;

 use super::*;

@@ -16,6 +16,7 @@ use crate::aggregation::*;
 use crate::TantivyError;

 /// Provide user-defined buckets to aggregate on.
+///
 /// Two special buckets will automatically be created to cover the whole range of values.
 /// The provided buckets have to be continuous.
 /// During the aggregation, the values extracted from the fast_field `field` will be checked

@@ -1232,8 +1232,8 @@ mod tests {
 #[test]
 fn terms_aggregation_min_doc_count_special_case() -> crate::Result<()> {
 let terms_per_segment = vec![
 vec!["terma", "terma", "termb", "termb", "termb", "termc"],
 vec!["terma", "terma", "termb", "termc", "termc"],
 vec!["terma", "terma", "termb", "termb", "termb"],
 vec!["terma", "terma", "termb"],
 ];

 let index = get_test_index_from_terms(false, &terms_per_segment)?;

@@ -1255,8 +1255,6 @@
 assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 4);
 assert_eq!(res["my_texts"]["buckets"][1]["key"], "termb");
 assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 0);
 assert_eq!(res["my_texts"]["buckets"][2]["key"], "termc");
 assert_eq!(res["my_texts"]["buckets"][2]["doc_count"], 0);
 assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
 assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);
@@ -180,7 +180,7 @@ pub(crate) fn deserialize_option_f64<'de, D>(deserializer: D) -> Result<Option<f
 where D: Deserializer<'de> {
 struct StringOrFloatVisitor;

-impl<'de> Visitor<'de> for StringOrFloatVisitor {
+impl Visitor<'_> for StringOrFloatVisitor {
 type Value = Option<f64>;

 fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {

@@ -226,7 +226,7 @@ pub(crate) fn deserialize_f64<'de, D>(deserializer: D) -> Result<f64, D::Error>
 where D: Deserializer<'de> {
 struct StringOrFloatVisitor;

-impl<'de> Visitor<'de> for StringOrFloatVisitor {
+impl Visitor<'_> for StringOrFloatVisitor {
 type Value = f64;

 fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {

@@ -13,7 +13,7 @@ struct Hit<'a> {
 facet: &'a Facet,
 }

-impl<'a> Eq for Hit<'a> {}
+impl Eq for Hit<'_> {}

 impl<'a> PartialEq<Hit<'a>> for Hit<'a> {
 fn eq(&self, other: &Hit<'_>) -> bool {

@@ -27,7 +27,7 @@ impl<'a> PartialOrd<Hit<'a>> for Hit<'a> {
 }
 }

-impl<'a> Ord for Hit<'a> {
+impl Ord for Hit<'_> {
 fn cmp(&self, other: &Self) -> Ordering {
 other
 .count
@@ -182,6 +182,7 @@ where
 }

 /// A variant of the [`FilterCollector`] specialized for bytes fast fields, i.e.
+///
 /// it transparently wraps an inner [`Collector`] but filters documents
 /// based on the result of applying the predicate to the bytes fast field.
 ///

@@ -495,4 +495,4 @@ where
 impl_downcast!(Fruit);

 #[cfg(test)]
-pub mod tests;
+pub(crate) mod tests;

@@ -161,7 +161,7 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
 /// # Ok(())
 /// # }
 /// ```
-#[allow(clippy::type_complexity)]
+#[expect(clippy::type_complexity)]
 #[derive(Default)]
 pub struct MultiCollector<'a> {
 collector_wrappers: Vec<

@@ -190,7 +190,7 @@ impl<'a> MultiCollector<'a> {
 }
 }

-impl<'a> Collector for MultiCollector<'a> {
+impl Collector for MultiCollector<'_> {
 type Fruit = MultiFruit;
 type Child = MultiCollectorChild;
@@ -44,8 +44,19 @@ fn test_format_6() {
 assert_date_time_precision(&index, DateTimePrecision::Microseconds);
 }

 /// feature flag quickwit uses a different dictionary type
+#[test]
 #[cfg(not(feature = "quickwit"))]
-fn assert_date_time_precision(index: &Index, precision: DateTimePrecision) {
+fn test_format_7() {
+let path = path_for_version("7");
+
+let index = Index::open_in_dir(path).expect("Failed to open index");
+// dates are not truncated in v7 in the docstore
+assert_date_time_precision(&index, DateTimePrecision::Nanoseconds);
+}
+
+#[cfg(not(feature = "quickwit"))]
+fn assert_date_time_precision(index: &Index, doc_store_precision: DateTimePrecision) {
 use collector::TopDocs;
 let reader = index.reader().expect("Failed to create reader");
 let searcher = reader.searcher();

@@ -75,6 +86,6 @@ fn assert_date_time_precision(index: &Index, precision: DateTimePrecision) {
 .as_datetime()
 .unwrap();

-let expected = DateTime::from_timestamp_nanos(123456).truncate(precision);
+let expected = DateTime::from_timestamp_nanos(123456).truncate(doc_store_precision);
 assert_eq!(date_value, expected,);
 }
@@ -71,7 +71,7 @@ pub fn json_path_sep_to_dot(path: &mut str) {
 }
 }

-#[allow(clippy::too_many_arguments)]
+#[expect(clippy::too_many_arguments)]
 fn index_json_object<'a, V: Value<'a>>(
 doc: DocId,
 json_visitor: V::ObjectIter,

@@ -101,7 +101,7 @@ fn index_json_object<'a, V: Value<'a>>(
 }
 }

-#[allow(clippy::too_many_arguments)]
+#[expect(clippy::too_many_arguments)]
 pub(crate) fn index_json_value<'a, V: Value<'a>>(
 doc: DocId,
 json_value: V,

@@ -39,7 +39,7 @@ impl RetryPolicy {
 /// The `DirectoryLock` is an object that represents a file lock.
 ///
 /// It is associated with a lock file, that gets deleted on `Drop.`
-#[allow(dead_code)]
+#[expect(dead_code)]
 pub struct DirectoryLock(Box<dyn Send + Sync + 'static>);

 struct DirectoryLockGuard {

@@ -48,6 +48,7 @@ pub static INDEX_WRITER_LOCK: Lazy<Lock> = Lazy::new(|| Lock {
 });
 /// The meta lock file is here to protect the segment files being opened by
 /// `IndexReader::reload()` from being garbage collected.
+///
 /// It makes it possible for another process to safely consume
 /// our index in-writing. Ideally, we may have preferred `RWLock` semantics
 /// here, but it is difficult to achieve on Windows.
@@ -244,7 +244,7 @@ impl MmapDirectory {
 directory_path,
 )));
 }
-#[allow(clippy::bind_instead_of_map)]
+#[expect(clippy::bind_instead_of_map)]
 let canonical_path: PathBuf = directory_path.canonicalize().or_else(|io_err| {
 let directory_path = directory_path.to_owned();

@@ -32,7 +32,7 @@ pub struct WatchCallbackList {
 /// file change is detected.
 #[must_use = "This `WatchHandle` controls the lifetime of the watch and should therefore be used."]
 #[derive(Clone)]
-#[allow(dead_code)]
+#[expect(dead_code)]
 pub struct WatchHandle(Arc<WatchCallback>);

 impl WatchHandle {

@@ -117,7 +117,7 @@ pub trait DocSet: Send {
 }
 }

-impl<'a> DocSet for &'a mut dyn DocSet {
+impl DocSet for &mut dyn DocSet {
 fn advance(&mut self) -> u32 {
 (**self).advance()
 }
@@ -149,7 +149,7 @@ impl FieldNormReader {
 }

 #[cfg(test)]
-pub fn for_test(field_norms: &[u32]) -> FieldNormReader {
+pub(crate) fn for_test(field_norms: &[u32]) -> FieldNormReader {
 let field_norms_id = field_norms
 .iter()
 .cloned()

@@ -1,12 +1,9 @@
 #![allow(deprecated)] // Remove with index sorting

 use std::collections::HashSet;

 use rand::{thread_rng, Rng};

 use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
 use crate::schema::*;
 #[allow(deprecated)]
 use crate::{doc, schema, Index, IndexWriter, Searcher};

 fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> {
@@ -31,7 +31,6 @@ pub struct InvertedIndexReader {
 }

 impl InvertedIndexReader {
-#[allow(clippy::needless_pass_by_value)] // for symmetry
 pub(crate) fn new(
 termdict: TermDictionary,
 postings_file_slice: FileSlice,

@@ -205,16 +204,6 @@ impl InvertedIndexReader {
 .transpose()
 }

-pub(crate) fn read_postings_no_deletes(
-&self,
-term: &Term,
-option: IndexRecordOption,
-) -> io::Result<Option<SegmentPostings>> {
-self.get_term_info(term)?
-.map(|term_info| self.read_postings_from_terminfo(&term_info, option))
-.transpose()
-}
-
 /// Returns the number of documents containing the term.
 pub fn doc_freq(&self, term: &Term) -> io::Result<u32> {
 Ok(self

@@ -1,6 +1,7 @@
 use std::slice;

 /// Enum describing each component of a tantivy segment.
+///
 /// Each component is stored in its own file,
 /// using the pattern `segment_uuid`.`component_extension`,
 /// except the delete component that takes an `segment_uuid`.`delete_opstamp`.`component_extension`
@@ -478,7 +478,7 @@ pub fn merge_field_meta_data(
 .into_iter()
 .kmerge_by(|left, right| left < right)
 // TODO: Remove allocation
-.group_by(|el| (el.field_name.to_string(), el.typ))
+.chunk_by(|el| (el.field_name.to_string(), el.typ))
 {
 let mut merged: FieldMetadata = group.next().unwrap();
 for el in group {
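This and several later hunks replace itertools' `group_by` with `chunk_by`; the method was renamed in newer itertools releases and the behavior is unchanged: consecutive elements sharing a key end up in the same chunk, which is why the callers sort first. A hedged usage sketch, assuming itertools 0.13 or later:

```rust
use itertools::Itertools;

fn main() {
    let field_ids = [1u32, 1, 2, 2, 2, 1];
    // Only *consecutive* equal keys are grouped together.
    for (key, chunk) in &field_ids.iter().chunk_by(|&&id| id) {
        println!("field {key}: {} values", chunk.count());
    }
}
```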
@@ -187,7 +187,6 @@ impl DeleteCursor {
 }
 }

-#[allow(clippy::wrong_self_convention)]
 fn is_behind_opstamp(&mut self, target_opstamp: Opstamp) -> bool {
 self.get()
 .map(|operation| operation.opstamp < target_opstamp)

@@ -21,7 +21,7 @@ pub enum DocToOpstampMapping<'a> {
 None,
 }

-impl<'a> DocToOpstampMapping<'a> {
+impl DocToOpstampMapping<'_> {
 /// Assess whether a document should be considered deleted given that it contains
 /// a deleted term that was deleted at the opstamp: `delete_opstamp`.
 ///

@@ -104,7 +104,7 @@ impl MergePolicy for LogMergePolicy {

 let mut current_max_log_size = f64::MAX;
 let mut levels = vec![];
-for (_, merge_group) in &size_sorted_segments.into_iter().group_by(|segment| {
+for (_, merge_group) in &size_sorted_segments.into_iter().chunk_by(|segment| {
 let segment_log_size = f64::from(self.clip_min_size(segment.num_docs())).log2();
 if segment_log_size < (current_max_log_size - self.level_log_size) {
 // update current_max_log_size to create a new group

@@ -36,7 +36,7 @@ impl MergePolicy for NoMergePolicy {
 }

 #[cfg(test)]
-pub mod tests {
+pub(crate) mod tests {

 use super::*;

@@ -150,7 +150,7 @@ impl SegmentWriter {
 let vals_grouped_by_field = doc
 .iter_fields_and_values()
 .sorted_by_key(|(field, _)| *field)
-.group_by(|(field, _)| *field);
+.chunk_by(|(field, _)| *field);

 for (field, field_values) in &vals_grouped_by_field {
 let values = field_values.map(|el| el.1);
@@ -101,7 +101,7 @@ mod test {

 use super::Stamper;

-#[allow(clippy::redundant_clone)]
+#[expect(clippy::redundant_clone)]
 #[test]
 fn test_stamper() {
 let stamper = Stamper::new(7u64);

@@ -117,7 +117,7 @@ mod test {
 assert_eq!(stamper.stamp(), 15u64);
 }

-#[allow(clippy::redundant_clone)]
+#[expect(clippy::redundant_clone)]
 #[test]
 fn test_stamper_revert() {
 let stamper = Stamper::new(7u64);
src/lib.rs (11 changes)

@@ -178,10 +178,8 @@ pub use crate::future_result::FutureResult;
 pub type Result<T> = std::result::Result<T, TantivyError>;

 mod core;
-#[allow(deprecated)] // Remove with index sorting
 pub mod indexer;

-#[allow(unused_doc_comments)]
 pub mod error;
 pub mod tokenizer;

@@ -190,7 +188,6 @@ pub mod collector;
 pub mod directory;
 pub mod fastfield;
 pub mod fieldnorm;
-#[allow(deprecated)] // Remove with index sorting
 pub mod index;
 pub mod positions;
 pub mod postings;

@@ -223,7 +220,6 @@ pub use self::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN, TERMINATED};
 pub use crate::core::json_utils;
 pub use crate::core::{Executor, Searcher, SearcherGeneration};
 pub use crate::directory::Directory;
-#[allow(deprecated)] // Remove with index sorting
 pub use crate::index::{
 Index, IndexBuilder, IndexMeta, IndexSettings, InvertedIndexReader, Order, Segment,
 SegmentMeta, SegmentReader,

@@ -232,7 +228,7 @@ pub use crate::indexer::{IndexWriter, SingleSegmentIndexWriter};
 pub use crate::schema::{Document, TantivyDocument, Term};

 /// Index format version.
-pub const INDEX_FORMAT_VERSION: u32 = 6;
+pub const INDEX_FORMAT_VERSION: u32 = 7;
 /// Oldest index format version this tantivy version can read.
 pub const INDEX_FORMAT_OLDEST_SUPPORTED_VERSION: u32 = 4;

@@ -371,6 +367,7 @@ macro_rules! fail_point {
 }};
 }

+/// Common test utilities.
 #[cfg(test)]
 pub mod tests {
 use common::{BinarySerializable, FixedSize};

@@ -389,6 +386,7 @@ pub mod tests {
 use crate::schema::*;
 use crate::{DateTime, DocAddress, Index, IndexWriter, ReloadPolicy};

+/// Asserts that the serialized value is the value in the trait.
 pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
 let mut buffer = Vec::new();
 O::default().serialize(&mut buffer).unwrap();

@@ -421,6 +419,7 @@ pub mod tests {
 }};
 }

+/// Generates random numbers
 pub fn generate_nonunique_unsorted(max_value: u32, n_elems: usize) -> Vec<u32> {
 let seed: [u8; 32] = [1; 32];
 StdRng::from_seed(seed)

@@ -429,6 +428,7 @@ pub mod tests {
 .collect::<Vec<u32>>()
 }

+/// Sample `n` elements with Bernoulli distribution.
 pub fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec<u32> {
 StdRng::from_seed([seed_val; 32])
 .sample_iter(&Bernoulli::new(ratio).unwrap())

@@ -438,6 +438,7 @@ pub mod tests {
 .collect()
 }

+/// Sample `n` elements with Bernoulli distribution.
 pub fn sample(n: u32, ratio: f64) -> Vec<u32> {
 sample_with_seed(n, ratio, 4)
 }
@@ -41,7 +41,6 @@
 /// );
 /// # }
 /// ```

 #[macro_export]
 macro_rules! doc(
 () => {

@@ -1,4 +1,5 @@
 //! Tantivy can (if instructed to do so in the schema) store the term positions in a given field.
+//!
 //! This position is expressed as token ordinal. For instance,
 //! In "The beauty and the beast", the term "the" appears in position 0 and position 3.
 //! This information is useful to run phrase queries.

@@ -38,7 +39,7 @@ pub use self::serializer::PositionSerializer;
 const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN;

 #[cfg(test)]
-pub mod tests {
+pub(crate) mod tests {

 use std::iter;

@@ -264,7 +264,7 @@ impl VIntDecoder for BlockDecoder {
 }

 #[cfg(test)]
-pub mod tests {
+pub(crate) mod tests {

 use super::*;
 use crate::TERMINATED;
src/postings/loaded_postings.rs (new file, 155 lines)

@@ -0,0 +1,155 @@
+use crate::docset::{DocSet, TERMINATED};
+use crate::postings::{Postings, SegmentPostings};
+use crate::DocId;
+
+/// `LoadedPostings` is a `DocSet` and `Postings` implementation.
+/// It is used to represent the postings of a term in memory.
+/// It is suitable if there are few documents for a term.
+///
+/// It exists mainly to reduce memory usage.
+/// `SegmentPostings` uses 1840 bytes per instance due to its caches.
+/// If you need to keep many terms around with few docs, it's cheaper to load all the
+/// postings in memory.
+///
+/// This is relevant for `RegexPhraseQuery`, which may have a lot of
+/// terms.
+/// E.g. 100_000 terms would need 184MB due to SegmentPostings.
+pub struct LoadedPostings {
+doc_ids: Box<[DocId]>,
+position_offsets: Box<[u32]>,
+positions: Box<[u32]>,
+cursor: usize,
+}
+
+impl LoadedPostings {
+/// Creates a new `LoadedPostings` from a `SegmentPostings`.
+///
+/// It will also preload positions, if positions are available in the SegmentPostings.
+pub fn load(segment_postings: &mut SegmentPostings) -> LoadedPostings {
+let num_docs = segment_postings.doc_freq() as usize;
+let mut doc_ids = Vec::with_capacity(num_docs);
+let mut positions = Vec::with_capacity(num_docs);
+let mut position_offsets = Vec::with_capacity(num_docs);
+while segment_postings.doc() != TERMINATED {
+position_offsets.push(positions.len() as u32);
+doc_ids.push(segment_postings.doc());
+segment_postings.append_positions_with_offset(0, &mut positions);
+segment_postings.advance();
+}
+position_offsets.push(positions.len() as u32);
+LoadedPostings {
+doc_ids: doc_ids.into_boxed_slice(),
+positions: positions.into_boxed_slice(),
+position_offsets: position_offsets.into_boxed_slice(),
+cursor: 0,
+}
+}
+}
+
+#[cfg(test)]
+impl From<(Vec<DocId>, Vec<Vec<u32>>)> for LoadedPostings {
+fn from(doc_ids_and_positions: (Vec<DocId>, Vec<Vec<u32>>)) -> LoadedPostings {
+let mut position_offsets = Vec::new();
+let mut all_positions = Vec::new();
+let (doc_ids, docid_positions) = doc_ids_and_positions;
+for positions in docid_positions {
+position_offsets.push(all_positions.len() as u32);
+all_positions.extend_from_slice(&positions);
+}
+position_offsets.push(all_positions.len() as u32);
+LoadedPostings {
+doc_ids: doc_ids.into_boxed_slice(),
+positions: all_positions.into_boxed_slice(),
+position_offsets: position_offsets.into_boxed_slice(),
+cursor: 0,
+}
+}
+}
+
+impl DocSet for LoadedPostings {
+fn advance(&mut self) -> DocId {
+self.cursor += 1;
+if self.cursor >= self.doc_ids.len() {
+self.cursor = self.doc_ids.len();
+return TERMINATED;
+}
+self.doc()
+}
+
+fn doc(&self) -> DocId {
+if self.cursor >= self.doc_ids.len() {
+return TERMINATED;
+}
+self.doc_ids[self.cursor]
+}
+
+fn size_hint(&self) -> u32 {
+self.doc_ids.len() as u32
+}
+}
+impl Postings for LoadedPostings {
+fn term_freq(&self) -> u32 {
+let start = self.position_offsets[self.cursor] as usize;
+let end = self.position_offsets[self.cursor + 1] as usize;
+(end - start) as u32
+}
+
+fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
+let start = self.position_offsets[self.cursor] as usize;
+let end = self.position_offsets[self.cursor + 1] as usize;
+for pos in &self.positions[start..end] {
+output.push(*pos + offset);
+}
+}
+}
+
+#[cfg(test)]
+pub(crate) mod tests {
+
+use super::*;
+
+#[test]
+pub fn test_vec_postings() {
+let doc_ids: Vec<DocId> = (0u32..1024u32).map(|e| e * 3).collect();
+let mut postings = LoadedPostings::from((doc_ids, vec![]));
+assert_eq!(postings.doc(), 0u32);
+assert_eq!(postings.advance(), 3u32);
+assert_eq!(postings.doc(), 3u32);
+assert_eq!(postings.seek(14u32), 15u32);
+assert_eq!(postings.doc(), 15u32);
+assert_eq!(postings.seek(300u32), 300u32);
+assert_eq!(postings.doc(), 300u32);
+assert_eq!(postings.seek(6000u32), TERMINATED);
+}
+
+#[test]
+pub fn test_vec_postings2() {
+let doc_ids: Vec<DocId> = (0u32..1024u32).map(|e| e * 3).collect();
+let mut positions = Vec::new();
+positions.resize(1024, Vec::new());
+positions[0] = vec![1u32, 2u32, 3u32];
+positions[1] = vec![30u32];
+positions[2] = vec![10u32];
+positions[4] = vec![50u32];
+let mut postings = LoadedPostings::from((doc_ids, positions));
+
+let load = |postings: &mut LoadedPostings| {
+let mut loaded_positions = Vec::new();
+postings.positions(loaded_positions.as_mut());
+loaded_positions
+};
+assert_eq!(postings.doc(), 0u32);
+assert_eq!(load(&mut postings), vec![1u32, 2u32, 3u32]);
+
+assert_eq!(postings.advance(), 3u32);
+assert_eq!(postings.doc(), 3u32);
+
+assert_eq!(load(&mut postings), vec![30u32]);
+
+assert_eq!(postings.seek(14u32), 15u32);
+assert_eq!(postings.doc(), 15u32);
+assert_eq!(postings.seek(300u32), 300u32);
+assert_eq!(postings.doc(), 300u32);
+assert_eq!(postings.seek(6000u32), TERMINATED);
+}
+}
@@ -8,6 +8,7 @@ mod block_segment_postings;
 pub(crate) mod compression;
 mod indexing_context;
 mod json_postings_writer;
+mod loaded_postings;
 mod per_field_postings_writer;
 mod postings;
 mod postings_writer;

@@ -17,6 +18,7 @@ mod serializer;
 mod skip;
 mod term_info;

+pub(crate) use loaded_postings::LoadedPostings;
 pub(crate) use stacker::compute_table_memory_size;

 pub use self::block_segment_postings::BlockSegmentPostings;

@@ -29,7 +31,7 @@ pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
 pub(crate) use self::skip::{BlockInfo, SkipReader};
 pub use self::term_info::TermInfo;

-#[allow(clippy::enum_variant_names)]
+#[expect(clippy::enum_variant_names)]
 #[derive(Debug, PartialEq, Clone, Copy, Eq)]
 pub(crate) enum FreqReadingOption {
 NoFreq,

@@ -38,7 +40,7 @@ pub(crate) enum FreqReadingOption {
 }

 #[cfg(test)]
-pub mod tests {
+pub(crate) mod tests {
 use std::mem;

 use super::{InvertedIndexSerializer, Postings};
@@ -17,7 +17,14 @@ pub trait Postings: DocSet + 'static {
 /// Returns the positions offsetted with a given value.
 /// It is not necessary to clear the `output` before calling this method.
 /// The output vector will be resized to the `term_freq`.
-fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>);
+fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
+output.clear();
+self.append_positions_with_offset(offset, output);
+}
+
+/// Returns the positions offsetted with a given value.
+/// Data will be appended to the output.
+fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>);

 /// Returns the positions of the term in the given document.
 /// The output vector will be resized to the `term_freq`.

@@ -25,3 +32,13 @@ pub trait Postings: DocSet + 'static {
 self.positions_with_offset(0u32, output);
 }
 }
+
+impl Postings for Box<dyn Postings> {
+fn term_freq(&self) -> u32 {
+(**self).term_freq()
+}
+
+fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
+(**self).append_positions_with_offset(offset, output);
+}
+}
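The trait change above keeps the old overwrite-style `positions_with_offset` as a default method that clears the buffer and delegates to the new appending method, so existing callers keep working while implementors only provide the appending variant. A minimal sketch of that pattern with hypothetical names:

```rust
trait PositionSource {
    // Implementors provide only the appending primitive...
    fn append_positions(&mut self, offset: u32, output: &mut Vec<u32>);

    // ...and the old "replace the buffer" entry point stays as a default method.
    fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
        output.clear();
        self.append_positions(offset, output);
    }
}

struct Fixed(Vec<u32>);

impl PositionSource for Fixed {
    fn append_positions(&mut self, offset: u32, output: &mut Vec<u32>) {
        output.extend(self.0.iter().map(|p| p + offset));
    }
}

fn main() {
    let mut src = Fixed(vec![1, 4, 9]);
    let mut buf = vec![42]; // stale content is cleared by the default method
    src.positions_with_offset(10, &mut buf);
    assert_eq!(buf, vec![11, 14, 19]);
}
```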
@@ -34,7 +34,7 @@ impl<'a> VInt32Reader<'a> {
 }
 }

-impl<'a> Iterator for VInt32Reader<'a> {
+impl Iterator for VInt32Reader<'_> {
 type Item = u32;

 fn next(&mut self) -> Option<u32> {

@@ -237,8 +237,9 @@ impl Postings for SegmentPostings {
 self.block_cursor.freq(self.cur)
 }

-fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
+fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
 let term_freq = self.term_freq();
+let prev_len = output.len();
 if let Some(position_reader) = self.position_reader.as_mut() {
 debug_assert!(
 !self.block_cursor.freqs().is_empty(),

@@ -249,15 +250,14 @@ impl Postings for SegmentPostings {
 .iter()
 .cloned()
 .sum::<u32>() as u64);
-output.resize(term_freq as usize, 0u32);
-position_reader.read(read_offset, &mut output[..]);
+// TODO: instead of zeroing the output, we could use MaybeUninit or similar.
+output.resize(prev_len + term_freq as usize, 0u32);
+position_reader.read(read_offset, &mut output[prev_len..]);
 let mut cum = offset;
-for output_mut in output.iter_mut() {
+for output_mut in output[prev_len..].iter_mut() {
 cum += *output_mut;
 *output_mut = cum;
 }
 } else {
 output.clear();
 }
 }
 }
@@ -6,6 +6,7 @@ use tantivy_fst::Automaton;

 use super::phrase_prefix_query::prefix_end;
 use crate::index::SegmentReader;
+use crate::postings::TermInfo;
 use crate::query::{BitSetDocSet, ConstScorer, Explanation, Scorer, Weight};
 use crate::schema::{Field, IndexRecordOption};
 use crate::termdict::{TermDictionary, TermStreamer};

@@ -64,6 +65,18 @@ where

 term_stream_builder.into_stream()
 }
+
+/// Returns the term infos that match the automaton
+pub fn get_match_term_infos(&self, reader: &SegmentReader) -> crate::Result<Vec<TermInfo>> {
+let inverted_index = reader.inverted_index(self.field)?;
+let term_dict = inverted_index.terms();
+let mut term_stream = self.automaton_stream(term_dict)?;
+let mut term_infos = Vec::new();
+while term_stream.advance() {
+term_infos.push(term_stream.value().clone());
+}
+Ok(term_infos)
+}
 }

 impl<A> Weight for AutomatonWeight<A>
@@ -272,7 +272,7 @@ impl<'a> From<&'a mut TermScorer> for TermScorerWithMaxScore<'a> {
 }
 }

-impl<'a> Deref for TermScorerWithMaxScore<'a> {
+impl Deref for TermScorerWithMaxScore<'_> {
 type Target = TermScorer;

 fn deref(&self) -> &Self::Target {

@@ -280,7 +280,7 @@ impl<'a> Deref for TermScorerWithMaxScore<'a> {
 }
 }

-impl<'a> DerefMut for TermScorerWithMaxScore<'a> {
+impl DerefMut for TermScorerWithMaxScore<'_> {
 fn deref_mut(&mut self) -> &mut Self::Target {
 self.scorer
 }

@@ -308,7 +308,7 @@ mod tests {

 use crate::query::score_combiner::SumCombiner;
 use crate::query::term_query::TermScorer;
-use crate::query::{Bm25Weight, Scorer, Union};
+use crate::query::{Bm25Weight, BufferedUnionScorer, Scorer};
 use crate::{DocId, DocSet, Score, TERMINATED};

 struct Float(Score);

@@ -371,7 +371,7 @@ mod tests {
 fn compute_checkpoints_manual(term_scorers: Vec<TermScorer>, n: usize) -> Vec<(DocId, Score)> {
 let mut heap: BinaryHeap<Float> = BinaryHeap::with_capacity(n);
 let mut checkpoints: Vec<(DocId, Score)> = Vec::new();
-let mut scorer = Union::build(term_scorers, SumCombiner::default);
+let mut scorer = BufferedUnionScorer::build(term_scorers, SumCombiner::default);

 let mut limit = Score::MIN;
 loop {

@@ -417,7 +417,7 @@ mod tests {
 .boxed()
 }

-#[allow(clippy::type_complexity)]
+#[expect(clippy::type_complexity)]
 fn gen_term_scorers(num_scorers: usize) -> BoxedStrategy<(Vec<Vec<(DocId, u32)>>, Vec<u32>)> {
 (1u32..100u32)
 .prop_flat_map(move |max_doc: u32| {
|
||||
use crate::query::term_query::TermScorer;
|
||||
use crate::query::weight::{for_each_docset_buffered, for_each_pruning_scorer, for_each_scorer};
|
||||
use crate::query::{
|
||||
intersect_scorers, EmptyScorer, Exclude, Explanation, Occur, RequiredOptionalScorer, Scorer,
|
||||
Union, Weight,
|
||||
intersect_scorers, BufferedUnionScorer, EmptyScorer, Exclude, Explanation, Occur,
|
||||
RequiredOptionalScorer, Scorer, Weight,
|
||||
};
|
||||
use crate::{DocId, Score};
|
||||
|
||||
@@ -65,14 +65,17 @@ where
|
||||
// Block wand is only available if we read frequencies.
|
||||
return SpecializedScorer::TermUnion(scorers);
|
||||
} else {
|
||||
return SpecializedScorer::Other(Box::new(Union::build(
|
||||
return SpecializedScorer::Other(Box::new(BufferedUnionScorer::build(
|
||||
scorers,
|
||||
score_combiner_fn,
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
SpecializedScorer::Other(Box::new(Union::build(scorers, score_combiner_fn)))
|
||||
SpecializedScorer::Other(Box::new(BufferedUnionScorer::build(
|
||||
scorers,
|
||||
score_combiner_fn,
|
||||
)))
|
||||
}
|
||||
|
||||
fn into_box_scorer<TScoreCombiner: ScoreCombiner>(
|
||||
@@ -81,7 +84,7 @@ fn into_box_scorer<TScoreCombiner: ScoreCombiner>(
|
||||
) -> Box<dyn Scorer> {
|
||||
match scorer {
|
||||
SpecializedScorer::TermUnion(term_scorers) => {
|
||||
let union_scorer = Union::build(term_scorers, score_combiner_fn);
|
||||
let union_scorer = BufferedUnionScorer::build(term_scorers, score_combiner_fn);
|
||||
Box::new(union_scorer)
|
||||
}
|
||||
SpecializedScorer::Other(scorer) => scorer,
|
||||
@@ -296,7 +299,8 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
|
||||
let scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn)?;
|
||||
match scorer {
|
||||
SpecializedScorer::TermUnion(term_scorers) => {
|
||||
let mut union_scorer = Union::build(term_scorers, &self.score_combiner_fn);
|
||||
let mut union_scorer =
|
||||
BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn);
|
||||
for_each_scorer(&mut union_scorer, callback);
|
||||
}
|
||||
SpecializedScorer::Other(mut scorer) => {
|
||||
@@ -316,7 +320,8 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
|
||||
|
||||
match scorer {
|
||||
SpecializedScorer::TermUnion(term_scorers) => {
|
||||
let mut union_scorer = Union::build(term_scorers, &self.score_combiner_fn);
|
||||
let mut union_scorer =
|
||||
BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn);
|
||||
for_each_docset_buffered(&mut union_scorer, &mut buffer, callback);
|
||||
}
|
||||
SpecializedScorer::Other(mut scorer) => {
|
||||
|
||||
@@ -51,6 +51,7 @@ pub use self::fuzzy_query::FuzzyTermQuery;
 pub use self::intersection::{intersect_scorers, Intersection};
 pub use self::more_like_this::{MoreLikeThisQuery, MoreLikeThisQueryBuilder};
 pub use self::phrase_prefix_query::PhrasePrefixQuery;
+pub use self::phrase_query::regex_phrase_query::{wildcard_query_to_regex_str, RegexPhraseQuery};
 pub use self::phrase_query::PhraseQuery;
 pub use self::query::{EnableScoring, Query, QueryClone};
 pub use self::query_parser::{QueryParser, QueryParserError};

@@ -61,7 +62,7 @@ pub use self::score_combiner::{DisjunctionMaxCombiner, ScoreCombiner, SumCombine
 pub use self::scorer::Scorer;
 pub use self::set_query::TermSetQuery;
 pub use self::term_query::TermQuery;
-pub use self::union::Union;
+pub use self::union::BufferedUnionScorer;
 #[cfg(test)]
 pub use self::vec_docset::VecDocSet;
 pub use self::weight::Weight;

@@ -8,7 +8,7 @@ use crate::{DocId, Score};

 // MultiPrefix is the larger variant, and also the one we expect most often. PhraseScorer is > 1kB
 // though, it would be interesting to slim it down if possible.
-#[allow(clippy::large_enum_variant)]
+#[expect(clippy::large_enum_variant)]
 enum PhraseKind<TPostings: Postings> {
 SinglePrefix {
 position_offset: u32,
@@ -53,27 +53,14 @@ impl PhrasePrefixWeight {
 .map(|similarity_weight| similarity_weight.boost_by(boost));
 let fieldnorm_reader = self.fieldnorm_reader(reader)?;
 let mut term_postings_list = Vec::new();
 if reader.has_deletes() {
 for &(offset, ref term) in &self.phrase_terms {
 if let Some(postings) = reader
 .inverted_index(term.field())?
 .read_postings(term, IndexRecordOption::WithFreqsAndPositions)?
 {
 term_postings_list.push((offset, postings));
 } else {
 return Ok(None);
 }
 }
 } else {
 for &(offset, ref term) in &self.phrase_terms {
 if let Some(postings) = reader
 .inverted_index(term.field())?
 .read_postings_no_deletes(term, IndexRecordOption::WithFreqsAndPositions)?
 {
 term_postings_list.push((offset, postings));
 } else {
 return Ok(None);
 }
 for &(offset, ref term) in &self.phrase_terms {
 if let Some(postings) = reader
 .inverted_index(term.field())?
 .read_postings(term, IndexRecordOption::WithFreqsAndPositions)?
 {
 term_postings_list.push((offset, postings));
 } else {
 return Ok(None);
 }
 }

@@ -109,8 +96,8 @@ impl PhrasePrefixWeight {
 {
 suffixes.push(postings);
 }
-} else if let Some(postings) = inv_index
-.read_postings_no_deletes(&new_term, IndexRecordOption::WithFreqsAndPositions)?
+} else if let Some(postings) =
+inv_index.read_postings(&new_term, IndexRecordOption::WithFreqsAndPositions)?
 {
 suffixes.push(postings);
 }
@@ -1,6 +1,8 @@
 mod phrase_query;
 mod phrase_scorer;
 mod phrase_weight;
+pub mod regex_phrase_query;
+mod regex_phrase_weight;

 pub use self::phrase_query::PhraseQuery;
 pub(crate) use self::phrase_scorer::intersection_count;

@@ -8,7 +10,7 @@ pub use self::phrase_scorer::PhraseScorer;
 pub use self::phrase_weight::PhraseWeight;

 #[cfg(test)]
-pub mod tests {
+pub(crate) mod tests {

 use serde_json::json;
@@ -19,15 +21,15 @@ pub mod tests {
 use crate::schema::{Schema, Term, TEXT};
 use crate::{assert_nearly_equals, DocAddress, DocId, IndexWriter, TERMINATED};

-pub fn create_index(texts: &[&'static str]) -> crate::Result<Index> {
+pub fn create_index<S: AsRef<str>>(texts: &[S]) -> crate::Result<Index> {
 let mut schema_builder = Schema::builder();
 let text_field = schema_builder.add_text_field("text", TEXT);
 let schema = schema_builder.build();
 let index = Index::create_in_ram(schema);
 {
 let mut index_writer: IndexWriter = index.writer_for_tests()?;
-for &text in texts {
-let doc = doc!(text_field=>text);
+for text in texts {
+let doc = doc!(text_field=>text.as_ref());
 index_writer.add_document(doc)?;
 }
 index_writer.commit()?;
@@ -50,27 +50,14 @@ impl PhraseWeight {
|
||||
.map(|similarity_weight| similarity_weight.boost_by(boost));
|
||||
let fieldnorm_reader = self.fieldnorm_reader(reader)?;
|
||||
let mut term_postings_list = Vec::new();
|
||||
if reader.has_deletes() {
|
||||
for &(offset, ref term) in &self.phrase_terms {
|
||||
if let Some(postings) = reader
|
||||
.inverted_index(term.field())?
|
||||
.read_postings(term, IndexRecordOption::WithFreqsAndPositions)?
|
||||
{
|
||||
term_postings_list.push((offset, postings));
|
||||
} else {
|
||||
return Ok(None);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for &(offset, ref term) in &self.phrase_terms {
|
||||
if let Some(postings) = reader
|
||||
.inverted_index(term.field())?
|
||||
.read_postings_no_deletes(term, IndexRecordOption::WithFreqsAndPositions)?
|
||||
{
|
||||
term_postings_list.push((offset, postings));
|
||||
} else {
|
||||
return Ok(None);
|
||||
}
|
||||
for &(offset, ref term) in &self.phrase_terms {
|
||||
if let Some(postings) = reader
|
||||
.inverted_index(term.field())?
|
||||
.read_postings(term, IndexRecordOption::WithFreqsAndPositions)?
|
||||
{
|
||||
term_postings_list.push((offset, postings));
|
||||
} else {
|
||||
return Ok(None);
|
||||
}
|
||||
}
|
||||
Ok(Some(PhraseScorer::new(
172
src/query/phrase_query/regex_phrase_query.rs
Normal file
@@ -0,0 +1,172 @@
|
||||
use super::regex_phrase_weight::RegexPhraseWeight;
|
||||
use crate::query::bm25::Bm25Weight;
|
||||
use crate::query::{EnableScoring, Query, Weight};
|
||||
use crate::schema::{Field, IndexRecordOption, Term, Type};
|
||||
|
||||
/// `RegexPhraseQuery` matches a specific sequence of regex queries.
|
||||
///
|
||||
/// For instance, the phrase query for `"pa.* time"` will match
|
||||
/// the sentence:
|
||||
///
|
||||
/// **Alan just got a part time job.**
|
||||
///
|
||||
/// On the other hand, it will not match the sentence:
|
||||
///
|
||||
/// **This is my favorite part of the job.**
|
||||
///
|
||||
/// [Slop](RegexPhraseQuery::set_slop) allows leniency in term proximity
|
||||
/// for some performance trade-off.
|
||||
///
|
||||
/// Using a `RegexPhraseQuery` on a field requires positions
|
||||
/// to be indexed for this field.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct RegexPhraseQuery {
|
||||
field: Field,
|
||||
phrase_terms: Vec<(usize, String)>,
|
||||
slop: u32,
|
||||
max_expansions: u32,
|
||||
}
|
||||
|
||||
/// Transform a wildcard query to a regex string.
|
||||
///
|
||||
/// `AB*CD` for example is converted to `AB.*CD`
|
||||
///
|
||||
/// All other chars are regex escaped.
|
||||
pub fn wildcard_query_to_regex_str(term: &str) -> String {
|
||||
regex::escape(term).replace(r"\*", ".*")
|
||||
}
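// Illustrative sketch (not part of the original change): a minimal check of the conversion done
// by `wildcard_query_to_regex_str` above, derived directly from its body
// (`regex::escape` followed by rewriting the escaped `\*` to `.*`).
#[cfg(test)]
mod wildcard_conversion_sketch {
    use super::wildcard_query_to_regex_str;

    #[test]
    fn wildcards_become_dot_star_and_the_rest_is_escaped() {
        assert_eq!(wildcard_query_to_regex_str("AB*CD"), "AB.*CD");
        // `+` is regex-escaped; only `*` is rewritten to `.*`.
        assert_eq!(wildcard_query_to_regex_str("a+b*"), r"a\+b.*");
    }
}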
|
||||
|
||||
impl RegexPhraseQuery {
|
||||
/// Creates a new `RegexPhraseQuery` given a list of terms.
|
||||
///
|
||||
/// There must be at least two terms, and all terms
|
||||
/// must belong to the same field.
|
||||
///
|
||||
/// The offset for each term will be the same as its index in the vector.
|
||||
pub fn new(field: Field, terms: Vec<String>) -> RegexPhraseQuery {
|
||||
let terms_with_offset = terms.into_iter().enumerate().collect();
|
||||
RegexPhraseQuery::new_with_offset(field, terms_with_offset)
|
||||
}
|
||||
|
||||
/// Creates a new `RegexPhraseQuery` given a list of terms and their offsets.
|
||||
///
|
||||
/// Can be used to provide custom offset for each term.
|
||||
pub fn new_with_offset(field: Field, terms: Vec<(usize, String)>) -> RegexPhraseQuery {
|
||||
RegexPhraseQuery::new_with_offset_and_slop(field, terms, 0)
|
||||
}
|
||||
|
||||
/// Creates a new `RegexPhraseQuery` given a list of terms, their offsets and a slop
|
||||
pub fn new_with_offset_and_slop(
|
||||
field: Field,
|
||||
mut terms: Vec<(usize, String)>,
|
||||
slop: u32,
|
||||
) -> RegexPhraseQuery {
|
||||
assert!(
|
||||
terms.len() > 1,
|
||||
"A phrase query is required to have strictly more than one term."
|
||||
);
|
||||
terms.sort_by_key(|&(offset, _)| offset);
|
||||
RegexPhraseQuery {
|
||||
field,
|
||||
phrase_terms: terms,
|
||||
slop,
|
||||
max_expansions: 1 << 14,
|
||||
}
|
||||
}
|
||||
|
||||
/// Slop allowed for the phrase.
|
||||
///
|
||||
/// The query will match if its terms are separated by `slop` terms at most.
|
||||
/// The slop can be considered a budget between all terms.
|
||||
/// E.g. "A B C" with slop 1 allows "A X B C", "A B X C", but not "A X B X C".
|
||||
///
|
||||
/// Transposition costs 2, e.g. "A B" with slop 1 will not match "B A", but it would with slop 2.
|
||||
/// Transposition is not a special case, in the example above A is moved 1 position and B is
|
||||
/// moved 1 position, so the slop is 2.
|
||||
///
|
||||
/// As a result slop works in both directions, so the order of the terms may change as long as
|
||||
/// they respect the slop.
|
||||
///
|
||||
/// By default the slop is 0 meaning query terms need to be adjacent.
|
||||
pub fn set_slop(&mut self, value: u32) {
|
||||
self.slop = value;
|
||||
}
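// Illustrative sketch (not part of the original change) of the slop budget described above,
// reusing the `create_index` test helper (TEXT field named "text") and plain literals as the
// regexes:
//
//     let index = create_index(&["a x c", "a c"])?;
//     let mut query = RegexPhraseQuery::new(text_field, vec!["a".into(), "c".into()]);
//     // With the default slop of 0, only "a c" matches (the terms are adjacent).
//     query.set_slop(1);
//     // With slop 1, "a x c" matches as well: one extra term may sit between "a" and "c".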
|
||||
|
||||
/// Sets the maximum number of terms a regex may expand to. The limit applies across all terms of the phrase.
|
||||
/// After the limit is hit an error will be returned.
|
||||
pub fn set_max_expansions(&mut self, value: u32) {
|
||||
self.max_expansions = value;
|
||||
}
|
||||
|
||||
/// The [`Field`] this `RegexPhraseQuery` is targeting.
|
||||
pub fn field(&self) -> Field {
|
||||
self.field
|
||||
}
|
||||
|
||||
/// `Term`s in the phrase without the associated offsets.
|
||||
pub fn phrase_terms(&self) -> Vec<Term> {
|
||||
self.phrase_terms
|
||||
.iter()
|
||||
.map(|(_, term)| Term::from_field_text(self.field, term))
|
||||
.collect::<Vec<Term>>()
|
||||
}
|
||||
|
||||
/// Returns the [`RegexPhraseWeight`] for the given phrase query given a specific `searcher`.
|
||||
///
|
||||
/// This function is the same as [`Query::weight()`] except it returns
|
||||
/// a specialized type [`RegexPhraseWeight`] instead of a Boxed trait.
|
||||
pub(crate) fn regex_phrase_weight(
|
||||
&self,
|
||||
enable_scoring: EnableScoring<'_>,
|
||||
) -> crate::Result<RegexPhraseWeight> {
|
||||
let schema = enable_scoring.schema();
|
||||
let field_type = schema.get_field_entry(self.field).field_type().value_type();
|
||||
if field_type != Type::Str {
|
||||
return Err(crate::TantivyError::SchemaError(format!(
|
||||
"RegexPhraseQuery can only be used with a field of type text currently, but got \
|
||||
{:?}",
|
||||
field_type
|
||||
)));
|
||||
}
|
||||
|
||||
let field_entry = schema.get_field_entry(self.field);
|
||||
let has_positions = field_entry
|
||||
.field_type()
|
||||
.get_index_record_option()
|
||||
.map(IndexRecordOption::has_positions)
|
||||
.unwrap_or(false);
|
||||
if !has_positions {
|
||||
let field_name = field_entry.name();
|
||||
return Err(crate::TantivyError::SchemaError(format!(
|
||||
"Applied phrase query on field {field_name:?}, which does not have positions \
|
||||
indexed"
|
||||
)));
|
||||
}
|
||||
let terms = self.phrase_terms();
|
||||
let bm25_weight_opt = match enable_scoring {
|
||||
EnableScoring::Enabled {
|
||||
statistics_provider,
|
||||
..
|
||||
} => Some(Bm25Weight::for_terms(statistics_provider, &terms)?),
|
||||
EnableScoring::Disabled { .. } => None,
|
||||
};
|
||||
let weight = RegexPhraseWeight::new(
|
||||
self.field,
|
||||
self.phrase_terms.clone(),
|
||||
bm25_weight_opt,
|
||||
self.max_expansions,
|
||||
self.slop,
|
||||
);
|
||||
Ok(weight)
|
||||
}
|
||||
}
|
||||
|
||||
impl Query for RegexPhraseQuery {
|
||||
/// Create the weight associated with a query.
|
||||
///
|
||||
/// See [`Weight`].
|
||||
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
|
||||
let phrase_weight = self.regex_phrase_weight(enable_scoring)?;
|
||||
Ok(Box::new(phrase_weight))
|
||||
}
|
||||
}
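// Illustrative usage sketch (not part of the original change): drives a `RegexPhraseQuery` end
// to end through `Searcher::search`, reusing the `create_index` helper from the phrase_query
// test module (which indexes a single TEXT field named "text").
#[cfg(test)]
mod usage_sketch {
    use super::super::tests::create_index;
    use super::RegexPhraseQuery;
    use crate::collector::Count;

    #[test]
    fn count_matches_for_a_regex_phrase() -> crate::Result<()> {
        let index = create_index(&[
            "alan just got a part time job",
            "this is my favorite part of the job",
        ])?;
        let text_field = index.schema().get_field("text").unwrap();
        let searcher = index.reader()?.searcher();

        let query = RegexPhraseQuery::new(text_field, vec!["pa.*".into(), "time".into()]);
        // Only the first document contains a term matching "pa.*" directly followed by "time".
        assert_eq!(searcher.search(&query, &Count)?, 1);
        Ok(())
    }
}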
475
src/query/phrase_query/regex_phrase_weight.rs
Normal file
@@ -0,0 +1,475 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use common::BitSet;
|
||||
use tantivy_fst::Regex;
|
||||
|
||||
use super::PhraseScorer;
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::index::SegmentReader;
|
||||
use crate::postings::{LoadedPostings, Postings, SegmentPostings, TermInfo};
|
||||
use crate::query::bm25::Bm25Weight;
|
||||
use crate::query::explanation::does_not_match;
|
||||
use crate::query::union::{BitSetPostingUnion, SimpleUnion};
|
||||
use crate::query::{AutomatonWeight, BitSetDocSet, EmptyScorer, Explanation, Scorer, Weight};
|
||||
use crate::schema::{Field, IndexRecordOption};
|
||||
use crate::{DocId, DocSet, InvertedIndexReader, Score};
|
||||
|
||||
type UnionType = SimpleUnion<Box<dyn Postings + 'static>>;
|
||||
|
||||
/// The `RegexPhraseWeight` is the weight associated with a regex phrase query.
|
||||
/// See `RegexPhraseWeight::get_union_from_term_infos` for some design decisions.
|
||||
pub struct RegexPhraseWeight {
|
||||
field: Field,
|
||||
phrase_terms: Vec<(usize, String)>,
|
||||
similarity_weight_opt: Option<Bm25Weight>,
|
||||
slop: u32,
|
||||
max_expansions: u32,
|
||||
}
|
||||
|
||||
impl RegexPhraseWeight {
|
||||
/// Creates a new phrase weight.
|
||||
/// If `similarity_weight_opt` is None, then scoring is disabled
|
||||
pub fn new(
|
||||
field: Field,
|
||||
phrase_terms: Vec<(usize, String)>,
|
||||
similarity_weight_opt: Option<Bm25Weight>,
|
||||
max_expansions: u32,
|
||||
slop: u32,
|
||||
) -> RegexPhraseWeight {
|
||||
RegexPhraseWeight {
|
||||
field,
|
||||
phrase_terms,
|
||||
similarity_weight_opt,
|
||||
slop,
|
||||
max_expansions,
|
||||
}
|
||||
}
|
||||
|
||||
fn fieldnorm_reader(&self, reader: &SegmentReader) -> crate::Result<FieldNormReader> {
|
||||
if self.similarity_weight_opt.is_some() {
|
||||
if let Some(fieldnorm_reader) = reader.fieldnorms_readers().get_field(self.field)? {
|
||||
return Ok(fieldnorm_reader);
|
||||
}
|
||||
}
|
||||
Ok(FieldNormReader::constant(reader.max_doc(), 1))
|
||||
}
|
||||
|
||||
pub(crate) fn phrase_scorer(
|
||||
&self,
|
||||
reader: &SegmentReader,
|
||||
boost: Score,
|
||||
) -> crate::Result<Option<PhraseScorer<UnionType>>> {
|
||||
let similarity_weight_opt = self
|
||||
.similarity_weight_opt
|
||||
.as_ref()
|
||||
.map(|similarity_weight| similarity_weight.boost_by(boost));
|
||||
let fieldnorm_reader = self.fieldnorm_reader(reader)?;
|
||||
let mut posting_lists = Vec::new();
|
||||
let inverted_index = reader.inverted_index(self.field)?;
|
||||
let mut num_terms = 0;
|
||||
for &(offset, ref term) in &self.phrase_terms {
|
||||
let regex = Regex::new(term)
|
||||
.map_err(|e| crate::TantivyError::InvalidArgument(format!("Invalid regex: {e}")))?;
|
||||
|
||||
let automaton: AutomatonWeight<Regex> =
|
||||
AutomatonWeight::new(self.field, Arc::new(regex));
|
||||
let term_infos = automaton.get_match_term_infos(reader)?;
|
||||
// If term_infos is empty, the phrase can not match any documents.
|
||||
if term_infos.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
num_terms += term_infos.len();
|
||||
if num_terms > self.max_expansions as usize {
|
||||
return Err(crate::TantivyError::InvalidArgument(format!(
|
||||
"Phrase query exceeded max expansions {}",
|
||||
num_terms
|
||||
)));
|
||||
}
|
||||
let union = Self::get_union_from_term_infos(&term_infos, reader, &inverted_index)?;
|
||||
|
||||
posting_lists.push((offset, union));
|
||||
}
|
||||
|
||||
Ok(Some(PhraseScorer::new(
|
||||
posting_lists,
|
||||
similarity_weight_opt,
|
||||
fieldnorm_reader,
|
||||
self.slop,
|
||||
)))
|
||||
}
|
||||
|
||||
/// Add all docs of the term to the docset
|
||||
fn add_to_bitset(
|
||||
inverted_index: &InvertedIndexReader,
|
||||
term_info: &TermInfo,
|
||||
doc_bitset: &mut BitSet,
|
||||
) -> crate::Result<()> {
|
||||
let mut block_segment_postings = inverted_index
|
||||
.read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic)?;
|
||||
loop {
|
||||
let docs = block_segment_postings.docs();
|
||||
if docs.is_empty() {
|
||||
break;
|
||||
}
|
||||
for &doc in docs {
|
||||
doc_bitset.insert(doc);
|
||||
}
|
||||
block_segment_postings.advance();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// This function generates a union of document sets from multiple term information
|
||||
/// (`TermInfo`).
|
||||
///
|
||||
/// It uses bucketing based on term frequency to optimize query performance and memory usage.
|
||||
/// The terms are divided into buckets based on their document frequency (the number of
|
||||
/// documents they appear in).
|
||||
///
|
||||
/// ### Bucketing Strategy:
|
||||
/// Once a bucket contains more than 512 terms, it is moved to the end of the list and replaced
|
||||
/// with a new empty bucket.
|
||||
///
|
||||
/// - **Sparse Term Buckets**: Terms with document frequency `< 100`.
|
||||
///
|
||||
/// Each sparse bucket contains:
|
||||
/// - A `BitSet` to efficiently track which document IDs are present in the bucket, which is
|
||||
/// used to drive the `DocSet`.
|
||||
/// - A `Vec<LoadedPostings>` to store the postings for each term in that bucket.
|
||||
///
|
||||
/// - **Other Term Buckets**:
|
||||
/// - **Bucket 0**: Terms appearing in less than `0.1%` of documents.
|
||||
/// - **Bucket 1**: Terms appearing in `0.1%` to `1%` of documents.
|
||||
/// - **Bucket 2**: Terms appearing in `1%` to `10%` of documents.
|
||||
/// - **Bucket 3**: Terms appearing in more than `10%` of documents.
|
||||
///
|
||||
/// Each bucket contains:
|
||||
/// - A `BitSet` to efficiently track which document IDs are present in the bucket.
|
||||
/// - A `Vec<SegmentPostings>` to store the postings for each term in that bucket.
|
||||
///
|
||||
/// ### Design Choices:
|
||||
/// The main cost for an _unbucketed_ regex phrase query with a medium/high number of terms is
|
||||
/// the `append_positions_with_offset` from `Postings`.
|
||||
/// We don't know which docsets hit, so we need to scan all of them to check if they contain the
|
||||
/// docid.
|
||||
/// The bucketing strategy groups less common DocSets together, so we can rule out the
|
||||
/// whole docset group in many cases.
|
||||
///
|
||||
/// E.g. consider the phrase "th* world"
|
||||
/// It contains the term "the", which may occur in almost all documents.
|
||||
/// It may also expand to 10_000s of very rare terms like "theologian".
|
||||
///
|
||||
/// For very low-frequency terms (sparse terms), we use `LoadedPostings` and aggregate
|
||||
/// their document IDs into a `BitSet`, which is more memory-efficient than using
|
||||
/// `SegmentPostings`. E.g. 100_000 terms with SegmentPostings would consume 184MB.
|
||||
/// `SegmentPostings` uses memory equivalent to 460 docids. The 100 docs limit should be
|
||||
/// fine as long as a term doesn't have too many positions per doc.
|
||||
///
|
||||
/// ### Future Optimization:
|
||||
/// A larger performance improvement would be an additional partitioning of the space
|
||||
/// vertically into blocks of u16::MAX docids, where we mark which docset ord has values in each block.
|
||||
/// E.g. partitioning an index with 5 million documents this way would reduce the number of
|
||||
/// docsets to scan to around 1/20 in the sparse term bucket where the terms only have a few
|
||||
/// docs. For higher cardinality buckets this is irrelevant as they are in most blocks.
|
||||
///
|
||||
/// Use Roaring Bitmaps for sparse terms: the full bitvec is currently the main memory consumer.
|
||||
pub(crate) fn get_union_from_term_infos(
|
||||
term_infos: &[TermInfo],
|
||||
reader: &SegmentReader,
|
||||
inverted_index: &InvertedIndexReader,
|
||||
) -> crate::Result<UnionType> {
|
||||
let max_doc = reader.max_doc();
|
||||
|
||||
// Buckets for sparse terms
|
||||
let mut sparse_buckets: Vec<(BitSet, Vec<LoadedPostings>)> =
|
||||
vec![(BitSet::with_max_value(max_doc), Vec::new())];
|
||||
|
||||
// Buckets for other terms based on document frequency percentages:
|
||||
// - Bucket 0: Terms appearing in less than 0.1% of documents
|
||||
// - Bucket 1: Terms appearing in 0.1% to 1% of documents
|
||||
// - Bucket 2: Terms appearing in 1% to 10% of documents
|
||||
// - Bucket 3: Terms appearing in more than 10% of documents
|
||||
let mut buckets: Vec<(BitSet, Vec<SegmentPostings>)> = (0..4)
|
||||
.map(|_| (BitSet::with_max_value(max_doc), Vec::new()))
|
||||
.collect();
|
||||
|
||||
const SPARSE_TERM_DOC_THRESHOLD: u32 = 100;
|
||||
|
||||
for term_info in term_infos {
|
||||
let mut term_posting = inverted_index
|
||||
.read_postings_from_terminfo(term_info, IndexRecordOption::WithFreqsAndPositions)?;
|
||||
let num_docs = term_posting.doc_freq();
|
||||
|
||||
if num_docs < SPARSE_TERM_DOC_THRESHOLD {
|
||||
let current_bucket = &mut sparse_buckets[0];
|
||||
Self::add_to_bitset(inverted_index, term_info, &mut current_bucket.0)?;
|
||||
let docset = LoadedPostings::load(&mut term_posting);
|
||||
current_bucket.1.push(docset);
|
||||
|
||||
// Move the bucket to the end if the term limit is reached
|
||||
if current_bucket.1.len() == 512 {
|
||||
sparse_buckets.push((BitSet::with_max_value(max_doc), Vec::new()));
|
||||
let end_index = sparse_buckets.len() - 1;
|
||||
sparse_buckets.swap(0, end_index);
|
||||
}
|
||||
} else {
|
||||
// Calculate the percentage of documents the term appears in
|
||||
let doc_freq_percentage = (num_docs as f32) / (max_doc as f32) * 100.0;
|
||||
|
||||
// Determine the appropriate bucket based on percentage thresholds
|
||||
let bucket_index = if doc_freq_percentage < 0.1 {
|
||||
0
|
||||
} else if doc_freq_percentage < 1.0 {
|
||||
1
|
||||
} else if doc_freq_percentage < 10.0 {
|
||||
2
|
||||
} else {
|
||||
3
|
||||
};
|
||||
let bucket = &mut buckets[bucket_index];
|
||||
|
||||
// Add term postings to the appropriate bucket
|
||||
Self::add_to_bitset(inverted_index, term_info, &mut bucket.0)?;
|
||||
bucket.1.push(term_posting);
|
||||
|
||||
// Move the bucket to the end if the term limit is reached
|
||||
if bucket.1.len() == 512 {
|
||||
buckets.push((BitSet::with_max_value(max_doc), Vec::new()));
|
||||
let end_index = buckets.len() - 1;
|
||||
buckets.swap(bucket_index, end_index);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Build unions for sparse term buckets
|
||||
let sparse_term_docsets: Vec<_> = sparse_buckets
|
||||
.into_iter()
|
||||
.filter(|(_, postings)| !postings.is_empty())
|
||||
.map(|(bitset, postings)| {
|
||||
BitSetPostingUnion::build(postings, BitSetDocSet::from(bitset))
|
||||
})
|
||||
.collect();
|
||||
let sparse_term_unions = SimpleUnion::build(sparse_term_docsets);
|
||||
|
||||
// Build unions for other term buckets
|
||||
let bitset_unions_per_bucket: Vec<_> = buckets
|
||||
.into_iter()
|
||||
.filter(|(_, postings)| !postings.is_empty())
|
||||
.map(|(bitset, postings)| {
|
||||
BitSetPostingUnion::build(postings, BitSetDocSet::from(bitset))
|
||||
})
|
||||
.collect();
|
||||
let other_union = SimpleUnion::build(bitset_unions_per_bucket);
|
||||
|
||||
let union: SimpleUnion<Box<dyn Postings + 'static>> =
|
||||
SimpleUnion::build(vec![Box::new(sparse_term_unions), Box::new(other_union)]);
|
||||
|
||||
// Return a union of sparse term unions and other term unions
|
||||
Ok(union)
|
||||
}
|
||||
}
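// Illustrative sketch (not part of the original change): the bucket selection described in the
// docs of `get_union_from_term_infos`, written out as a standalone function. The thresholds
// (100 docs for the sparse bucket, then 0.1% / 1% / 10% of `max_doc`) are copied from the code
// above and this helper is not used by the implementation itself.
#[cfg(test)]
#[allow(dead_code)]
fn bucket_for_doc_freq_sketch(doc_freq: u32, max_doc: u32) -> &'static str {
    if doc_freq < 100 {
        return "sparse bucket (LoadedPostings + shared BitSet)";
    }
    let doc_freq_percentage = (doc_freq as f32) / (max_doc as f32) * 100.0;
    if doc_freq_percentage < 0.1 {
        "bucket 0: < 0.1% of documents"
    } else if doc_freq_percentage < 1.0 {
        "bucket 1: 0.1% to 1%"
    } else if doc_freq_percentage < 10.0 {
        "bucket 2: 1% to 10%"
    } else {
        "bucket 3: more than 10% of documents"
    }
}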
|
||||
|
||||
impl Weight for RegexPhraseWeight {
|
||||
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
|
||||
if let Some(scorer) = self.phrase_scorer(reader, boost)? {
|
||||
Ok(Box::new(scorer))
|
||||
} else {
|
||||
Ok(Box::new(EmptyScorer))
|
||||
}
|
||||
}
|
||||
|
||||
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
|
||||
let scorer_opt = self.phrase_scorer(reader, 1.0)?;
|
||||
if scorer_opt.is_none() {
|
||||
return Err(does_not_match(doc));
|
||||
}
|
||||
let mut scorer = scorer_opt.unwrap();
|
||||
if scorer.seek(doc) != doc {
|
||||
return Err(does_not_match(doc));
|
||||
}
|
||||
let fieldnorm_reader = self.fieldnorm_reader(reader)?;
|
||||
let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc);
|
||||
let phrase_count = scorer.phrase_count();
|
||||
let mut explanation = Explanation::new("Phrase Scorer", scorer.score());
|
||||
if let Some(similarity_weight) = self.similarity_weight_opt.as_ref() {
|
||||
explanation.add_detail(similarity_weight.explain(fieldnorm_id, phrase_count));
|
||||
}
|
||||
Ok(explanation)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use proptest::prelude::*;
|
||||
use rand::seq::SliceRandom;
|
||||
|
||||
use super::super::tests::create_index;
|
||||
use crate::docset::TERMINATED;
|
||||
use crate::query::{wildcard_query_to_regex_str, EnableScoring, RegexPhraseQuery};
|
||||
use crate::DocSet;
|
||||
|
||||
proptest! {
|
||||
#![proptest_config(ProptestConfig::with_cases(50))]
|
||||
#[test]
|
||||
fn test_phrase_regex_with_random_strings(mut random_strings in proptest::collection::vec("[c-z ]{0,10}", 1..100), num_occurrences in 1..150_usize) {
|
||||
let mut rng = rand::thread_rng();
|
||||
|
||||
// Insert "aaa ccc" the specified number of times into the list
|
||||
for _ in 0..num_occurrences {
|
||||
random_strings.push("aaa ccc".to_string());
|
||||
}
|
||||
// Shuffle the list, which now contains random strings and the inserted "aaa ccc"
|
||||
random_strings.shuffle(&mut rng);
|
||||
|
||||
// Compute the positions of "aaa ccc" after the shuffle
|
||||
let aaa_ccc_positions: Vec<usize> = random_strings
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter_map(|(idx, s)| if s == "aaa ccc" { Some(idx) } else { None })
|
||||
.collect();
|
||||
|
||||
// Create the index with random strings and the fixed string "aaa ccc"
|
||||
let index = create_index(&random_strings.iter().map(AsRef::as_ref).collect::<Vec<&str>>())?;
|
||||
let schema = index.schema();
|
||||
let text_field = schema.get_field("text").unwrap();
|
||||
let searcher = index.reader()?.searcher();
|
||||
|
||||
let phrase_query = RegexPhraseQuery::new(text_field, vec![wildcard_query_to_regex_str("a*"), wildcard_query_to_regex_str("c*")]);
|
||||
|
||||
let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
|
||||
let phrase_weight = phrase_query.regex_phrase_weight(enable_scoring).unwrap();
|
||||
let mut phrase_scorer = phrase_weight
|
||||
.phrase_scorer(searcher.segment_reader(0u32), 1.0)?
|
||||
.unwrap();
|
||||
|
||||
// Check if the scorer returns the correct document positions for "aaa ccc"
|
||||
for expected_doc in aaa_ccc_positions {
|
||||
prop_assert_eq!(phrase_scorer.doc(), expected_doc as u32);
|
||||
prop_assert_eq!(phrase_scorer.phrase_count(), 1);
|
||||
phrase_scorer.advance();
|
||||
}
|
||||
prop_assert_eq!(phrase_scorer.advance(), TERMINATED);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_phrase_count() -> crate::Result<()> {
|
||||
let index = create_index(&["a c", "a a b d a b c", " a b"])?;
|
||||
let schema = index.schema();
|
||||
let text_field = schema.get_field("text").unwrap();
|
||||
let searcher = index.reader()?.searcher();
|
||||
let phrase_query = RegexPhraseQuery::new(text_field, vec!["a".into(), "b".into()]);
|
||||
let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
|
||||
let phrase_weight = phrase_query.regex_phrase_weight(enable_scoring).unwrap();
|
||||
let mut phrase_scorer = phrase_weight
|
||||
.phrase_scorer(searcher.segment_reader(0u32), 1.0)?
|
||||
.unwrap();
|
||||
assert_eq!(phrase_scorer.doc(), 1);
|
||||
assert_eq!(phrase_scorer.phrase_count(), 2);
|
||||
assert_eq!(phrase_scorer.advance(), 2);
|
||||
assert_eq!(phrase_scorer.doc(), 2);
|
||||
assert_eq!(phrase_scorer.phrase_count(), 1);
|
||||
assert_eq!(phrase_scorer.advance(), TERMINATED);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_phrase_wildcard() -> crate::Result<()> {
|
||||
let index = create_index(&["a c", "a aa b d ad b c", " ac b", "bac b"])?;
|
||||
let schema = index.schema();
|
||||
let text_field = schema.get_field("text").unwrap();
|
||||
let searcher = index.reader()?.searcher();
|
||||
let phrase_query = RegexPhraseQuery::new(text_field, vec!["a.*".into(), "b".into()]);
|
||||
let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
|
||||
let phrase_weight = phrase_query.regex_phrase_weight(enable_scoring).unwrap();
|
||||
let mut phrase_scorer = phrase_weight
|
||||
.phrase_scorer(searcher.segment_reader(0u32), 1.0)?
|
||||
.unwrap();
|
||||
assert_eq!(phrase_scorer.doc(), 1);
|
||||
assert_eq!(phrase_scorer.phrase_count(), 2);
|
||||
assert_eq!(phrase_scorer.advance(), 2);
|
||||
assert_eq!(phrase_scorer.doc(), 2);
|
||||
assert_eq!(phrase_scorer.phrase_count(), 1);
|
||||
assert_eq!(phrase_scorer.advance(), TERMINATED);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_phrase_regex() -> crate::Result<()> {
|
||||
let index = create_index(&["ba b", "a aa b d ad b c", "bac b"])?;
|
||||
let schema = index.schema();
|
||||
let text_field = schema.get_field("text").unwrap();
|
||||
let searcher = index.reader()?.searcher();
|
||||
let phrase_query = RegexPhraseQuery::new(text_field, vec!["b?a.*".into(), "b".into()]);
|
||||
let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
|
||||
let phrase_weight = phrase_query.regex_phrase_weight(enable_scoring).unwrap();
|
||||
let mut phrase_scorer = phrase_weight
|
||||
.phrase_scorer(searcher.segment_reader(0u32), 1.0)?
|
||||
.unwrap();
|
||||
assert_eq!(phrase_scorer.doc(), 0);
|
||||
assert_eq!(phrase_scorer.phrase_count(), 1);
|
||||
assert_eq!(phrase_scorer.advance(), 1);
|
||||
assert_eq!(phrase_scorer.phrase_count(), 2);
|
||||
assert_eq!(phrase_scorer.advance(), 2);
|
||||
assert_eq!(phrase_scorer.doc(), 2);
|
||||
assert_eq!(phrase_scorer.phrase_count(), 1);
|
||||
assert_eq!(phrase_scorer.advance(), TERMINATED);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_phrase_regex_with_slop() -> crate::Result<()> {
|
||||
let index = create_index(&["aaa bbb ccc ___ abc ddd bbb ccc"])?;
|
||||
let schema = index.schema();
|
||||
let text_field = schema.get_field("text").unwrap();
|
||||
let searcher = index.reader()?.searcher();
|
||||
let mut phrase_query = RegexPhraseQuery::new(text_field, vec!["a.*".into(), "c.*".into()]);
|
||||
phrase_query.set_slop(1);
|
||||
let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
|
||||
let phrase_weight = phrase_query.regex_phrase_weight(enable_scoring).unwrap();
|
||||
let mut phrase_scorer = phrase_weight
|
||||
.phrase_scorer(searcher.segment_reader(0u32), 1.0)?
|
||||
.unwrap();
|
||||
assert_eq!(phrase_scorer.doc(), 0);
|
||||
assert_eq!(phrase_scorer.phrase_count(), 1);
|
||||
assert_eq!(phrase_scorer.advance(), TERMINATED);
|
||||
|
||||
phrase_query.set_slop(2);
|
||||
let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
|
||||
let phrase_weight = phrase_query.regex_phrase_weight(enable_scoring).unwrap();
|
||||
let mut phrase_scorer = phrase_weight
|
||||
.phrase_scorer(searcher.segment_reader(0u32), 1.0)?
|
||||
.unwrap();
|
||||
assert_eq!(phrase_scorer.doc(), 0);
|
||||
assert_eq!(phrase_scorer.phrase_count(), 2);
|
||||
assert_eq!(phrase_scorer.advance(), TERMINATED);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_phrase_regex_double_wildcard() -> crate::Result<()> {
|
||||
let index = create_index(&["baaab bccccb"])?;
|
||||
let schema = index.schema();
|
||||
let text_field = schema.get_field("text").unwrap();
|
||||
let searcher = index.reader()?.searcher();
|
||||
let phrase_query = RegexPhraseQuery::new(
|
||||
text_field,
|
||||
vec![
|
||||
wildcard_query_to_regex_str("*a*"),
|
||||
wildcard_query_to_regex_str("*c*"),
|
||||
],
|
||||
);
|
||||
let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
|
||||
let phrase_weight = phrase_query.regex_phrase_weight(enable_scoring).unwrap();
|
||||
let mut phrase_scorer = phrase_weight
|
||||
.phrase_scorer(searcher.segment_reader(0u32), 1.0)?
|
||||
.unwrap();
|
||||
assert_eq!(phrase_scorer.doc(), 0);
|
||||
assert_eq!(phrase_scorer.phrase_count(), 1);
|
||||
assert_eq!(phrase_scorer.advance(), TERMINATED);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -402,7 +402,7 @@ fn search_on_u64_ff(
|
||||
boost: Score,
|
||||
bounds: BoundsRange<u64>,
|
||||
) -> crate::Result<Box<dyn Scorer>> {
|
||||
#[allow(clippy::reversed_empty_ranges)]
|
||||
#[expect(clippy::reversed_empty_ranges)]
|
||||
let value_range = bound_to_value_range(
|
||||
&bounds.lower_bound,
|
||||
&bounds.upper_bound,
|
||||
@@ -1386,7 +1386,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod ip_range_tests {
|
||||
pub(crate) mod ip_range_tests {
|
||||
use proptest::prelude::ProptestConfig;
|
||||
use proptest::strategy::Strategy;
|
||||
use proptest::{prop_oneof, proptest};
89
src/query/union/bitset_union.rs
Normal file
@@ -0,0 +1,89 @@
|
||||
use std::cell::RefCell;
|
||||
|
||||
use crate::docset::DocSet;
|
||||
use crate::postings::Postings;
|
||||
use crate::query::BitSetDocSet;
|
||||
use crate::DocId;
|
||||
|
||||
/// A `Postings` implementation that uses the bitset for hits and the docsets for the posting lists.
|
||||
///
|
||||
/// It is used for the regex phrase query, where we need the union of a large number of
|
||||
/// terms, but need to keep the docsets for the postings.
|
||||
pub struct BitSetPostingUnion<TDocSet> {
|
||||
/// The docsets are required to load positions
|
||||
///
|
||||
/// RefCell because we mutate in term_freq
|
||||
docsets: RefCell<Vec<TDocSet>>,
|
||||
/// The already unionized BitSet of the docsets
|
||||
bitset: BitSetDocSet,
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> BitSetPostingUnion<TDocSet> {
|
||||
pub(crate) fn build(
|
||||
docsets: Vec<TDocSet>,
|
||||
bitset: BitSetDocSet,
|
||||
) -> BitSetPostingUnion<TDocSet> {
|
||||
BitSetPostingUnion {
|
||||
docsets: RefCell::new(docsets),
|
||||
bitset,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: Postings> Postings for BitSetPostingUnion<TDocSet> {
|
||||
fn term_freq(&self) -> u32 {
|
||||
let curr_doc = self.bitset.doc();
|
||||
let mut term_freq = 0;
|
||||
let mut docsets = self.docsets.borrow_mut();
|
||||
for docset in docsets.iter_mut() {
|
||||
if docset.doc() < curr_doc {
|
||||
docset.seek(curr_doc);
|
||||
}
|
||||
if docset.doc() == curr_doc {
|
||||
term_freq += docset.term_freq();
|
||||
}
|
||||
}
|
||||
term_freq
|
||||
}
|
||||
|
||||
fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
|
||||
let curr_doc = self.bitset.doc();
|
||||
let mut docsets = self.docsets.borrow_mut();
|
||||
for docset in docsets.iter_mut() {
|
||||
if docset.doc() < curr_doc {
|
||||
docset.seek(curr_doc);
|
||||
}
|
||||
if docset.doc() == curr_doc {
|
||||
docset.append_positions_with_offset(offset, output);
|
||||
}
|
||||
}
|
||||
debug_assert!(
|
||||
!output.is_empty(),
|
||||
"this method should only be called if positions are available"
|
||||
);
|
||||
output.sort_unstable();
|
||||
output.dedup();
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> DocSet for BitSetPostingUnion<TDocSet> {
|
||||
fn advance(&mut self) -> DocId {
|
||||
self.bitset.advance()
|
||||
}
|
||||
|
||||
fn seek(&mut self, target: DocId) -> DocId {
|
||||
self.bitset.seek(target)
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.bitset.doc()
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.bitset.size_hint()
|
||||
}
|
||||
|
||||
fn count_including_deleted(&mut self) -> u32 {
|
||||
self.bitset.count_including_deleted()
|
||||
}
|
||||
}
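// Illustrative usage sketch (not part of the original change): mirrors the
// `posting_list_union_from_docs_list` helper in the union test module. The pre-computed `BitSet`
// drives iteration; the inner docsets are only consulted lazily for frequencies and positions.
#[cfg(test)]
mod bitset_posting_union_sketch {
    use common::BitSet;

    use super::BitSetPostingUnion;
    use crate::docset::{DocSet, TERMINATED};
    use crate::query::{BitSetDocSet, VecDocSet};

    #[test]
    fn union_iterates_over_the_precomputed_bitset() {
        let docs_list = vec![vec![1u32, 5, 9], vec![2u32, 5]];
        let mut bitset = BitSet::with_max_value(10);
        for &doc in docs_list.iter().flatten() {
            bitset.insert(doc);
        }
        let docsets: Vec<VecDocSet> = docs_list.into_iter().map(VecDocSet::from).collect();
        let mut union = BitSetPostingUnion::build(docsets, BitSetDocSet::from(bitset));

        let mut collected = Vec::new();
        while union.doc() != TERMINATED {
            collected.push(union.doc());
            union.advance();
        }
        assert_eq!(collected, vec![1, 2, 5, 9]);
    }
}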
|
||||
@@ -26,7 +26,7 @@ where P: FnMut(&mut T) -> bool {
|
||||
}
|
||||
|
||||
/// Creates a `DocSet` that iterate through the union of two or more `DocSet`s.
|
||||
pub struct Union<TScorer, TScoreCombiner = DoNothingCombiner> {
|
||||
pub struct BufferedUnionScorer<TScorer, TScoreCombiner = DoNothingCombiner> {
|
||||
docsets: Vec<TScorer>,
|
||||
bitsets: Box<[TinySet; HORIZON_NUM_TINYBITSETS]>,
|
||||
scores: Box<[TScoreCombiner; HORIZON as usize]>,
|
||||
@@ -61,16 +61,16 @@ fn refill<TScorer: Scorer, TScoreCombiner: ScoreCombiner>(
|
||||
});
|
||||
}
|
||||
|
||||
impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Union<TScorer, TScoreCombiner> {
|
||||
impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> BufferedUnionScorer<TScorer, TScoreCombiner> {
|
||||
pub(crate) fn build(
|
||||
docsets: Vec<TScorer>,
|
||||
score_combiner_fn: impl FnOnce() -> TScoreCombiner,
|
||||
) -> Union<TScorer, TScoreCombiner> {
|
||||
) -> BufferedUnionScorer<TScorer, TScoreCombiner> {
|
||||
let non_empty_docsets: Vec<TScorer> = docsets
|
||||
.into_iter()
|
||||
.filter(|docset| docset.doc() != TERMINATED)
|
||||
.collect();
|
||||
let mut union = Union {
|
||||
let mut union = BufferedUnionScorer {
|
||||
docsets: non_empty_docsets,
|
||||
bitsets: Box::new([TinySet::empty(); HORIZON_NUM_TINYBITSETS]),
|
||||
scores: Box::new([score_combiner_fn(); HORIZON as usize]),
|
||||
@@ -121,7 +121,7 @@ impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Union<TScorer, TScoreCombin
|
||||
}
|
||||
}
|
||||
|
||||
impl<TScorer, TScoreCombiner> DocSet for Union<TScorer, TScoreCombiner>
|
||||
impl<TScorer, TScoreCombiner> DocSet for BufferedUnionScorer<TScorer, TScoreCombiner>
|
||||
where
|
||||
TScorer: Scorer,
|
||||
TScoreCombiner: ScoreCombiner,
|
||||
@@ -230,7 +230,7 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
impl<TScorer, TScoreCombiner> Scorer for Union<TScorer, TScoreCombiner>
|
||||
impl<TScorer, TScoreCombiner> Scorer for BufferedUnionScorer<TScorer, TScoreCombiner>
|
||||
where
|
||||
TScoreCombiner: ScoreCombiner,
|
||||
TScorer: Scorer,
|
||||
@@ -239,205 +239,3 @@ where
|
||||
self.score
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
use super::{Union, HORIZON};
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::postings::tests::test_skip_against_unoptimized;
|
||||
use crate::query::score_combiner::DoNothingCombiner;
|
||||
use crate::query::{ConstScorer, VecDocSet};
|
||||
use crate::{tests, DocId};
|
||||
|
||||
fn aux_test_union(vals: Vec<Vec<u32>>) {
|
||||
let mut val_set: BTreeSet<u32> = BTreeSet::new();
|
||||
for vs in &vals {
|
||||
for &v in vs {
|
||||
val_set.insert(v);
|
||||
}
|
||||
}
|
||||
let union_vals: Vec<u32> = val_set.into_iter().collect();
|
||||
let mut union_expected = VecDocSet::from(union_vals);
|
||||
let make_union = || {
|
||||
Union::build(
|
||||
vals.iter()
|
||||
.cloned()
|
||||
.map(VecDocSet::from)
|
||||
.map(|docset| ConstScorer::new(docset, 1.0))
|
||||
.collect::<Vec<ConstScorer<VecDocSet>>>(),
|
||||
DoNothingCombiner::default,
|
||||
)
|
||||
};
|
||||
let mut union: Union<_, DoNothingCombiner> = make_union();
|
||||
let mut count = 0;
|
||||
while union.doc() != TERMINATED {
|
||||
assert_eq!(union_expected.doc(), union.doc());
|
||||
assert_eq!(union_expected.advance(), union.advance());
|
||||
count += 1;
|
||||
}
|
||||
assert_eq!(union_expected.advance(), TERMINATED);
|
||||
assert_eq!(count, make_union().count_including_deleted());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union() {
|
||||
aux_test_union(vec![
|
||||
vec![1, 3333, 100000000u32],
|
||||
vec![1, 2, 100000000u32],
|
||||
vec![1, 2, 100000000u32],
|
||||
vec![],
|
||||
]);
|
||||
aux_test_union(vec![
|
||||
vec![1, 3333, 100000000u32],
|
||||
vec![1, 2, 100000000u32],
|
||||
vec![1, 2, 100000000u32],
|
||||
vec![],
|
||||
]);
|
||||
aux_test_union(vec![
|
||||
tests::sample_with_seed(100_000, 0.01, 1),
|
||||
tests::sample_with_seed(100_000, 0.05, 2),
|
||||
tests::sample_with_seed(100_000, 0.001, 3),
|
||||
]);
|
||||
}
|
||||
|
||||
fn test_aux_union_skip(docs_list: &[Vec<DocId>], skip_targets: Vec<DocId>) {
|
||||
let mut btree_set = BTreeSet::new();
|
||||
for docs in docs_list {
|
||||
btree_set.extend(docs.iter().cloned());
|
||||
}
|
||||
let docset_factory = || {
|
||||
let res: Box<dyn DocSet> = Box::new(Union::build(
|
||||
docs_list
|
||||
.iter()
|
||||
.cloned()
|
||||
.map(VecDocSet::from)
|
||||
.map(|docset| ConstScorer::new(docset, 1.0))
|
||||
.collect::<Vec<_>>(),
|
||||
DoNothingCombiner::default,
|
||||
));
|
||||
res
|
||||
};
|
||||
let mut docset = docset_factory();
|
||||
for el in btree_set {
|
||||
assert_eq!(el, docset.doc());
|
||||
docset.advance();
|
||||
}
|
||||
assert_eq!(docset.doc(), TERMINATED);
|
||||
test_skip_against_unoptimized(docset_factory, skip_targets);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union_skip_corner_case() {
|
||||
test_aux_union_skip(&[vec![165132, 167382], vec![25029, 25091]], vec![25029]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union_skip_corner_case2() {
|
||||
test_aux_union_skip(
|
||||
&[vec![1u32, 1u32 + HORIZON], vec![2u32, 1000u32, 10_000u32]],
|
||||
vec![0u32, 1u32, 2u32, 3u32, 1u32 + HORIZON, 2u32 + HORIZON],
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union_skip_corner_case3() {
|
||||
let mut docset = Union::build(
|
||||
vec![
|
||||
ConstScorer::from(VecDocSet::from(vec![0u32, 5u32])),
|
||||
ConstScorer::from(VecDocSet::from(vec![1u32, 4u32])),
|
||||
],
|
||||
DoNothingCombiner::default,
|
||||
);
|
||||
assert_eq!(docset.doc(), 0u32);
|
||||
assert_eq!(docset.seek(0u32), 0u32);
|
||||
assert_eq!(docset.seek(0u32), 0u32);
|
||||
assert_eq!(docset.doc(), 0u32)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union_skip_random() {
|
||||
test_aux_union_skip(
|
||||
&[
|
||||
vec![1, 2, 3, 7],
|
||||
vec![1, 3, 9, 10000],
|
||||
vec![1, 3, 8, 9, 100],
|
||||
],
|
||||
vec![1, 2, 3, 5, 6, 7, 8, 100],
|
||||
);
|
||||
test_aux_union_skip(
|
||||
&[
|
||||
tests::sample_with_seed(100_000, 0.001, 1),
|
||||
tests::sample_with_seed(100_000, 0.002, 2),
|
||||
tests::sample_with_seed(100_000, 0.005, 3),
|
||||
],
|
||||
tests::sample_with_seed(100_000, 0.01, 4),
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union_skip_specific() {
|
||||
test_aux_union_skip(
|
||||
&[
|
||||
vec![1, 2, 3, 7],
|
||||
vec![1, 3, 9, 10000],
|
||||
vec![1, 3, 8, 9, 100],
|
||||
],
|
||||
vec![1, 2, 3, 7, 8, 9, 99, 100, 101, 500, 20000],
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use test::Bencher;
|
||||
|
||||
use crate::query::score_combiner::DoNothingCombiner;
|
||||
use crate::query::{ConstScorer, Union, VecDocSet};
|
||||
use crate::{tests, DocId, DocSet, TERMINATED};
|
||||
|
||||
#[bench]
|
||||
fn bench_union_3_high(bench: &mut Bencher) {
|
||||
let union_docset: Vec<Vec<DocId>> = vec![
|
||||
tests::sample_with_seed(100_000, 0.1, 0),
|
||||
tests::sample_with_seed(100_000, 0.2, 1),
|
||||
];
|
||||
bench.iter(|| {
|
||||
let mut v = Union::build(
|
||||
union_docset
|
||||
.iter()
|
||||
.map(|doc_ids| VecDocSet::from(doc_ids.clone()))
|
||||
.map(|docset| ConstScorer::new(docset, 1.0))
|
||||
.collect::<Vec<_>>(),
|
||||
DoNothingCombiner::default,
|
||||
);
|
||||
while v.doc() != TERMINATED {
|
||||
v.advance();
|
||||
}
|
||||
});
|
||||
}
|
||||
#[bench]
|
||||
fn bench_union_3_low(bench: &mut Bencher) {
|
||||
let union_docset: Vec<Vec<DocId>> = vec![
|
||||
tests::sample_with_seed(100_000, 0.01, 0),
|
||||
tests::sample_with_seed(100_000, 0.05, 1),
|
||||
tests::sample_with_seed(100_000, 0.001, 2),
|
||||
];
|
||||
bench.iter(|| {
|
||||
let mut v = Union::build(
|
||||
union_docset
|
||||
.iter()
|
||||
.map(|doc_ids| VecDocSet::from(doc_ids.clone()))
|
||||
.map(|docset| ConstScorer::new(docset, 1.0))
|
||||
.collect::<Vec<_>>(),
|
||||
DoNothingCombiner::default,
|
||||
);
|
||||
while v.doc() != TERMINATED {
|
||||
v.advance();
|
||||
}
|
||||
});
|
||||
}
|
||||
}
303
src/query/union/mod.rs
Normal file
@@ -0,0 +1,303 @@
|
||||
mod bitset_union;
|
||||
mod buffered_union;
|
||||
mod simple_union;
|
||||
|
||||
pub use bitset_union::BitSetPostingUnion;
|
||||
pub use buffered_union::BufferedUnionScorer;
|
||||
pub use simple_union::SimpleUnion;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
use common::BitSet;
|
||||
|
||||
use super::{SimpleUnion, *};
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::postings::tests::test_skip_against_unoptimized;
|
||||
use crate::query::score_combiner::DoNothingCombiner;
|
||||
use crate::query::union::bitset_union::BitSetPostingUnion;
|
||||
use crate::query::{BitSetDocSet, ConstScorer, VecDocSet};
|
||||
use crate::{tests, DocId};
|
||||
|
||||
fn vec_doc_set_from_docs_list(
|
||||
docs_list: &[Vec<DocId>],
|
||||
) -> impl Iterator<Item = VecDocSet> + '_ {
|
||||
docs_list.iter().cloned().map(VecDocSet::from)
|
||||
}
|
||||
fn union_from_docs_list(docs_list: &[Vec<DocId>]) -> Box<dyn DocSet> {
|
||||
Box::new(BufferedUnionScorer::build(
|
||||
vec_doc_set_from_docs_list(docs_list)
|
||||
.map(|docset| ConstScorer::new(docset, 1.0))
|
||||
.collect::<Vec<ConstScorer<VecDocSet>>>(),
|
||||
DoNothingCombiner::default,
|
||||
))
|
||||
}
|
||||
|
||||
fn posting_list_union_from_docs_list(docs_list: &[Vec<DocId>]) -> Box<dyn DocSet> {
|
||||
Box::new(BitSetPostingUnion::build(
|
||||
vec_doc_set_from_docs_list(docs_list).collect::<Vec<VecDocSet>>(),
|
||||
bitset_from_docs_list(docs_list),
|
||||
))
|
||||
}
|
||||
fn simple_union_from_docs_list(docs_list: &[Vec<DocId>]) -> Box<dyn DocSet> {
|
||||
Box::new(SimpleUnion::build(
|
||||
vec_doc_set_from_docs_list(docs_list).collect::<Vec<VecDocSet>>(),
|
||||
))
|
||||
}
|
||||
fn bitset_from_docs_list(docs_list: &[Vec<DocId>]) -> BitSetDocSet {
|
||||
let max_doc = docs_list
|
||||
.iter()
|
||||
.flat_map(|docs| docs.iter().copied())
|
||||
.max()
|
||||
.unwrap_or(0);
|
||||
let mut doc_bitset = BitSet::with_max_value(max_doc + 1);
|
||||
for docs in docs_list {
|
||||
for &doc in docs {
|
||||
doc_bitset.insert(doc);
|
||||
}
|
||||
}
|
||||
BitSetDocSet::from(doc_bitset)
|
||||
}
|
||||
fn aux_test_union(docs_list: &[Vec<DocId>]) {
|
||||
for constructor in [
|
||||
posting_list_union_from_docs_list,
|
||||
simple_union_from_docs_list,
|
||||
union_from_docs_list,
|
||||
] {
|
||||
aux_test_union_with_constructor(constructor, docs_list);
|
||||
}
|
||||
}
|
||||
fn aux_test_union_with_constructor<F>(constructor: F, docs_list: &[Vec<DocId>])
|
||||
where F: Fn(&[Vec<DocId>]) -> Box<dyn DocSet> {
|
||||
let mut val_set: BTreeSet<u32> = BTreeSet::new();
|
||||
for vs in docs_list {
|
||||
for &v in vs {
|
||||
val_set.insert(v);
|
||||
}
|
||||
}
|
||||
let union_vals: Vec<u32> = val_set.into_iter().collect();
|
||||
let mut union_expected = VecDocSet::from(union_vals);
|
||||
let make_union = || constructor(docs_list);
|
||||
let mut union = make_union();
|
||||
let mut count = 0;
|
||||
while union.doc() != TERMINATED {
|
||||
assert_eq!(union_expected.doc(), union.doc());
|
||||
assert_eq!(union_expected.advance(), union.advance());
|
||||
count += 1;
|
||||
}
|
||||
assert_eq!(union_expected.advance(), TERMINATED);
|
||||
assert_eq!(count, make_union().count_including_deleted());
|
||||
}
|
||||
|
||||
use proptest::prelude::*;
|
||||
|
||||
proptest! {
|
||||
#[test]
|
||||
fn test_union_is_same(vecs in prop::collection::vec(
|
||||
prop::collection::vec(0u32..100, 1..10)
|
||||
.prop_map(|mut inner| {
|
||||
inner.sort_unstable();
|
||||
inner.dedup();
|
||||
inner
|
||||
}),
|
||||
1..10
|
||||
),
|
||||
seek_docids in prop::collection::vec(0u32..100, 0..10).prop_map(|mut inner| {
|
||||
inner.sort_unstable();
|
||||
inner
|
||||
})) {
|
||||
test_docid_with_skip(&vecs, &seek_docids);
|
||||
}
|
||||
}
|
||||
|
||||
fn test_docid_with_skip(vecs: &[Vec<DocId>], skip_targets: &[DocId]) {
|
||||
let mut union1 = posting_list_union_from_docs_list(vecs);
|
||||
let mut union2 = simple_union_from_docs_list(vecs);
|
||||
let mut union3 = union_from_docs_list(vecs);
|
||||
|
||||
// Check initial sequential advance
|
||||
while union1.doc() != TERMINATED {
|
||||
assert_eq!(union1.doc(), union2.doc());
|
||||
assert_eq!(union1.doc(), union3.doc());
|
||||
assert_eq!(union1.advance(), union2.advance());
|
||||
assert_eq!(union1.doc(), union3.advance());
|
||||
}
|
||||
|
||||
// Reset and test seek functionality
|
||||
let mut union1 = posting_list_union_from_docs_list(vecs);
|
||||
let mut union2 = simple_union_from_docs_list(vecs);
|
||||
let mut union3 = union_from_docs_list(vecs);
|
||||
|
||||
for &seek_docid in skip_targets {
|
||||
union1.seek(seek_docid);
|
||||
union2.seek(seek_docid);
|
||||
union3.seek(seek_docid);
|
||||
|
||||
// Verify that all unions have the same document after seeking
|
||||
assert_eq!(union3.doc(), union1.doc());
|
||||
assert_eq!(union3.doc(), union2.doc());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union() {
|
||||
aux_test_union(&[
|
||||
vec![1, 3333, 100000000u32],
|
||||
vec![1, 2, 100000000u32],
|
||||
vec![1, 2, 100000000u32],
|
||||
vec![],
|
||||
]);
|
||||
aux_test_union(&[
|
||||
vec![1, 3333, 100000000u32],
|
||||
vec![1, 2, 100000000u32],
|
||||
vec![1, 2, 100000000u32],
|
||||
vec![],
|
||||
]);
|
||||
aux_test_union(&[
|
||||
tests::sample_with_seed(100_000, 0.01, 1),
|
||||
tests::sample_with_seed(100_000, 0.05, 2),
|
||||
tests::sample_with_seed(100_000, 0.001, 3),
|
||||
]);
|
||||
}
|
||||
|
||||
fn test_aux_union_skip(docs_list: &[Vec<DocId>], skip_targets: Vec<DocId>) {
|
||||
for constructor in [
|
||||
posting_list_union_from_docs_list,
|
||||
simple_union_from_docs_list,
|
||||
union_from_docs_list,
|
||||
] {
|
||||
test_aux_union_skip_with_constructor(constructor, docs_list, skip_targets.clone());
|
||||
}
|
||||
}
|
||||
fn test_aux_union_skip_with_constructor<F>(
|
||||
constructor: F,
|
||||
docs_list: &[Vec<DocId>],
|
||||
skip_targets: Vec<DocId>,
|
||||
) where
|
||||
F: Fn(&[Vec<DocId>]) -> Box<dyn DocSet>,
|
||||
{
|
||||
let mut btree_set = BTreeSet::new();
|
||||
for docs in docs_list {
|
||||
btree_set.extend(docs.iter().cloned());
|
||||
}
|
||||
let docset_factory = || {
|
||||
let res: Box<dyn DocSet> = constructor(docs_list);
|
||||
res
|
||||
};
|
||||
let mut docset = constructor(docs_list);
|
||||
for el in btree_set {
|
||||
assert_eq!(el, docset.doc());
|
||||
docset.advance();
|
||||
}
|
||||
assert_eq!(docset.doc(), TERMINATED);
|
||||
test_skip_against_unoptimized(docset_factory, skip_targets);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union_skip_corner_case() {
|
||||
test_aux_union_skip(&[vec![165132, 167382], vec![25029, 25091]], vec![25029]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union_skip_corner_case2() {
|
||||
test_aux_union_skip(
|
||||
&[vec![1u32, 1u32 + 100], vec![2u32, 1000u32, 10_000u32]],
|
||||
vec![0u32, 1u32, 2u32, 3u32, 1u32 + 100, 2u32 + 100],
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union_skip_corner_case3() {
|
||||
let mut docset = posting_list_union_from_docs_list(&[vec![0u32, 5u32], vec![1u32, 4u32]]);
|
||||
assert_eq!(docset.doc(), 0u32);
|
||||
assert_eq!(docset.seek(0u32), 0u32);
|
||||
assert_eq!(docset.seek(0u32), 0u32);
|
||||
assert_eq!(docset.doc(), 0u32)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union_skip_random() {
|
||||
test_aux_union_skip(
|
||||
&[
|
||||
vec![1, 2, 3, 7],
|
||||
vec![1, 3, 9, 10000],
|
||||
vec![1, 3, 8, 9, 100],
|
||||
],
|
||||
vec![1, 2, 3, 5, 6, 7, 8, 100],
|
||||
);
|
||||
test_aux_union_skip(
|
||||
&[
|
||||
tests::sample_with_seed(100_000, 0.001, 1),
|
||||
tests::sample_with_seed(100_000, 0.002, 2),
|
||||
tests::sample_with_seed(100_000, 0.005, 3),
|
||||
],
|
||||
tests::sample_with_seed(100_000, 0.01, 4),
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union_skip_specific() {
|
||||
test_aux_union_skip(
|
||||
&[
|
||||
vec![1, 2, 3, 7],
|
||||
vec![1, 3, 9, 10000],
|
||||
vec![1, 3, 8, 9, 100],
|
||||
],
|
||||
vec![1, 2, 3, 7, 8, 9, 99, 100, 101, 500, 20000],
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use test::Bencher;
|
||||
|
||||
use crate::query::score_combiner::DoNothingCombiner;
|
||||
use crate::query::{BufferedUnionScorer, ConstScorer, VecDocSet};
|
||||
use crate::{tests, DocId, DocSet, TERMINATED};
|
||||
|
||||
#[bench]
|
||||
fn bench_union_3_high(bench: &mut Bencher) {
|
||||
let union_docset: Vec<Vec<DocId>> = vec![
|
||||
tests::sample_with_seed(100_000, 0.1, 0),
|
||||
tests::sample_with_seed(100_000, 0.2, 1),
|
||||
];
|
||||
bench.iter(|| {
|
||||
let mut v = BufferedUnionScorer::build(
|
||||
union_docset
|
||||
.iter()
|
||||
.map(|doc_ids| VecDocSet::from(doc_ids.clone()))
|
||||
.map(|docset| ConstScorer::new(docset, 1.0))
|
||||
.collect::<Vec<_>>(),
|
||||
DoNothingCombiner::default,
|
||||
);
|
||||
while v.doc() != TERMINATED {
|
||||
v.advance();
|
||||
}
|
||||
});
|
||||
}
|
||||
#[bench]
|
||||
fn bench_union_3_low(bench: &mut Bencher) {
|
||||
let union_docset: Vec<Vec<DocId>> = vec![
|
||||
tests::sample_with_seed(100_000, 0.01, 0),
|
||||
tests::sample_with_seed(100_000, 0.05, 1),
|
||||
tests::sample_with_seed(100_000, 0.001, 2),
|
||||
];
|
||||
bench.iter(|| {
|
||||
let mut v = BufferedUnionScorer::build(
|
||||
union_docset
|
||||
.iter()
|
||||
.map(|doc_ids| VecDocSet::from(doc_ids.clone()))
|
||||
.map(|docset| ConstScorer::new(docset, 1.0))
|
||||
.collect::<Vec<_>>(),
|
||||
DoNothingCombiner::default,
|
||||
);
|
||||
while v.doc() != TERMINATED {
|
||||
v.advance();
|
||||
}
|
||||
});
|
||||
}
|
||||
}
112
src/query/union/simple_union.rs
Normal file
@@ -0,0 +1,112 @@
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::postings::Postings;
|
||||
use crate::DocId;
|
||||
|
||||
/// A `SimpleUnion` is a `DocSet` that is the union of multiple `DocSet`s.
|
||||
/// Unlike `BufferedUnionScorer`, it doesn't do any horizon precomputation.
|
||||
///
|
||||
/// For that reason SimpleUnion is a good choice for queries that skip a lot.
|
||||
pub struct SimpleUnion<TDocSet> {
|
||||
docsets: Vec<TDocSet>,
|
||||
doc: DocId,
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> SimpleUnion<TDocSet> {
|
||||
pub(crate) fn build(mut docsets: Vec<TDocSet>) -> SimpleUnion<TDocSet> {
|
||||
docsets.retain(|docset| docset.doc() != TERMINATED);
|
||||
let mut docset = SimpleUnion { docsets, doc: 0 };
|
||||
|
||||
docset.initialize_first_doc_id();
|
||||
|
||||
docset
|
||||
}
|
||||
|
||||
fn initialize_first_doc_id(&mut self) {
|
||||
let mut next_doc = TERMINATED;
|
||||
|
||||
for docset in &self.docsets {
|
||||
next_doc = next_doc.min(docset.doc());
|
||||
}
|
||||
self.doc = next_doc;
|
||||
}
|
||||
|
||||
fn advance_to_next(&mut self) -> DocId {
|
||||
let mut next_doc = TERMINATED;
|
||||
|
||||
for docset in &mut self.docsets {
|
||||
if docset.doc() <= self.doc {
|
||||
docset.advance();
|
||||
}
|
||||
next_doc = next_doc.min(docset.doc());
|
||||
}
|
||||
self.doc = next_doc;
|
||||
self.doc
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: Postings> Postings for SimpleUnion<TDocSet> {
|
||||
fn term_freq(&self) -> u32 {
|
||||
let mut term_freq = 0;
|
||||
for docset in &self.docsets {
|
||||
let doc = docset.doc();
|
||||
if doc == self.doc {
|
||||
term_freq += docset.term_freq();
|
||||
}
|
||||
}
|
||||
term_freq
|
||||
}
|
||||
|
||||
fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
|
||||
for docset in &mut self.docsets {
|
||||
let doc = docset.doc();
|
||||
if doc == self.doc {
|
||||
docset.append_positions_with_offset(offset, output);
|
||||
}
|
||||
}
|
||||
output.sort_unstable();
|
||||
output.dedup();
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> DocSet for SimpleUnion<TDocSet> {
|
||||
fn advance(&mut self) -> DocId {
|
||||
self.advance_to_next();
|
||||
self.doc
|
||||
}
|
||||
|
||||
fn seek(&mut self, target: DocId) -> DocId {
|
||||
self.doc = TERMINATED;
|
||||
for docset in &mut self.docsets {
|
||||
if docset.doc() < target {
|
||||
docset.seek(target);
|
||||
}
|
||||
if docset.doc() < self.doc {
|
||||
self.doc = docset.doc();
|
||||
}
|
||||
}
|
||||
self.doc
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.doc
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.docsets
|
||||
.iter()
|
||||
.map(|docset| docset.size_hint())
|
||||
.max()
|
||||
.unwrap_or(0u32)
|
||||
}
|
||||
|
||||
fn count_including_deleted(&mut self) -> u32 {
|
||||
if self.doc == TERMINATED {
|
||||
return 0u32;
|
||||
}
|
||||
let mut count = 1u32;
|
||||
while self.advance_to_next() != TERMINATED {
|
||||
count += 1;
|
||||
}
|
||||
count
|
||||
}
|
||||
}
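// Illustrative usage sketch (not part of the original change): mirrors `simple_union_from_docs_list`
// in the union test module. `SimpleUnion` advances by repeatedly taking the minimum current doc of
// its inner docsets, without any horizon buffering.
#[cfg(test)]
mod simple_union_sketch {
    use super::SimpleUnion;
    use crate::docset::{DocSet, TERMINATED};
    use crate::query::VecDocSet;

    #[test]
    fn seek_lands_on_the_next_matching_doc() {
        let docsets = vec![
            VecDocSet::from(vec![1u32, 7, 30]),
            VecDocSet::from(vec![3u32, 7]),
        ];
        let mut union = SimpleUnion::build(docsets);
        assert_eq!(union.doc(), 1);
        assert_eq!(union.seek(4), 7);
        assert_eq!(union.advance(), 30);
        assert_eq!(union.advance(), TERMINATED);
    }
}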
|
||||
@@ -50,7 +50,7 @@ impl HasLen for VecDocSet {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
pub(crate) mod tests {
|
||||
|
||||
use super::*;
|
||||
use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
|
||||
|
||||
@@ -22,6 +22,7 @@ use super::se::BinaryObjectSerializer;
|
||||
use super::{OwnedValue, Value};
|
||||
use crate::schema::document::type_codes;
|
||||
use crate::schema::{Facet, Field};
|
||||
use crate::store::DocStoreVersion;
|
||||
use crate::tokenizer::PreTokenizedString;
|
||||
|
||||
#[derive(Debug, thiserror::Error, Clone)]
|
||||
@@ -45,6 +46,9 @@ pub enum DeserializeError {
|
||||
#[error("{0}")]
|
||||
/// A custom error message.
|
||||
Custom(String),
|
||||
#[error("Version {0}, Max version supported: {1}")]
|
||||
/// Unsupported version error.
|
||||
UnsupportedVersion(u32, u32),
|
||||
}
|
||||
|
||||
impl DeserializeError {
|
||||
@@ -291,6 +295,7 @@ pub trait ObjectAccess<'de> {
|
||||
pub struct BinaryDocumentDeserializer<'de, R> {
|
||||
length: usize,
|
||||
position: usize,
|
||||
doc_store_version: DocStoreVersion,
|
||||
reader: &'de mut R,
|
||||
}
|
||||
|
||||
@@ -298,12 +303,16 @@ impl<'de, R> BinaryDocumentDeserializer<'de, R>
|
||||
where R: Read
|
||||
{
|
||||
/// Attempts to create a new document deserializer from a given reader.
|
||||
pub(crate) fn from_reader(reader: &'de mut R) -> Result<Self, DeserializeError> {
|
||||
pub(crate) fn from_reader(
|
||||
reader: &'de mut R,
|
||||
doc_store_version: DocStoreVersion,
|
||||
) -> Result<Self, DeserializeError> {
|
||||
let length = VInt::deserialize(reader)?;
|
||||
|
||||
Ok(Self {
|
||||
length: length.val() as usize,
|
||||
position: 0,
|
||||
doc_store_version,
|
||||
reader,
|
||||
})
|
||||
}
|
||||
@@ -329,8 +338,8 @@ where R: Read
|
||||
}
|
||||
|
||||
let field = Field::deserialize(self.reader).map_err(DeserializeError::from)?;
|
||||
|
||||
let deserializer = BinaryValueDeserializer::from_reader(self.reader)?;
|
||||
let deserializer =
|
||||
BinaryValueDeserializer::from_reader(self.reader, self.doc_store_version)?;
|
||||
let value = V::deserialize(deserializer)?;
|
||||
|
||||
self.position += 1;
|
||||
@@ -344,13 +353,17 @@ where R: Read
|
||||
pub struct BinaryValueDeserializer<'de, R> {
|
||||
value_type: ValueType,
|
||||
reader: &'de mut R,
|
||||
doc_store_version: DocStoreVersion,
|
||||
}
|
||||
|
||||
impl<'de, R> BinaryValueDeserializer<'de, R>
|
||||
where R: Read
|
||||
{
|
||||
/// Attempts to create a new value deserializer from a given reader.
|
||||
fn from_reader(reader: &'de mut R) -> Result<Self, DeserializeError> {
|
||||
fn from_reader(
|
||||
reader: &'de mut R,
|
||||
doc_store_version: DocStoreVersion,
|
||||
) -> Result<Self, DeserializeError> {
|
||||
let type_code = <u8 as BinarySerializable>::deserialize(reader)?;
|
||||
|
||||
let value_type = match type_code {
|
||||
@@ -381,7 +394,7 @@ where R: Read
|
||||
type_codes::NULL_CODE => ValueType::Null,
|
||||
type_codes::ARRAY_CODE => ValueType::Array,
|
||||
type_codes::OBJECT_CODE => ValueType::Object,
|
||||
#[allow(deprecated)]
|
||||
#[expect(deprecated)]
|
||||
type_codes::JSON_OBJ_CODE => ValueType::JSONObject,
|
||||
_ => {
|
||||
return Err(DeserializeError::from(io::Error::new(
|
||||
@@ -391,7 +404,11 @@ where R: Read
|
||||
}
|
||||
};
|
||||
|
||||
Ok(Self { value_type, reader })
|
||||
Ok(Self {
|
||||
value_type,
|
||||
reader,
|
||||
doc_store_version,
|
||||
})
|
||||
}
|
||||
|
||||
fn validate_type(&self, expected_type: ValueType) -> Result<(), DeserializeError> {
|
||||
@@ -438,7 +455,16 @@ where R: Read
|
||||
|
||||
fn deserialize_datetime(self) -> Result<DateTime, DeserializeError> {
|
||||
self.validate_type(ValueType::DateTime)?;
|
||||
<DateTime as BinarySerializable>::deserialize(self.reader).map_err(DeserializeError::from)
|
||||
match self.doc_store_version {
|
||||
DocStoreVersion::V1 => {
|
||||
let timestamp_micros = <i64 as BinarySerializable>::deserialize(self.reader)?;
|
||||
Ok(DateTime::from_timestamp_micros(timestamp_micros))
|
||||
}
|
||||
DocStoreVersion::V2 => {
|
||||
let timestamp_nanos = <i64 as BinarySerializable>::deserialize(self.reader)?;
|
||||
Ok(DateTime::from_timestamp_nanos(timestamp_nanos))
|
||||
}
|
||||
}
|
||||
}
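// Illustrative note (not part of the original change): V1 stored the timestamp in microseconds,
// V2 stores it in nanoseconds, so for the same instant the two branches above produce the same
// `DateTime`, e.g.:
//
//     let t_micros: i64 = 1_700_000_000_123_456;
//     assert_eq!(
//         DateTime::from_timestamp_micros(t_micros),
//         DateTime::from_timestamp_nanos(t_micros * 1_000),
//     );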
|
||||
|
||||
fn deserialize_facet(self) -> Result<Facet, DeserializeError> {
|
||||
@@ -514,11 +540,13 @@ where R: Read
visitor.visit_pre_tokenized_string(val)
}
ValueType::Array => {
let access = BinaryArrayDeserializer::from_reader(self.reader)?;
let access =
BinaryArrayDeserializer::from_reader(self.reader, self.doc_store_version)?;
visitor.visit_array(access)
}
ValueType::Object => {
let access = BinaryObjectDeserializer::from_reader(self.reader)?;
let access =
BinaryObjectDeserializer::from_reader(self.reader, self.doc_store_version)?;
visitor.visit_object(access)
}
#[allow(deprecated)]
@@ -537,7 +565,8 @@ where R: Read

let out_rc = std::rc::Rc::new(out);
let mut slice: &[u8] = &out_rc;
let access = BinaryObjectDeserializer::from_reader(&mut slice)?;
let access =
BinaryObjectDeserializer::from_reader(&mut slice, self.doc_store_version)?;

visitor.visit_object(access)
}
@@ -551,19 +580,24 @@ pub struct BinaryArrayDeserializer<'de, R> {
length: usize,
position: usize,
reader: &'de mut R,
doc_store_version: DocStoreVersion,
}

impl<'de, R> BinaryArrayDeserializer<'de, R>
where R: Read
{
/// Attempts to create a new array deserializer from a given reader.
fn from_reader(reader: &'de mut R) -> Result<Self, DeserializeError> {
fn from_reader(
reader: &'de mut R,
doc_store_version: DocStoreVersion,
) -> Result<Self, DeserializeError> {
let length = <VInt as BinarySerializable>::deserialize(reader)?;

Ok(Self {
length: length.val() as usize,
position: 0,
reader,
doc_store_version,
})
}

@@ -587,7 +621,8 @@ where R: Read
return Ok(None);
}

let deserializer = BinaryValueDeserializer::from_reader(self.reader)?;
let deserializer =
BinaryValueDeserializer::from_reader(self.reader, self.doc_store_version)?;
let value = V::deserialize(deserializer)?;

// Advance the position cursor.
@@ -610,8 +645,11 @@ impl<'de, R> BinaryObjectDeserializer<'de, R>
where R: Read
{
/// Attempts to create a new object deserializer from a given reader.
fn from_reader(reader: &'de mut R) -> Result<Self, DeserializeError> {
let inner = BinaryArrayDeserializer::from_reader(reader)?;
fn from_reader(
reader: &'de mut R,
doc_store_version: DocStoreVersion,
) -> Result<Self, DeserializeError> {
let inner = BinaryArrayDeserializer::from_reader(reader, doc_store_version)?;
Ok(Self { inner })
}
}
@@ -819,6 +857,7 @@ mod tests {
use crate::schema::document::existing_type_impls::JsonObjectIter;
use crate::schema::document::se::BinaryValueSerializer;
use crate::schema::document::{ReferenceValue, ReferenceValueLeaf};
use crate::store::DOC_STORE_VERSION;

fn serialize_value<'a>(value: ReferenceValue<'a, &'a serde_json::Value>) -> Vec<u8> {
let mut writer = Vec::new();
@@ -829,9 +868,19 @@ mod tests {
writer
}

fn serialize_owned_value<'a>(value: ReferenceValue<'a, &'a OwnedValue>) -> Vec<u8> {
let mut writer = Vec::new();

let mut serializer = BinaryValueSerializer::new(&mut writer);
serializer.serialize_value(value).expect("Serialize value");

writer
}

fn deserialize_value(buffer: Vec<u8>) -> crate::schema::OwnedValue {
let mut cursor = Cursor::new(buffer);
let deserializer = BinaryValueDeserializer::from_reader(&mut cursor).unwrap();
let deserializer =
BinaryValueDeserializer::from_reader(&mut cursor, DOC_STORE_VERSION).unwrap();
crate::schema::OwnedValue::deserialize(deserializer).expect("Deserialize value")
}

@@ -1010,6 +1059,17 @@ mod tests {
assert_eq!(value, expected_val);
}

#[test]
fn test_nested_date_precision() {
let object = OwnedValue::Object(vec![(
"my-date".into(),
OwnedValue::Date(DateTime::from_timestamp_nanos(323456)),
)]);
let result = serialize_owned_value((&object).as_value());
let value = deserialize_value(result);
assert_eq!(value, object);
}

#[test]
fn test_nested_serialize() {
let mut object = serde_json::Map::new();
@@ -401,7 +401,7 @@ impl PartialEq for CompactDocValue<'_> {
value1 == value2
}
}
impl<'a> From<CompactDocValue<'a>> for OwnedValue {
impl From<CompactDocValue<'_>> for OwnedValue {
fn from(value: CompactDocValue) -> Self {
value.as_value().into()
}
@@ -81,6 +81,15 @@ where W: Write
Self { writer }
}

fn serialize_with_type_code<T: BinarySerializable>(
&mut self,
code: u8,
val: &T,
) -> io::Result<()> {
self.write_type_code(code)?;
BinarySerializable::serialize(val, self.writer)
}

/// Attempts to serialize a given value and write the output
/// to the writer.
pub(crate) fn serialize_value<'a, V>(
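The new `serialize_with_type_code` helper factors out the "one type-code byte, then the payload" framing that the leaf arms below all repeat. A standalone sketch of the same pattern, using a raw byte slice in place of `BinarySerializable` (the constant is illustrative; tantivy's real codes live in `type_codes`):

```rust
use std::io::{self, Write};

// Illustrative tag value, not one of tantivy's actual type codes.
const U64_CODE: u8 = 2;

// Write the tag byte first, then the value's binary form.
fn serialize_with_type_code<W: Write>(writer: &mut W, code: u8, payload: &[u8]) -> io::Result<()> {
    writer.write_all(&[code])?;
    writer.write_all(payload)
}

fn main() -> io::Result<()> {
    let mut out = Vec::new();
    serialize_with_type_code(&mut out, U64_CODE, &42u64.to_le_bytes())?;
    assert_eq!(out[0], U64_CODE);
    assert_eq!(out.len(), 1 + 8); // tag + payload
    Ok(())
}
```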
@@ -94,56 +103,38 @@ where W: Write
ReferenceValue::Leaf(leaf) => match leaf {
ReferenceValueLeaf::Null => self.write_type_code(type_codes::NULL_CODE),
ReferenceValueLeaf::Str(val) => {
self.write_type_code(type_codes::TEXT_CODE)?;

let temp_val = Cow::Borrowed(val);
temp_val.serialize(self.writer)
self.serialize_with_type_code(type_codes::TEXT_CODE, &Cow::Borrowed(val))
}
ReferenceValueLeaf::U64(val) => {
self.write_type_code(type_codes::U64_CODE)?;

val.serialize(self.writer)
self.serialize_with_type_code(type_codes::U64_CODE, &val)
}
ReferenceValueLeaf::I64(val) => {
self.write_type_code(type_codes::I64_CODE)?;

val.serialize(self.writer)
self.serialize_with_type_code(type_codes::I64_CODE, &val)
}
ReferenceValueLeaf::F64(val) => {
self.write_type_code(type_codes::F64_CODE)?;

f64_to_u64(val).serialize(self.writer)
self.serialize_with_type_code(type_codes::F64_CODE, &f64_to_u64(val))
}
ReferenceValueLeaf::Date(val) => {
self.write_type_code(type_codes::DATE_CODE)?;
val.serialize(self.writer)
}
ReferenceValueLeaf::Facet(val) => {
self.write_type_code(type_codes::HIERARCHICAL_FACET_CODE)?;

Cow::Borrowed(val).serialize(self.writer)
let timestamp_nanos: i64 = val.into_timestamp_nanos();
BinarySerializable::serialize(&timestamp_nanos, self.writer)
}
ReferenceValueLeaf::Facet(val) => self.serialize_with_type_code(
type_codes::HIERARCHICAL_FACET_CODE,
&Cow::Borrowed(val),
),
ReferenceValueLeaf::Bytes(val) => {
self.write_type_code(type_codes::BYTES_CODE)?;

let temp_val = Cow::Borrowed(val);
temp_val.serialize(self.writer)
self.serialize_with_type_code(type_codes::BYTES_CODE, &Cow::Borrowed(val))
}
ReferenceValueLeaf::IpAddr(val) => {
self.write_type_code(type_codes::IP_CODE)?;

val.to_u128().serialize(self.writer)
self.serialize_with_type_code(type_codes::IP_CODE, &val.to_u128())
}
ReferenceValueLeaf::Bool(val) => {
self.write_type_code(type_codes::BOOL_CODE)?;

val.serialize(self.writer)
self.serialize_with_type_code(type_codes::BOOL_CODE, &val)
}
ReferenceValueLeaf::PreTokStr(val) => {
self.write_type_code(type_codes::EXT_CODE)?;
self.write_type_code(type_codes::TOK_STR_EXT_CODE)?;

val.serialize(self.writer)
self.serialize_with_type_code(type_codes::TOK_STR_EXT_CODE, &*val)
}
},
ReferenceValue::Array(elements) => {
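On the write side, the `Date` arm now serializes an explicit nanosecond `i64` (via `into_timestamp_nanos`) instead of deferring to `DateTime`'s own `BinarySerializable` impl; the old representation was microseconds, which is what the V1 read path above still decodes. A tiny illustration of what a microsecond round trip loses, in plain integers rather than tantivy's types:

```rust
// Storing a nanosecond timestamp at microsecond granularity (the V1 on-disk
// representation) drops the sub-microsecond part.
fn roundtrip_through_micros(nanos: i64) -> i64 {
    (nanos / 1_000) * 1_000
}

fn main() {
    let nanos = 323_456; // same magnitude as the value in test_nested_date_precision
    assert_eq!(roundtrip_through_micros(nanos), 323_000);
    assert_ne!(roundtrip_through_micros(nanos), nanos); // precision lost under V1
}
```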
@@ -306,7 +297,6 @@ where W: Write
mod tests {
use std::collections::BTreeMap;

use common::DateTime;
use serde_json::Number;
use tokenizer_api::Token;

@@ -337,7 +327,10 @@ mod tests {
$ext_code.serialize(&mut writer).unwrap();
)?

$value.serialize(&mut writer).unwrap();
BinarySerializable::serialize(
&$value,
&mut writer,
).unwrap();
)*

writer
@@ -355,7 +348,10 @@ mod tests {
$ext_code.serialize(&mut writer).unwrap();
)?

$value.serialize(&mut writer).unwrap();
BinarySerializable::serialize(
&$value,
&mut writer,
).unwrap();
)*

writer
@@ -418,15 +414,6 @@ mod tests {
"Expected serialized value to match the binary representation"
);

let result = serialize_value(ReferenceValueLeaf::Date(DateTime::MAX).into());
let expected = binary_repr!(
type_codes::DATE_CODE => DateTime::MAX,
);
assert_eq!(
result, expected,
"Expected serialized value to match the binary representation"
);

let facet = Facet::from_text("/hello/world").unwrap();
let result = serialize_value(ReferenceValueLeaf::Facet(facet.encoded_str()).into());
let expected = binary_repr!(
@@ -4,7 +4,7 @@ use std::io::{self, Read, Write};
use std::str;
use std::string::FromUtf8Error;

use common::BinarySerializable;
use common::*;
use once_cell::sync::Lazy;
use regex::Regex;
use serde::de::Error as _;

@@ -331,6 +331,7 @@ where B: AsRef<[u8]>
}

/// ValueBytes represents a serialized value.
///
/// The value can be of any type of [`Type`] (e.g. string, u64, f64, bool, date, JSON).
/// The serialized representation matches the lexicographical order of the type.
///

@@ -1,5 +1,6 @@
//! [`SnippetGenerator`]
//! Generates a text snippet for a given document, and some highlighted parts inside it.
//!
//! Imagine you doing a text search in a document
//! and want to show a preview of where in the document the search terms occur,
//! along with some surrounding text to give context, and the search terms highlighted.
@@ -436,7 +437,7 @@ impl SnippetGenerator {
}

#[cfg(test)]
pub fn terms_text(&self) -> &BTreeMap<String, Score> {
pub(crate) fn terms_text(&self) -> &BTreeMap<String, Score> {
&self.terms_text
}
@@ -78,7 +78,7 @@ pub struct SegmentSpaceUsage {
}

impl SegmentSpaceUsage {
#[allow(clippy::too_many_arguments)]
#[expect(clippy::too_many_arguments)]
pub(crate) fn new(
num_docs: u32,
termdict: PerFieldSpaceUsage,

@@ -4,7 +4,7 @@ use std::mem;
use lz4_flex::{compress_into, decompress_into};

#[inline]
#[allow(clippy::uninit_vec)]
#[expect(clippy::uninit_vec)]
pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
compressed.clear();
let maximum_output_size =
@@ -24,7 +24,7 @@ pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()>
}

#[inline]
#[allow(clippy::uninit_vec)]
#[expect(clippy::uninit_vec)]
pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
decompressed.clear();
let uncompressed_size_bytes: &[u8; 4] = compressed
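Several hunks in this change swap `#[allow(...)]` for `#[expect(...)]`, stabilized in Rust 1.81. The difference is that an expectation raises an `unfulfilled_lint_expectations` warning if the suppressed lint ever stops firing, so stale suppressions surface on their own. A minimal, hypothetical example:

```rust
// The lint fires here (the binding really is unused), so the expectation is
// fulfilled and nothing is reported. If the body changed to use `answer`,
// the compiler would warn that the expectation is no longer needed.
#[expect(unused_variables)]
fn demo() {
    let answer = 42;
}

fn main() {
    demo();
}
```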
@@ -2,12 +2,13 @@ use std::io;

use common::{BinarySerializable, FixedSize, HasLen};

use super::{Decompressor, DOC_STORE_VERSION};
use super::{Decompressor, DocStoreVersion, DOC_STORE_VERSION};
use crate::directory::FileSlice;

#[derive(Debug, Clone, PartialEq)]
pub struct DocStoreFooter {
pub offset: u64,
pub doc_store_version: DocStoreVersion,
pub decompressor: Decompressor,
}

@@ -25,9 +26,11 @@ impl BinarySerializable for DocStoreFooter {
}

fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let doc_store_version = u32::deserialize(reader)?;
if doc_store_version != DOC_STORE_VERSION {
panic!("actual doc store version: {doc_store_version}, expected: {DOC_STORE_VERSION}");
let doc_store_version = DocStoreVersion::deserialize(reader)?;
if doc_store_version > DOC_STORE_VERSION {
panic!(
"actual doc store version: {doc_store_version}, max_supported: {DOC_STORE_VERSION}"
);
}
let offset = u64::deserialize(reader)?;
let compressor_id = u8::deserialize(reader)?;
@@ -35,6 +38,7 @@ impl BinarySerializable for DocStoreFooter {
reader.read_exact(&mut skip_buf)?;
Ok(DocStoreFooter {
offset,
doc_store_version,
decompressor: Decompressor::from_id(compressor_id),
})
}
@@ -45,9 +49,14 @@ impl FixedSize for DocStoreFooter {
}

impl DocStoreFooter {
pub fn new(offset: u64, decompressor: Decompressor) -> Self {
pub fn new(
offset: u64,
decompressor: Decompressor,
doc_store_version: DocStoreVersion,
) -> Self {
DocStoreFooter {
offset,
doc_store_version,
decompressor,
}
}
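The footer now carries a typed `DocStoreVersion` and only rejects stores that are *newer* than what the reader supports, instead of demanding an exact match; that is what lets a V2-writing build keep reading V1 segments. A sketch of that policy under illustrative names (and returning an error rather than the `panic!` used above):

```rust
#[derive(Clone, Copy, Debug, PartialEq, PartialOrd)]
enum DocStoreVersion {
    V1 = 1,
    V2 = 2,
}

const MAX_SUPPORTED: DocStoreVersion = DocStoreVersion::V2;

// Older formats stay readable; anything newer than this build understands is refused.
fn check_footer_version(found: DocStoreVersion) -> Result<(), String> {
    if found > MAX_SUPPORTED {
        return Err(format!(
            "doc store version {found:?} is newer than the supported {MAX_SUPPORTED:?}"
        ));
    }
    Ok(())
}

fn main() {
    assert!(check_footer_version(DocStoreVersion::V1).is_ok());
    assert!(check_footer_version(DocStoreVersion::V2).is_ok());
}
```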
@@ -11,7 +11,7 @@ pub struct LayerCursor<'a> {
cursor: usize,
}

impl<'a> Iterator for LayerCursor<'a> {
impl Iterator for LayerCursor<'_> {
type Item = Checkpoint;

fn next(&mut self) -> Option<Checkpoint> {

@@ -35,15 +35,16 @@ mod footer;
mod index;
mod reader;
mod writer;

pub use self::compressors::{Compressor, ZstdCompressor};
pub use self::decompressors::Decompressor;
pub(crate) use self::reader::DOCSTORE_CACHE_CAPACITY;
pub use self::reader::{CacheStats, StoreReader};
pub(crate) use self::reader::{DocStoreVersion, DOCSTORE_CACHE_CAPACITY};
pub use self::writer::StoreWriter;
mod store_compressor;

/// Doc store version in footer to handle format changes.
pub(crate) const DOC_STORE_VERSION: u32 = 1;
pub(crate) const DOC_STORE_VERSION: DocStoreVersion = DocStoreVersion::V2;

#[cfg(feature = "lz4-compression")]
mod compression_lz4_block;
@@ -52,7 +53,7 @@ mod compression_lz4_block;
mod compression_zstd_block;

#[cfg(test)]
pub mod tests {
pub(crate) mod tests {

use std::path::Path;

@@ -1,3 +1,4 @@
use std::fmt::Display;
use std::io;
use std::iter::Sum;
use std::num::NonZeroUsize;
@@ -25,9 +26,43 @@ pub(crate) const DOCSTORE_CACHE_CAPACITY: usize = 100;

type Block = OwnedBytes;

/// The format version of the document store.
#[derive(Clone, Copy, Debug, PartialEq, PartialOrd)]
pub(crate) enum DocStoreVersion {
V1 = 1,
V2 = 2,
}
impl Display for DocStoreVersion {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
DocStoreVersion::V1 => write!(f, "V1"),
DocStoreVersion::V2 => write!(f, "V2"),
}
}
}
impl BinarySerializable for DocStoreVersion {
fn serialize<W: io::Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
(*self as u32).serialize(writer)
}

fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
Ok(match u32::deserialize(reader)? {
1 => DocStoreVersion::V1,
2 => DocStoreVersion::V2,
v => {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!("Invalid doc store version {}", v),
))
}
})
}
}

/// Reads document off tantivy's [`Store`](./index.html)
pub struct StoreReader {
decompressor: Decompressor,
doc_store_version: DocStoreVersion,
data: FileSlice,
skip_index: Arc<SkipIndex>,
space_usage: StoreSpaceUsage,
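On disk the version is a plain `u32`, and unknown discriminants are rejected as `InvalidData` rather than silently mapped to something. A self-contained sketch of that round trip; the little-endian encoding is an assumption standing in for `BinarySerializable`:

```rust
use std::io::{self, Read, Write};

#[derive(Clone, Copy, Debug, PartialEq)]
enum DocStoreVersion {
    V1 = 1,
    V2 = 2,
}

fn write_version<W: Write>(writer: &mut W, version: DocStoreVersion) -> io::Result<()> {
    writer.write_all(&(version as u32).to_le_bytes())
}

fn read_version<R: Read>(reader: &mut R) -> io::Result<DocStoreVersion> {
    let mut buf = [0u8; 4];
    reader.read_exact(&mut buf)?;
    match u32::from_le_bytes(buf) {
        1 => Ok(DocStoreVersion::V1),
        2 => Ok(DocStoreVersion::V2),
        // Unknown versions become a hard error instead of being misread.
        v => Err(io::Error::new(
            io::ErrorKind::InvalidData,
            format!("invalid doc store version {v}"),
        )),
    }
}

fn main() -> io::Result<()> {
    let mut buf = Vec::new();
    write_version(&mut buf, DocStoreVersion::V2)?;
    assert_eq!(read_version(&mut &buf[..])?, DocStoreVersion::V2);
    Ok(())
}
```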
@@ -129,6 +164,7 @@ impl StoreReader {
let skip_index = SkipIndex::open(index_data);
Ok(StoreReader {
decompressor: footer.decompressor,
doc_store_version: footer.doc_store_version,
data: data_file,
cache: BlockCache {
cache: NonZeroUsize::new(cache_num_blocks)
@@ -203,8 +239,9 @@ impl StoreReader {
pub fn get<D: DocumentDeserialize>(&self, doc_id: DocId) -> crate::Result<D> {
let mut doc_bytes = self.get_document_bytes(doc_id)?;

let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
.map_err(crate::TantivyError::from)?;
let deserializer =
BinaryDocumentDeserializer::from_reader(&mut doc_bytes, self.doc_store_version)
.map_err(crate::TantivyError::from)?;
D::deserialize(deserializer).map_err(crate::TantivyError::from)
}

@@ -244,8 +281,9 @@ impl StoreReader {
self.iter_raw(alive_bitset).map(|doc_bytes_res| {
let mut doc_bytes = doc_bytes_res?;

let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
.map_err(crate::TantivyError::from)?;
let deserializer =
BinaryDocumentDeserializer::from_reader(&mut doc_bytes, self.doc_store_version)
.map_err(crate::TantivyError::from)?;
D::deserialize(deserializer).map_err(crate::TantivyError::from)
})
}
@@ -391,8 +429,9 @@ impl StoreReader {
) -> crate::Result<D> {
let mut doc_bytes = self.get_document_bytes_async(doc_id, executor).await?;

let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
.map_err(crate::TantivyError::from)?;
let deserializer =
BinaryDocumentDeserializer::from_reader(&mut doc_bytes, self.doc_store_version)
.map_err(crate::TantivyError::from)?;
D::deserialize(deserializer).map_err(crate::TantivyError::from)
}
}
@@ -414,6 +453,11 @@ mod tests {
doc.get_first(*field).and_then(|f| f.as_value().as_str())
}

#[test]
fn test_doc_store_version_ord() {
assert!(DocStoreVersion::V1 < DocStoreVersion::V2);
}

#[test]
fn test_store_lru_cache() -> crate::Result<()> {
let directory = RamDirectory::create();

@@ -5,6 +5,7 @@ use std::{io, thread};

use common::{BinarySerializable, CountingWriter, TerminatingWrite};

use super::DOC_STORE_VERSION;
use crate::directory::WritePtr;
use crate::store::footer::DocStoreFooter;
use crate::store::index::{Checkpoint, SkipIndexBuilder};
@@ -143,8 +144,11 @@ impl BlockCompressorImpl {

fn close(mut self) -> io::Result<()> {
let header_offset: u64 = self.writer.written_bytes();
let docstore_footer =
DocStoreFooter::new(header_offset, Decompressor::from(self.compressor));
let docstore_footer = DocStoreFooter::new(
header_offset,
Decompressor::from(self.compressor),
DOC_STORE_VERSION,
);
self.offset_index_writer.serialize_into(&mut self.writer)?;
docstore_footer.serialize(&mut self.writer)?;
self.writer.terminate()
@@ -82,7 +82,7 @@ where A: Automaton
current_value: TermInfo,
}

impl<'a, A> TermStreamer<'a, A>
impl<A> TermStreamer<'_, A>
where A: Automaton
{
/// Advance position the stream on the next item.
@@ -136,7 +136,7 @@ where A: Automaton
}

/// Return the next `(key, value)` pair.
#[allow(clippy::should_implement_trait)]
#[expect(clippy::should_implement_trait)]
pub fn next(&mut self) -> Option<(&[u8], &TermInfo)> {
if self.advance() {
Some((self.key(), self.value()))

@@ -49,7 +49,6 @@ use crate::postings::TermInfo;

#[derive(Debug, Eq, PartialEq)]
#[repr(u32)]
#[allow(dead_code)]
enum DictionaryType {
Fst = 1,
SSTable = 2,
@@ -42,7 +42,7 @@ pub struct AsciiFoldingFilterTokenStream<'a, T> {
tail: T,
}

impl<'a, T: TokenStream> TokenStream for AsciiFoldingFilterTokenStream<'a, T> {
impl<T: TokenStream> TokenStream for AsciiFoldingFilterTokenStream<'_, T> {
fn advance(&mut self) -> bool {
if !self.tail.advance() {
return false;

@@ -40,7 +40,7 @@ impl Tokenizer for FacetTokenizer {
}
}

impl<'a> TokenStream for FacetTokenStream<'a> {
impl TokenStream for FacetTokenStream<'_> {
fn advance(&mut self) -> bool {
match self.state {
State::RootFacetNotEmitted => {

@@ -51,7 +51,7 @@ fn to_lowercase_unicode(text: &str, output: &mut String) {
}
}

impl<'a, T: TokenStream> TokenStream for LowerCaserTokenStream<'a, T> {
impl<T: TokenStream> TokenStream for LowerCaserTokenStream<'_, T> {
fn advance(&mut self) -> bool {
if !self.tail.advance() {
return false;

@@ -166,7 +166,7 @@ pub use self::whitespace_tokenizer::WhitespaceTokenizer;
pub const MAX_TOKEN_LEN: usize = u16::MAX as usize - 5;

#[cfg(test)]
pub mod tests {
pub(crate) mod tests {
use super::{
Language, LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer, Token, TokenizerManager,
};

@@ -159,7 +159,7 @@ impl Tokenizer for NgramTokenizer {
}
}

impl<'a> TokenStream for NgramTokenStream<'a> {
impl TokenStream for NgramTokenStream<'_> {
fn advance(&mut self) -> bool {
if let Some((offset_from, offset_to)) = self.ngram_charidx_iterator.next() {
if self.prefix_only && offset_from > 0 {
@@ -283,7 +283,7 @@ impl<'a> CodepointFrontiers<'a> {
}
}

impl<'a> Iterator for CodepointFrontiers<'a> {
impl Iterator for CodepointFrontiers<'_> {
type Item = usize;

fn next(&mut self) -> Option<usize> {

@@ -28,7 +28,7 @@ impl Tokenizer for RawTokenizer {
}
}

impl<'a> TokenStream for RawTokenStream<'a> {
impl TokenStream for RawTokenStream<'_> {
fn advance(&mut self) -> bool {
let result = self.has_token;
self.has_token = false;

@@ -4,6 +4,7 @@ use super::{Token, TokenStream, Tokenizer};
use crate::TantivyError;

/// Tokenize the text by using a regex pattern to split.
///
/// Each match of the regex emits a distinct token, empty tokens will not be emitted. Anchors such
/// as `\A` will match the text from the part where the last token was emitted or the beginning of
/// the complete text if no token was emitted yet.
@@ -83,7 +84,7 @@ pub struct RegexTokenStream<'a> {
cursor: usize,
}

impl<'a> TokenStream for RegexTokenStream<'a> {
impl TokenStream for RegexTokenStream<'_> {
fn advance(&mut self) -> bool {
let Some(regex_match) = self.regex.find(self.text) else {
return false;

@@ -27,7 +27,7 @@ impl Tokenizer for SimpleTokenizer {
}
}

impl<'a> SimpleTokenStream<'a> {
impl SimpleTokenStream<'_> {
// search for the end of the current token.
fn search_token_end(&mut self) -> usize {
(&mut self.chars)
@@ -38,7 +38,7 @@ impl<'a> SimpleTokenStream<'a> {
}
}

impl<'a> TokenStream for SimpleTokenStream<'a> {
impl TokenStream for SimpleTokenStream<'_> {
fn advance(&mut self) -> bool {
self.token.text.clear();
self.token.position = self.token.position.wrapping_add(1);
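The long tail of tokenizer hunks above is the same mechanical cleanup everywhere: when an impl never uses the lifetime by name, `impl<'a> Trait for Foo<'a>` becomes `impl Trait for Foo<'_>`, with no behavioral change. A small, hypothetical equivalent pair:

```rust
struct CharCursor<'a> {
    text: &'a str,
    offset: usize,
}

// Same meaning as `impl<'a> Iterator for CharCursor<'a>`; the lifetime is
// simply left anonymous because the body never needs to name it.
impl Iterator for CharCursor<'_> {
    type Item = char;

    fn next(&mut self) -> Option<char> {
        let c = self.text[self.offset..].chars().next()?;
        self.offset += c.len_utf8();
        Some(c)
    }
}

fn main() {
    let cursor = CharCursor { text: "abc", offset: 0 };
    assert_eq!(cursor.collect::<String>(), "abc");
}
```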