Compare commits

..

27 Commits

Author SHA1 Message Date
Pascal Seitz
b345c11786 add key_as_string for numbers in term agg 2024-07-25 13:20:56 +08:00
PSeitz
7ebcc15b17 add support for str fast field range query (#2453)
* add support for str fast field range query

Add support for range queries on str fast fields by converting term bounds to
term ordinal bounds (a sketch follows this entry).

closes https://github.com/quickwit-oss/tantivy/issues/2023

* extend tests, rename

* update comment

* update comment
2024-07-17 09:31:42 +08:00
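
A minimal sketch of the bound conversion described in the entry above, assuming a plain sorted term list in place of tantivy's actual dictionary (`TermDict`, `ord_ge`, and `ord_gt` are illustrative names): each string bound is mapped to an inclusive bound on term ordinals, so the range query can then run purely on the fast field's ordinals.

```rust
use std::ops::Bound;

// Hypothetical stand-in for a sorted term dictionary.
struct TermDict {
    terms: Vec<String>, // sorted; index == term ordinal
}

impl TermDict {
    // Ordinal of the first term >= key.
    fn ord_ge(&self, key: &str) -> u64 {
        self.terms.partition_point(|t| t.as_str() < key) as u64
    }
    // Ordinal of the first term > key.
    fn ord_gt(&self, key: &str) -> u64 {
        self.terms.partition_point(|t| t.as_str() <= key) as u64
    }
}

// Map a string lower bound onto an inclusive ordinal lower bound.
fn lower_bound_to_ord(dict: &TermDict, bound: Bound<&str>) -> Bound<u64> {
    match bound {
        Bound::Included(key) => Bound::Included(dict.ord_ge(key)),
        Bound::Excluded(key) => Bound::Included(dict.ord_gt(key)),
        Bound::Unbounded => Bound::Unbounded,
    }
}
```
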
PSeitz
1b4076691f refactor fast field query (#2452)
In preparation for #2023 and #1709

* Use Term to pass parameters
* merge u64 and ip fast field range query

Side note: range_query_u64_fastfield was not renamed, because git would then be unable to track the changes.
2024-07-15 18:08:05 +08:00
Robert Caulk
eab660873a doc: fix typo in readme (#2450) 2024-07-09 15:12:22 +08:00
PSeitz
232f37126e fix coverage (#2448) 2024-07-05 12:04:18 +08:00
PSeitz
13e9885dfd faster term aggregation fetch terms (#2447)
Big impact for term aggregations with a large `size` parameter (e.g. 1000).
Adds a top-1000 term agg bench.

```
full
terms_few                                      Memory: 27.3 KB (+79.09%)    Avg: 3.8058ms (+2.40%)      Median: 3.7192ms (+3.47%)       [3.6224ms .. 4.3721ms]
terms_many                                     Memory: 6.9 MB               Avg: 12.6102ms (-4.70%)     Median: 12.1389ms (-6.58%)      [10.2847ms .. 15.4857ms]
terms_many_top_1000                            Memory: 6.9 MB               Avg: 15.8216ms (-83.19%)    Median: 15.4899ms (-83.46%)     [13.4250ms .. 20.6897ms]
terms_many_order_by_term                       Memory: 6.9 MB               Avg: 14.7820ms (-3.95%)     Median: 14.2236ms (-4.28%)      [12.6669ms .. 21.0968ms]
terms_many_with_top_hits                       Memory: 58.2 MB              Avg: 551.6218ms (+7.18%)    Median: 549.8826ms (+11.01%)    [496.7371ms .. 592.1299ms]
terms_many_with_avg_sub_agg                    Memory: 27.8 MB              Avg: 197.7029ms (+2.66%)    Median: 190.1564ms (+0.64%)     [167.9226ms .. 245.6651ms]
terms_many_json_mixed_type_with_avg_sub_agg    Memory: 42.0 MB (+0.00%)     Avg: 242.0121ms (+0.92%)    Median: 237.7084ms (-2.85%)     [201.9959ms .. 302.2136ms]
terms_few_with_cardinality_agg                 Memory: 10.6 MB              Avg: 122.6036ms (+1.21%)    Median: 119.0033ms (+2.60%)     [109.2859ms .. 161.5858ms]
range_agg_with_term_agg_few                    Memory: 45.4 KB (+39.75%)    Avg: 24.5454ms (+2.14%)     Median: 24.2861ms (+2.44%)      [23.5109ms .. 27.8406ms]
range_agg_with_term_agg_many                   Memory: 6.9 MB               Avg: 56.8049ms (+3.01%)     Median: 50.9706ms (+1.52%)      [41.4517ms .. 90.3934ms]
dense
terms_few                                      Memory: 28.8 KB (+81.74%)    Avg: 8.9092ms (-2.24%)      Median: 8.7143ms (-1.31%)      [8.6148ms .. 10.3868ms]
terms_many                                     Memory: 6.9 MB (-0.00%)      Avg: 17.9604ms (-10.18%)    Median: 17.1552ms (-11.93%)    [14.8979ms .. 26.2779ms]
terms_many_top_1000                            Memory: 6.9 MB               Avg: 21.4963ms (-78.90%)    Median: 21.2924ms (-78.98%)    [18.2033ms .. 28.0087ms]
terms_many_order_by_term                       Memory: 6.9 MB               Avg: 20.4167ms (-9.13%)     Median: 19.5596ms (-11.37%)    [17.5153ms .. 29.5987ms]
terms_many_with_top_hits                       Memory: 58.2 MB              Avg: 518.4474ms (-6.41%)    Median: 514.9180ms (-9.44%)    [471.5550ms .. 579.0220ms]
terms_many_with_avg_sub_agg                    Memory: 27.8 MB              Avg: 263.6702ms (-2.78%)    Median: 260.8775ms (-2.55%)    [239.5754ms .. 304.6669ms]
terms_many_json_mixed_type_with_avg_sub_agg    Memory: 42.0 MB              Avg: 299.9791ms (-2.01%)    Median: 302.2180ms (-3.08%)    [239.2080ms .. 346.3649ms]
terms_few_with_cardinality_agg                 Memory: 10.6 MB              Avg: 136.3303ms (-3.12%)    Median: 132.3831ms (-2.88%)    [123.7564ms .. 164.7914ms]
range_agg_with_term_agg_few                    Memory: 47.1 KB (+37.81%)    Avg: 35.4538ms (+0.66%)     Median: 34.8754ms (-0.56%)     [34.2287ms .. 40.0884ms]
range_agg_with_term_agg_many                   Memory: 6.9 MB               Avg: 72.2269ms (-4.38%)     Median: 66.1174ms (-4.98%)     [55.5125ms .. 124.1622ms]
sparse
terms_few                                      Memory: 27.3 KB (+69.68%)    Avg: 19.6053ms (-1.15%)     Median: 19.4543ms (-0.38%)     [19.3056ms .. 24.0547ms]
terms_many                                     Memory: 1.8 MB               Avg: 21.2886ms (-6.28%)     Median: 21.1287ms (-6.65%)     [20.6640ms .. 24.6144ms]
terms_many_top_1000                            Memory: 2.6 MB               Avg: 23.4869ms (-85.53%)    Median: 23.3393ms (-85.61%)    [22.7789ms .. 25.0896ms]
terms_many_order_by_term                       Memory: 1.8 MB               Avg: 21.7437ms (-7.78%)     Median: 21.6272ms (-7.66%)     [21.0409ms .. 23.6517ms]
terms_many_with_top_hits                       Memory: 13.1 MB              Avg: 43.7926ms (-2.76%)     Median: 44.3602ms (+0.01%)     [37.8039ms .. 51.0451ms]
terms_many_with_avg_sub_agg                    Memory: 7.5 MB               Avg: 34.6307ms (+3.72%)     Median: 33.4522ms (+1.16%)     [32.4418ms .. 41.4196ms]
terms_many_json_mixed_type_with_avg_sub_agg    Memory: 7.4 MB               Avg: 46.4318ms (+1.16%)     Median: 46.4050ms (+2.03%)     [44.5986ms .. 48.5142ms]
terms_few_with_cardinality_agg                 Memory: 680.0 KB (-0.04%)    Avg: 35.4410ms (+2.05%)     Median: 35.1384ms (+1.19%)     [34.4402ms .. 39.1082ms]
range_agg_with_term_agg_few                    Memory: 45.7 KB (+39.44%)    Avg: 22.7760ms (+0.44%)     Median: 22.5152ms (-0.35%)     [22.3078ms .. 26.1567ms]
range_agg_with_term_agg_many                   Memory: 1.8 MB               Avg: 25.7696ms (-4.45%)     Median: 25.4009ms (-5.61%)     [24.7874ms .. 29.6434ms]
multivalue
terms_few                                      Memory: 244.4 KB            Avg: 15.1253ms (-2.85%)     Median: 15.0988ms (-0.54%)     [14.8790ms .. 15.8193ms]
terms_many                                     Memory: 6.9 MB (-0.00%)     Avg: 26.3019ms (-6.24%)     Median: 26.3662ms (-4.94%)     [21.3553ms .. 31.0564ms]
terms_many_top_1000                            Memory: 6.9 MB              Avg: 29.5212ms (-72.90%)    Median: 29.4257ms (-72.84%)    [24.2645ms .. 35.1607ms]
terms_many_order_by_term                       Memory: 6.9 MB              Avg: 28.6076ms (-4.93%)     Median: 28.1059ms (-6.64%)     [24.0845ms .. 34.1493ms]
terms_many_with_top_hits                       Memory: 58.3 MB             Avg: 570.1548ms (+1.52%)    Median: 572.7759ms (+0.53%)    [525.9567ms .. 617.0862ms]
terms_many_with_avg_sub_agg                    Memory: 27.8 MB             Avg: 305.5207ms (+0.24%)    Median: 296.0101ms (-0.22%)    [277.8579ms .. 373.5914ms]
terms_many_json_mixed_type_with_avg_sub_agg    Memory: 42.0 MB (-0.00%)    Avg: 324.7342ms (-2.51%)    Median: 319.0025ms (-2.58%)    [298.7122ms .. 368.6144ms]
terms_few_with_cardinality_agg                 Memory: 10.8 MB             Avg: 151.6126ms (-2.54%)    Median: 149.0616ms (-0.32%)    [136.5592ms .. 181.8942ms]
range_agg_with_term_agg_few                    Memory: 248.2 KB            Avg: 49.5225ms (+3.11%)     Median: 48.3994ms (+3.18%)     [46.4134ms .. 60.5989ms]
range_agg_with_term_agg_many                   Memory: 6.9 MB              Avg: 85.9824ms (-3.66%)    Median: 78.4266ms (-3.85%)    [64.1231ms .. 128.5279ms]
```
2024-07-03 12:42:59 +08:00
PSeitz
56d79cb203 fix cardinality aggregation performance (#2446)
* fix cardinality aggregation performance

Fix cardinality aggregation performance by fetching multiple terms at once. This
avoids decompressing the same block repeatedly and keeps the buffer state between
terms.
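
A rough sketch of the batching idea, with hypothetical names (`load_block` and the block layout are assumptions, not tantivy's API): ordinals arrive sorted, so each dictionary block is decompressed at most once and reused for every ordinal that falls inside it.

```rust
// Resolve sorted term ordinals to terms, reusing the decompressed block.
fn fetch_terms_batched(
    sorted_ords: &[u64],
    // Hypothetical: returns (block_start_ord, terms) for the block holding `ord`.
    mut load_block: impl FnMut(u64) -> (u64, Vec<String>),
    out: &mut Vec<String>,
) {
    let mut current: Option<(u64, Vec<String>)> = None;
    for &ord in sorted_ords {
        let needs_reload = match &current {
            Some((start, terms)) => ord < *start || ord >= *start + terms.len() as u64,
            None => true,
        };
        if needs_reload {
            current = Some(load_block(ord)); // decompress a new block only when needed
        }
        let (start, terms) = current.as_ref().unwrap();
        out.push(terms[(ord - *start) as usize].clone());
    }
}
```
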

add cardinality aggregation benchmark

bump rust version to 1.66

Performance comparison to before (AllQuery)
```
full
cardinality_agg                   Memory: 3.5 MB (-0.00%)    Avg: 21.2256ms (-97.78%)    Median: 21.0042ms (-97.82%)    [20.4717ms .. 23.6206ms]
terms_few_with_cardinality_agg    Memory: 10.6 MB            Avg: 81.9293ms (-97.37%)    Median: 81.5526ms (-97.38%)    [79.7564ms .. 88.0374ms]
dense
cardinality_agg                   Memory: 3.6 MB (-0.00%)    Avg: 25.9372ms (-97.24%)    Median: 25.7744ms (-97.25%)    [24.7241ms .. 27.8793ms]
terms_few_with_cardinality_agg    Memory: 10.6 MB            Avg: 93.9897ms (-96.91%)    Median: 92.7821ms (-96.94%)    [90.3312ms .. 117.4076ms]
sparse
cardinality_agg                   Memory: 895.4 KB (-0.00%)    Avg: 22.5113ms (-95.01%)    Median: 22.5629ms (-94.99%)    [22.1628ms .. 22.9436ms]
terms_few_with_cardinality_agg    Memory: 680.2 KB             Avg: 26.4250ms (-94.85%)    Median: 26.4135ms (-94.86%)    [26.3210ms .. 26.6774ms]
```

* clippy

* assert for sorted ordinals
2024-07-02 15:29:00 +08:00
Paul Masurel
0f4c2e27cf Fixes a bug that causes out-of-order sstable keys. (#2445)
The previous way to address the problem was to replace \u{0000}
with 0 in different places.

This logic had several flaws:
When done on the serializer side (as it was for the columnar), there was
a collision problem.

If a document in the segment contained a JSON field with a \0 and
another doc contained the same JSON field but with `0`, then we were sending
the same field path twice to the serializer.
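
To make the collision concrete, a tiny illustrative snippet (the `normalize` helper is hypothetical): after the substitution, two distinct JSON paths serialize to the same key, which breaks the sstable's strictly increasing key order.

```rust
// Replacing '\0' with '0' maps two distinct paths onto the same key.
fn normalize(path: &str) -> String {
    path.replace('\u{0000}', "0")
}

fn main() {
    // "a\0b" and "a0b" collide, so the serializer would receive the
    // same field path twice.
    assert_eq!(normalize("a\u{0000}b"), normalize("a0b"));
}
```
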

Another option would have been to normalize all values on the writer
side.

This PR simplifies the logic and simply ignores JSON paths containing a
\0, both in the columnar and the inverted index.

Closes #2442
2024-07-01 15:40:07 +08:00
落叶乌龟
f9ae295507 feat(query): Make BooleanQuery support minimum_number_should_match (#2405)
* feat(query): Make `BooleanQuery` support `minimum_number_should_match`. See issue #2398

This commit introduces a new scorer, `DisjunctionScorer`, which computes the union of posting lists subject to a minimum number of required matching clauses; it is implemented with a min-heap. The necessary modifications to `BooleanQuery` and `BooleanWeight` are made as well.
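
A toy illustration of such a min-heap union (not the actual `DisjunctionScorer`; posting lists are plain sorted vectors here): equal heads are popped and counted, and a doc id is emitted only if at least `min_should_match` lists contain it.

```rust
use std::cmp::Reverse;
use std::collections::BinaryHeap;

fn disjunction(postings: Vec<Vec<u32>>, min_should_match: usize) -> Vec<u32> {
    // Min-heap of (next doc id, list index, cursor into that list).
    let mut heap: BinaryHeap<Reverse<(u32, usize, usize)>> = postings
        .iter()
        .enumerate()
        .filter(|(_, p)| !p.is_empty())
        .map(|(i, p)| Reverse((p[0], i, 0)))
        .collect();
    let mut out = Vec::new();
    while let Some(Reverse((doc, _, _))) = heap.peek().copied() {
        let mut matching = 0;
        // Pop every list currently positioned on `doc`, advancing its cursor.
        while let Some(&Reverse((d, i, c))) = heap.peek() {
            if d != doc {
                break;
            }
            heap.pop();
            matching += 1;
            if c + 1 < postings[i].len() {
                heap.push(Reverse((postings[i][c + 1], i, c + 1)));
            }
        }
        if matching >= min_should_match {
            out.push(doc);
        }
    }
    out
}
```
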

* fixup! fix test

* fixup!: refactor code.

1. More meaningful names.
2. Add Cache for `Disjunction`'s scorers, and fix bug.
3. Optimize `BooleanWeight::complex_scorer`

Thanks
 Paul Masurel <paul@quickwit.io>

* squash!: come up with better variable naming.

* squash!: fix naming issues.

* squash!: fix typo.

* squash!: Remove CombinationMethod::FullIntersection
2024-07-01 15:39:41 +08:00
Raphael Coeffic
d9db5302d9 feat: cardinality aggregation (#2337)
* WiP: cardinality aggregation

* Collect unique entries first, then insert into HyperLogLog

* Handle `missing`

* Hybrid approach

* Review changes

- insert `missing` value at most once
- `term_id` -> `term_ord`
- iterate directly over entries without collecting first

* Use salted hasher to include column type (see the sketch after this entry)

* fix: formatting

* More review fixes

* Add cardinality to test_aggregation_flushing

* Formatting
2024-07-01 07:49:42 +08:00
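
A small sketch of the salted-hasher idea mentioned above (the hasher choice and the type code are illustrative assumptions): mixing the column type into the hash keeps equal-looking values from differently typed columns distinct inside the HyperLogLog.

```rust
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

// Salt the value hash with a column-type code so that, e.g., the string
// "1" and the number 1 in a mixed-type JSON field count as two values.
fn salted_hash(column_type_code: u8, term_ord: u64) -> u64 {
    let mut hasher = DefaultHasher::new();
    column_type_code.hash(&mut hasher);
    term_ord.hash(&mut hasher);
    hasher.finish()
}
```
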
Paul Masurel
e453848134 Recycling buffer in PrefixPhraseScorer (#2443) 2024-06-24 17:11:53 +09:00
PSeitz
59084143ef use optional index in multivalued index (#2439)
* use optional index in multivalued index

For mostly empty multivalued indices there was a large overhead during
creation when iterating all docids. This is alleviated by placing an
optional index in the multivalued index to mark the documents that have values.

There's some performance overhead when accessing values in a multivalued
index: the access cost is now optional index + multivalue index. The
sparse codec performs relatively badly with binary search when accessing
data. This is reflected in the benchmarks below.

This changes the columnar format to v2, but code is added to handle the v1
format.
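
A minimal sketch of the resulting two-step lookup, with simplified shapes (the closure and slice stand in for the real index types): the optional index maps a doc id to its rank among documents that have values, and the start-offset column is then indexed by rank instead of by doc id, which is what makes mostly-empty columns cheap.

```rust
use std::ops::Range;

// `rank_if_exists` stands in for the optional index: the doc's rank
// among docs with values, or None if the doc has no values at all.
fn value_range(
    rank_if_exists: impl Fn(u32) -> Option<u32>,
    start_offsets: &[u32], // one entry per doc with values, plus a final end offset
    doc_id: u32,
) -> Range<u32> {
    match rank_if_exists(doc_id) {
        None => 0..0, // empty range: the doc has no values
        Some(rank) => start_offsets[rank as usize]..start_offsets[rank as usize + 1],
    }
}
```
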

```
     Running benches/bench_access.rs (/home/pascal/Development/tantivy/optional_multivalues/target/release/deps/bench_access-ea323c028db88db4)
multi sparse 1/13
access_values_for_doc        Avg: 42.8946ms (+241.80%)    Median: 42.8869ms (+244.10%)    [42.7484ms .. 43.1074ms]
access_first_vals            Avg: 42.8022ms (+421.93%)    Median: 42.7553ms (+439.84%)    [42.6794ms .. 43.7404ms]
multi 2x
access_values_for_doc        Avg: 31.1244ms (+24.17%)    Median: 30.8339ms (+23.46%)    [30.7192ms .. 33.6059ms]
access_first_vals            Avg: 24.3070ms (+70.92%)    Median: 24.0966ms (+70.18%)    [23.9328ms .. 26.4851ms]
sparse 1/13
access_values_for_doc        Avg: 42.2490ms (+0.61%)    Median: 42.2346ms (+2.28%)    [41.8988ms .. 43.7821ms]
access_first_vals            Avg: 43.6272ms (+0.23%)    Median: 43.6197ms (+1.78%)    [43.4920ms .. 43.9009ms]
dense 1/12
access_values_for_doc        Avg: 8.6184ms (+23.18%)    Median: 8.6126ms (+23.78%)    [8.5843ms .. 8.7527ms]
access_first_vals            Avg: 6.8112ms (+4.47%)     Median: 6.8002ms (+4.55%)     [6.7887ms .. 6.8991ms]
full
access_values_for_doc        Avg: 9.4073ms (-5.09%)    Median: 9.4023ms (-2.23%)    [9.3694ms .. 9.4568ms]
access_first_vals            Avg: 4.9531ms (+6.24%)    Median: 4.9502ms (+7.85%)    [4.9423ms .. 4.9718ms]
```

```
     Running benches/bench_merge.rs (/home/pascal/Development/tantivy/optional_multivalues/target/release/deps/bench_merge-475697dfceb3639f)
merge_multi 2x_and_multi 2x                          Avg: 20.2280ms (+34.33%)    Median: 20.1829ms (+35.33%)    [19.9933ms .. 20.8806ms]
merge_multi sparse 1/13_and_multi sparse 1/13        Avg: 0.8961ms (-78.04%)     Median: 0.8943ms (-77.61%)     [0.8899ms .. 0.9272ms]
merge_dense 1/12_and_dense 1/12                      Avg: 0.6619ms (-1.26%)      Median: 0.6616ms (+2.20%)      [0.6473ms .. 0.6837ms]
merge_sparse 1/13_and_sparse 1/13                    Avg: 0.5508ms (-0.85%)      Median: 0.5508ms (+2.80%)      [0.5420ms .. 0.5634ms]
merge_sparse 1/13_and_dense 1/12                     Avg: 0.6046ms (-4.64%)      Median: 0.6038ms (+2.80%)      [0.5939ms .. 0.6296ms]
merge_multi sparse 1/13_and_dense 1/12               Avg: 0.9111ms (-83.48%)     Median: 0.9063ms (-83.50%)     [0.9047ms .. 0.9663ms]
merge_multi sparse 1/13_and_sparse 1/13              Avg: 0.8451ms (-89.49%)     Median: 0.8428ms (-89.43%)     [0.8411ms .. 0.8563ms]
merge_multi 2x_and_dense 1/12                        Avg: 10.6624ms (-4.82%)     Median: 10.6568ms (-4.49%)     [10.5738ms .. 10.8353ms]
merge_multi 2x_and_sparse 1/13                       Avg: 10.6336ms (-22.95%)    Median: 10.5925ms (-22.33%)    [10.5149ms .. 11.5657ms]
```

* Update columnar/src/columnar/format_version.rs

Co-authored-by: Paul Masurel <paul@quickwit.io>

* Update columnar/src/column_index/mod.rs

Co-authored-by: Paul Masurel <paul@quickwit.io>

---------

Co-authored-by: Paul Masurel <paul@quickwit.io>
2024-06-19 14:54:12 +08:00
PSeitz
511b027350 update columnar bench (#2438)
* update columnar bench

* fix compile
2024-06-14 10:42:35 +08:00
Philippe Noël
322f47eb47 Add ParadeDB to Companies List (#1) (#2437)
* Add ParadeDB logo
2024-06-14 09:12:58 +09:00
PSeitz
72f61ff89c remove index sorting (#2434)
closes https://github.com/quickwit-oss/tantivy/issues/2352
2024-06-13 15:51:53 +08:00
PSeitz
a141c3ec59 add columnar format compatibility tests (#2433)
* add columnar format compatibility tests

* always try to write current format
2024-06-13 15:04:52 +08:00
PSeitz
e90e7a25ae add access benchmark for columnar (#2432) 2024-06-12 14:29:15 +08:00
PSeitz
c3b92a5412 fix compiler warning, cleanup (#2393)
fix compiler warning for missing feature flag
remove unused variables
cleanup unused methods
2024-06-11 16:03:50 +08:00
PSeitz
2f55511064 extend indexwriter proptests (#2342)
* index random values in proptest

* add proptest with multiple docs
2024-06-11 16:02:57 +08:00
trinity-1686a
08b9fc0b31 fix de-escaping too much in query parser (#2427)
* fix de-escaping too much in query parser
2024-06-10 11:19:01 +02:00
PSeitz
714f363d43 add bench & test for columnar merging (#2428)
* add merge columnar proptest

* add columnar merge benchmark
2024-06-10 16:26:16 +08:00
PSeitz
93ff7365b0 reduce top hits aggregation memory consumption (#2426)
Move the request structure out of the top hits aggregation collector and use the
passed structure instead.
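
A sketch of the structural change, with hypothetical type names: per-bucket collectors borrow one shared request instead of each owning a copy, so memory no longer scales with the number of buckets times the size of the request.

```rust
// Shared once per aggregation request.
struct TopHitsReq {
    size: usize,
    sort_field: String,
}

// Created per bucket: borrows the request rather than cloning it.
struct TopHitsSegmentCollector<'a> {
    req: &'a TopHitsReq,
    hits: Vec<(u64, u32)>, // (sort value, doc id)
}

impl<'a> TopHitsSegmentCollector<'a> {
    fn for_bucket(req: &'a TopHitsReq) -> Self {
        Self {
            req,
            hits: Vec::with_capacity(req.size),
        }
    }
}
```
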

```
full
terms_many_with_top_hits    Memory: 58.2 MB (-43.64%)    Avg: 425.9680ms (-21.38%)    Median: 415.1097ms (-23.56%)    [395.5303ms .. 484.6325ms]
dense
terms_many_with_top_hits    Memory: 58.2 MB (-43.64%)    Avg: 440.0817ms (-19.68%)    Median: 432.2286ms (-21.10%)    [403.5632ms .. 497.7541ms]
sparse
terms_many_with_top_hits    Memory: 13.1 MB (-49.31%)    Avg: 33.3568ms (-32.19%)    Median: 33.0834ms (-31.86%)    [32.5126ms .. 35.7397ms]
multivalue
terms_many_with_top_hits    Memory: 58.2 MB (-43.64%)    Avg: 414.2340ms (-25.44%)    Median: 413.4144ms (-25.64%)    [403.9919ms .. 430.3170ms]
```
2024-06-06 22:32:58 +08:00
Adam Reichold
8151925068 Panicking in spawned Rayon tasks will abort the process by default. (#2409) 2024-06-04 17:04:30 +09:00
dependabot[bot]
b960e40bc8 Update sketches-ddsketch requirement from 0.2.1 to 0.3.0 (#2423)
Updates the requirements on [sketches-ddsketch](https://github.com/mheffner/rust-sketches-ddsketch) to permit the latest version.
- [Release notes](https://github.com/mheffner/rust-sketches-ddsketch/releases)
- [Commits](https://github.com/mheffner/rust-sketches-ddsketch/compare/v0.2.1...v0.3.0)

---
updated-dependencies:
- dependency-name: sketches-ddsketch
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-06-04 15:50:23 +08:00
giovannicuccu
1095c9b073 Issue 1787 extended stats (#2247)
* first version of extended stats along with its tests

* using IntermediateExtendStats instead of IntermediateStats with all tests passing

* Created struct for request and response

* first test with extended_stats

* kahan summation and tests with approximate equality (see the sketch after this entry)

* version ready for merge

* removed approx dependency

* refactor for using ExtendedStats only when needed

* interim version

* refined version with code formatted

* refactored a struct

* cosmetic refactor

* fix after merge

* fix format

* added extended_stat bench

* merge and new benchmark for extended stats

* split stat segment collectors

* wrapped intermediate extended stat with a box to limit memory usage

* Revert "wrapped intermediate extended stat with a box to limit memory usage"

This reverts commit 5b4aa9f393.

* some code reformat, commented kahan summation

* refactor after review

* refactor after code review

* fix after incorrectly restoring kahan summation

* modifications for code review + bug fix in merge_fruit

* refactor assert_nearly_equals macro

* update after code review

---------

Co-authored-by: Giovanni Cuccu <gcuccu@imolainformatica.it>
2024-06-04 14:25:17 +08:00
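
For reference, a sketch of the Kahan (compensated) summation mentioned in the bullets above (standalone, not the commit's actual code): a running compensation term recovers the low-order bits that plain accumulation loses over many f64 values.

```rust
// Compensated summation: `c` tracks the rounding error of each step.
fn kahan_sum(values: impl IntoIterator<Item = f64>) -> f64 {
    let mut sum = 0.0;
    let mut c = 0.0;
    for v in values {
        let y = v - c; // corrected next value
        let t = sum + y; // low-order bits of y may be lost here...
        c = (t - sum) - y; // ...and are recovered into the compensation
        sum = t;
    }
    sum
}
```
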
PSeitz
c0686515a9 update one_shot (#2420) 2024-05-31 11:07:35 +08:00
trinity-1686a
455156f51c improve query parser (#2416)
* support escape sequences in more places

and fix a bug with single-quoted strings

* add query parser test for range query on default field
2024-05-30 17:29:27 +02:00
112 changed files with 5849 additions and 4121 deletions

View File

@@ -15,11 +15,11 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Install Rust
run: rustup toolchain install nightly-2024-04-10 --profile minimal --component llvm-tools-preview
run: rustup toolchain install nightly-2024-07-01 --profile minimal --component llvm-tools-preview
- uses: Swatinem/rust-cache@v2
- uses: taiki-e/install-action@cargo-llvm-cov
- name: Generate code coverage
run: cargo +nightly-2024-04-10 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
run: cargo +nightly-2024-07-01 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
continue-on-error: true

View File

@@ -11,12 +11,11 @@ repository = "https://github.com/quickwit-oss/tantivy"
readme = "README.md"
keywords = ["search", "information", "retrieval"]
edition = "2021"
rust-version = "1.63"
rust-version = "1.66"
exclude = ["benches/*.json", "benches/*.txt"]
[dependencies]
# Switch back to the non-forked oneshot crate once https://github.com/faern/oneshot/pull/35 is merged
oneshot = { git = "https://github.com/fulmicoton/oneshot.git", rev = "b208f49" }
oneshot = "0.1.7"
base64 = "0.22.0"
byteorder = "1.4.3"
crc32fast = "1.3.2"
@@ -39,7 +38,7 @@ levenshtein_automata = "0.2.1"
uuid = { version = "1.0.0", features = ["v4", "serde"] }
crossbeam-channel = "0.5.4"
rust-stemmers = "1.2.0"
downcast-rs = "1.2.0"
downcast-rs = "1.2.1"
bitpacking = { version = "0.9.2", default-features = false, features = [
"bitpacker4x",
] }
@@ -64,7 +63,8 @@ query-grammar = { version = "0.22.0", path = "./query-grammar", package = "tanti
tantivy-bitpacker = { version = "0.6", path = "./bitpacker" }
common = { version = "0.7", path = "./common/", package = "tantivy-common" }
tokenizer-api = { version = "0.3", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
sketches-ddsketch = { version = "0.2.1", features = ["use_serde"] }
sketches-ddsketch = { version = "0.3.0", features = ["use_serde"] }
hyperloglogplus = { version = "0.4.1", features = ["const-loop"] }
futures-util = { version = "0.3.28", optional = true }
fnv = "1.0.7"

View File

@@ -18,7 +18,7 @@ Tantivy is, in fact, strongly inspired by Lucene's design.
## Benchmark
The following [benchmark](https://tantivy-search.github.io/bench/) breakdowns
The following [benchmark](https://tantivy-search.github.io/bench/) breaks down the
performance for different types of queries/collections.
Your mileage WILL vary depending on the nature of queries and their load.
@@ -101,7 +101,8 @@ cargo test
## Companies Using Tantivy
<p align="left">
<img align="center" src="doc/assets/images/etsy.png" alt="Etsy" height="25" width="auto" />&nbsp;
<img align="center" src="doc/assets/images/etsy.png" alt="Etsy" height="25" width="auto" /> &nbsp;
<img align="center" src="doc/assets/images/paradedb.png" alt="ParadeDB" height="25" width="auto" /> &nbsp;
<img align="center" src="doc/assets/images/Nuclia.png#gh-light-mode-only" alt="Nuclia" height="25" width="auto" /> &nbsp;
<img align="center" src="doc/assets/images/humanfirst.png#gh-light-mode-only" alt="Humanfirst.ai" height="30" width="auto" />
<img align="center" src="doc/assets/images/element.io.svg#gh-light-mode-only" alt="Element.io" height="25" width="auto" />

View File

@@ -47,13 +47,19 @@ fn bench_agg(mut group: InputGroup<Index>) {
register!(group, average_f64);
register!(group, average_f64_u64);
register!(group, stats_f64);
register!(group, extendedstats_f64);
register!(group, percentiles_f64);
register!(group, terms_few);
register!(group, terms_many);
register!(group, terms_many_top_1000);
register!(group, terms_many_order_by_term);
register!(group, terms_many_with_top_hits);
register!(group, terms_many_with_avg_sub_agg);
register!(group, terms_many_json_mixed_type_with_sub_agg_card);
register!(group, terms_many_json_mixed_type_with_avg_sub_agg);
register!(group, cardinality_agg);
register!(group, terms_few_with_cardinality_agg);
register!(group, range_agg);
register!(group, range_agg_with_avg_sub_agg);
register!(group, range_agg_with_term_agg_few);
@@ -105,7 +111,12 @@ fn stats_f64(index: &Index) {
});
exec_term_with_agg(index, agg_req)
}
fn extendedstats_f64(index: &Index) {
let agg_req = json!({
"extendedstats_f64": { "extended_stats": { "field": "score_f64", } }
});
exec_term_with_agg(index, agg_req)
}
fn percentiles_f64(index: &Index) {
let agg_req = json!({
"mypercentiles": {
@@ -117,6 +128,33 @@ fn percentiles_f64(index: &Index) {
});
execute_agg(index, agg_req);
}
fn cardinality_agg(index: &Index) {
let agg_req = json!({
"cardinality": {
"cardinality": {
"field": "text_many_terms"
},
}
});
execute_agg(index, agg_req);
}
fn terms_few_with_cardinality_agg(index: &Index) {
let agg_req = json!({
"my_texts": {
"terms": { "field": "text_few_terms" },
"aggs": {
"cardinality": {
"cardinality": {
"field": "text_many_terms"
},
}
}
},
});
execute_agg(index, agg_req);
}
fn terms_few(index: &Index) {
let agg_req = json!({
"my_texts": { "terms": { "field": "text_few_terms" } },
@@ -129,6 +167,12 @@ fn terms_many(index: &Index) {
});
execute_agg(index, agg_req);
}
fn terms_many_top_1000(index: &Index) {
let agg_req = json!({
"my_texts": { "terms": { "field": "text_many_terms", "size": 1000 } },
});
execute_agg(index, agg_req);
}
fn terms_many_order_by_term(index: &Index) {
let agg_req = json!({
"my_texts": { "terms": { "field": "text_many_terms", "order": { "_key": "desc" } } },
@@ -165,7 +209,7 @@ fn terms_many_with_avg_sub_agg(index: &Index) {
});
execute_agg(index, agg_req);
}
fn terms_many_json_mixed_type_with_sub_agg_card(index: &Index) {
fn terms_many_json_mixed_type_with_avg_sub_agg(index: &Index) {
let agg_req = json!({
"my_texts": {
"terms": { "field": "json.mixed_type" },
@@ -262,6 +306,7 @@ fn range_agg_with_term_agg_many(index: &Index) {
});
execute_agg(index, agg_req);
}
fn histogram(index: &Index) {
let agg_req = json!({
"rangef64": {

View File

@@ -23,6 +23,16 @@ downcast-rs = "1.2.0"
proptest = "1"
more-asserts = "0.3.1"
rand = "0.8"
binggan = "0.8.1"
[[bench]]
name = "bench_merge"
harness = false
[[bench]]
name = "bench_access"
harness = false
[features]
unstable = []

View File

@@ -0,0 +1,67 @@
use binggan::{black_box, InputGroup};
use common::*;
use tantivy_columnar::Column;
pub mod common;
const NUM_DOCS: u32 = 2_000_000;
pub fn generate_columnar_and_open(card: Card, num_docs: u32) -> Column {
let reader = generate_columnar_with_name(card, num_docs, "price");
reader.read_columns("price").unwrap()[0]
.open_u64_lenient()
.unwrap()
.unwrap()
}
fn main() {
let mut inputs = Vec::new();
let mut add_card = |card1: Card| {
inputs.push((
format!("{card1}"),
generate_columnar_and_open(card1, NUM_DOCS),
));
};
add_card(Card::MultiSparse);
add_card(Card::Multi);
add_card(Card::Sparse);
add_card(Card::Dense);
add_card(Card::Full);
bench_group(InputGroup::new_with_inputs(inputs));
}
fn bench_group(mut runner: InputGroup<Column>) {
runner.register("access_values_for_doc", |column| {
let mut sum = 0;
for i in 0..NUM_DOCS {
for value in column.values_for_doc(i) {
sum += value;
}
}
black_box(sum);
});
runner.register("access_first_vals", |column| {
let mut sum = 0;
const BLOCK_SIZE: usize = 32;
let mut docs = vec![0; BLOCK_SIZE];
let mut buffer = vec![None; BLOCK_SIZE];
for i in (0..NUM_DOCS).step_by(BLOCK_SIZE) {
// fill docs
for idx in 0..BLOCK_SIZE {
docs[idx] = idx as u32 + i;
}
column.first_vals(&docs, &mut buffer);
for val in buffer.iter() {
let Some(val) = val else { continue };
sum += *val;
}
}
black_box(sum);
});
runner.run();
}

View File

@@ -31,7 +31,7 @@ fn get_test_columns() -> Columns {
}
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer
.serialize(data.len() as u32, None, &mut buffer)
.serialize(data.len() as u32, &mut buffer)
.unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();

View File

@@ -0,0 +1,49 @@
pub mod common;
use binggan::{black_box, BenchRunner};
use common::{generate_columnar_with_name, Card};
use tantivy_columnar::*;
const NUM_DOCS: u32 = 100_000;
fn main() {
let mut inputs = Vec::new();
let mut add_combo = |card1: Card, card2: Card| {
inputs.push((
format!("merge_{card1}_and_{card2}"),
vec![
generate_columnar_with_name(card1, NUM_DOCS, "price"),
generate_columnar_with_name(card2, NUM_DOCS, "price"),
],
));
};
add_combo(Card::Multi, Card::Multi);
add_combo(Card::MultiSparse, Card::MultiSparse);
add_combo(Card::Dense, Card::Dense);
add_combo(Card::Sparse, Card::Sparse);
add_combo(Card::Sparse, Card::Dense);
add_combo(Card::MultiSparse, Card::Dense);
add_combo(Card::MultiSparse, Card::Sparse);
add_combo(Card::Multi, Card::Dense);
add_combo(Card::Multi, Card::Sparse);
let runner: BenchRunner = BenchRunner::new();
let mut group = runner.new_group();
for (input_name, columnar_readers) in inputs.iter() {
group.register_with_input(
input_name,
columnar_readers,
move |columnar_readers: &Vec<ColumnarReader>| {
let mut out = Vec::new();
let columnar_readers = columnar_readers.iter().collect::<Vec<_>>();
let merge_row_order = StackMergeOrder::stack(&columnar_readers[..]);
merge_columnar(&columnar_readers, &[], merge_row_order.into(), &mut out).unwrap();
black_box(out);
},
);
}
group.run();
}

View File

@@ -0,0 +1,59 @@
extern crate tantivy_columnar;
use core::fmt;
use std::fmt::{Display, Formatter};
use tantivy_columnar::{ColumnarReader, ColumnarWriter};
pub enum Card {
MultiSparse,
Multi,
Sparse,
Dense,
Full,
}
impl Display for Card {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
match self {
Card::MultiSparse => write!(f, "multi sparse 1/13"),
Card::Multi => write!(f, "multi 2x"),
Card::Sparse => write!(f, "sparse 1/13"),
Card::Dense => write!(f, "dense 1/12"),
Card::Full => write!(f, "full"),
}
}
}
pub fn generate_columnar_with_name(card: Card, num_docs: u32, column_name: &str) -> ColumnarReader {
let mut columnar_writer = ColumnarWriter::default();
if let Card::MultiSparse = card {
columnar_writer.record_numerical(0, column_name, 10u64);
columnar_writer.record_numerical(0, column_name, 10u64);
}
for i in 0..num_docs {
match card {
Card::MultiSparse | Card::Sparse => {
if i % 13 == 0 {
columnar_writer.record_numerical(i, column_name, i as u64);
}
}
Card::Dense => {
if i % 12 == 0 {
columnar_writer.record_numerical(i, column_name, i as u64);
}
}
Card::Full => {
columnar_writer.record_numerical(i, column_name, i as u64);
}
Card::Multi => {
columnar_writer.record_numerical(i, column_name, i as u64);
columnar_writer.record_numerical(i, column_name, i as u64);
}
}
}
let mut wrt: Vec<u8> = Vec::new();
columnar_writer.serialize(num_docs, &mut wrt).unwrap();
ColumnarReader::open(wrt).unwrap()
}

Binary file not shown.

Binary file not shown.

View File

@@ -136,7 +136,7 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
.map(|value_row_id: RowId| self.values.get_val(value_row_id))
}
/// Get the docids of values which are in the provided value range.
/// Get the docids of values which are in the provided value and docid range.
#[inline]
pub fn get_docids_for_value_range(
&self,

View File

@@ -12,7 +12,7 @@ use crate::column_values::{
CodecType, MonotonicallyMappableToU128, MonotonicallyMappableToU64,
};
use crate::iterable::Iterable;
use crate::StrColumn;
use crate::{StrColumn, Version};
pub fn serialize_column_mappable_to_u128<T: MonotonicallyMappableToU128>(
column_index: SerializableColumnIndex<'_>,
@@ -40,25 +40,9 @@ pub fn serialize_column_mappable_to_u64<T: MonotonicallyMappableToU64>(
Ok(())
}
pub fn open_column_u64<T: MonotonicallyMappableToU64>(bytes: OwnedBytes) -> io::Result<Column<T>> {
let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
let column_index_num_bytes = u32::from_le_bytes(
column_index_num_bytes_payload
.as_slice()
.try_into()
.unwrap(),
);
let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
let column_index = crate::column_index::open_column_index(column_index_data)?;
let column_values = load_u64_based_column_values(column_values_data)?;
Ok(Column {
index: column_index,
values: column_values,
})
}
pub fn open_column_u128<T: MonotonicallyMappableToU128>(
pub fn open_column_u64<T: MonotonicallyMappableToU64>(
bytes: OwnedBytes,
format_version: Version,
) -> io::Result<Column<T>> {
let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
let column_index_num_bytes = u32::from_le_bytes(
@@ -68,7 +52,27 @@ pub fn open_column_u128<T: MonotonicallyMappableToU128>(
.unwrap(),
);
let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
let column_index = crate::column_index::open_column_index(column_index_data)?;
let column_index = crate::column_index::open_column_index(column_index_data, format_version)?;
let column_values = load_u64_based_column_values(column_values_data)?;
Ok(Column {
index: column_index,
values: column_values,
})
}
pub fn open_column_u128<T: MonotonicallyMappableToU128>(
bytes: OwnedBytes,
format_version: Version,
) -> io::Result<Column<T>> {
let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
let column_index_num_bytes = u32::from_le_bytes(
column_index_num_bytes_payload
.as_slice()
.try_into()
.unwrap(),
);
let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
let column_index = crate::column_index::open_column_index(column_index_data, format_version)?;
let column_values = crate::column_values::open_u128_mapped(column_values_data)?;
Ok(Column {
index: column_index,
@@ -79,7 +83,10 @@ pub fn open_column_u128<T: MonotonicallyMappableToU128>(
/// Open the column as u64.
///
/// See [`open_u128_as_compact_u64`] for more details.
pub fn open_column_u128_as_compact_u64(bytes: OwnedBytes) -> io::Result<Column<u64>> {
pub fn open_column_u128_as_compact_u64(
bytes: OwnedBytes,
format_version: Version,
) -> io::Result<Column<u64>> {
let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
let column_index_num_bytes = u32::from_le_bytes(
column_index_num_bytes_payload
@@ -88,7 +95,7 @@ pub fn open_column_u128_as_compact_u64(bytes: OwnedBytes) -> io::Result<Column<u
.unwrap(),
);
let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
let column_index = crate::column_index::open_column_index(column_index_data)?;
let column_index = crate::column_index::open_column_index(column_index_data, format_version)?;
let column_values = crate::column_values::open_u128_as_compact_u64(column_values_data)?;
Ok(Column {
index: column_index,
@@ -96,19 +103,19 @@ pub fn open_column_u128_as_compact_u64(bytes: OwnedBytes) -> io::Result<Column<u
})
}
pub fn open_column_bytes(data: OwnedBytes) -> io::Result<BytesColumn> {
pub fn open_column_bytes(data: OwnedBytes, format_version: Version) -> io::Result<BytesColumn> {
let (body, dictionary_len_bytes) = data.rsplit(4);
let dictionary_len = u32::from_le_bytes(dictionary_len_bytes.as_slice().try_into().unwrap());
let (dictionary_bytes, column_bytes) = body.split(dictionary_len as usize);
let dictionary = Arc::new(Dictionary::from_bytes(dictionary_bytes)?);
let term_ord_column = crate::column::open_column_u64::<u64>(column_bytes)?;
let term_ord_column = crate::column::open_column_u64::<u64>(column_bytes, format_version)?;
Ok(BytesColumn {
dictionary,
term_ord_column,
})
}
pub fn open_column_str(data: OwnedBytes) -> io::Result<StrColumn> {
let bytes_column = open_column_bytes(data)?;
pub fn open_column_str(data: OwnedBytes, format_version: Version) -> io::Result<StrColumn> {
let bytes_column = open_column_bytes(data, format_version)?;
Ok(StrColumn::wrap(bytes_column))
}

View File

@@ -95,8 +95,12 @@ pub fn merge_column_index<'a>(
#[cfg(test)]
mod tests {
use common::OwnedBytes;
use crate::column_index::merge::detect_cardinality;
use crate::column_index::multivalued_index::MultiValueIndex;
use crate::column_index::multivalued_index::{
open_multivalued_index, serialize_multivalued_index, MultiValueIndex,
};
use crate::column_index::{merge_column_index, OptionalIndex, SerializableColumnIndex};
use crate::{
Cardinality, ColumnIndex, MergeRowOrder, RowAddr, RowId, ShuffleMergeOrder, StackMergeOrder,
@@ -171,7 +175,11 @@ mod tests {
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
panic!("Excpected a multivalued index")
};
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
let mut output = Vec::new();
serialize_multivalued_index(&start_index_iterable, &mut output).unwrap();
let multivalue =
open_multivalued_index(OwnedBytes::new(output), crate::Version::V2).unwrap();
let start_indexes: Vec<RowId> = multivalue.get_start_index_column().iter().collect();
assert_eq!(&start_indexes, &[0, 3, 5]);
}
@@ -200,11 +208,16 @@ mod tests {
],
)
.into();
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
panic!("Excpected a multivalued index")
};
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
let mut output = Vec::new();
serialize_multivalued_index(&start_index_iterable, &mut output).unwrap();
let multivalue =
open_multivalued_index(OwnedBytes::new(output), crate::Version::V2).unwrap();
let start_indexes: Vec<RowId> = multivalue.get_start_index_column().iter().collect();
assert_eq!(&start_indexes, &[0, 3, 5, 6]);
}
}

View File

@@ -1,6 +1,8 @@
use std::iter;
use crate::column_index::{SerializableColumnIndex, Set};
use crate::column_index::{
SerializableColumnIndex, SerializableMultivalueIndex, SerializableOptionalIndex, Set,
};
use crate::iterable::Iterable;
use crate::{Cardinality, ColumnIndex, RowId, ShuffleMergeOrder};
@@ -14,15 +16,24 @@ pub fn merge_column_index_shuffled<'a>(
Cardinality::Optional => {
let non_null_row_ids =
merge_column_index_shuffled_optional(column_indexes, shuffle_merge_order);
SerializableColumnIndex::Optional {
SerializableColumnIndex::Optional(SerializableOptionalIndex {
non_null_row_ids,
num_rows: shuffle_merge_order.num_rows(),
}
})
}
Cardinality::Multivalued => {
let multivalue_start_index =
merge_column_index_shuffled_multivalued(column_indexes, shuffle_merge_order);
SerializableColumnIndex::Multivalued(multivalue_start_index)
let non_null_row_ids =
merge_column_index_shuffled_optional(column_indexes, shuffle_merge_order);
SerializableColumnIndex::Multivalued(SerializableMultivalueIndex {
doc_ids_with_values: SerializableOptionalIndex {
non_null_row_ids,
num_rows: shuffle_merge_order.num_rows(),
},
start_offsets: merge_column_index_shuffled_multivalued(
column_indexes,
shuffle_merge_order,
),
})
}
}
}
@@ -102,11 +113,18 @@ fn iter_num_values<'a>(
/// Transforms an iterator containing the number of vals per row (with `num_rows` elements)
/// into a `start_offset` iterator starting at 0 and (with `num_rows + 1` element)
///
/// This will filter values with 0 values as these are covered by the optional index in the
/// multivalue index.
fn integrate_num_vals(num_vals: impl Iterator<Item = u32>) -> impl Iterator<Item = RowId> {
iter::once(0u32).chain(num_vals.scan(0, |state, num_vals| {
*state += num_vals;
Some(*state)
}))
iter::once(0u32).chain(
num_vals
.filter(|num_vals| *num_vals != 0)
.scan(0, |state, num_vals| {
*state += num_vals;
Some(*state)
}),
)
}
impl<'a> Iterable<u32> for ShuffledMultivaluedIndex<'a> {
@@ -134,7 +152,7 @@ mod tests {
#[test]
fn test_integrate_num_vals_several() {
assert!(integrate_num_vals([3, 0, 10, 20].into_iter()).eq([0, 3, 3, 13, 33].into_iter()));
assert!(integrate_num_vals([3, 0, 10, 20].into_iter()).eq([0, 3, 13, 33].into_iter()));
}
#[test]
@@ -157,10 +175,10 @@ mod tests {
Cardinality::Optional,
&shuffle_merge_order,
);
let SerializableColumnIndex::Optional {
let SerializableColumnIndex::Optional(SerializableOptionalIndex {
non_null_row_ids,
num_rows,
} = serializable_index
}) = serializable_index
else {
panic!()
};

View File

@@ -1,6 +1,8 @@
use std::iter;
use std::ops::Range;
use crate::column_index::{SerializableColumnIndex, Set};
use crate::column_index::multivalued_index::{MultiValueIndex, SerializableMultivalueIndex};
use crate::column_index::serialize::SerializableOptionalIndex;
use crate::column_index::SerializableColumnIndex;
use crate::iterable::Iterable;
use crate::{Cardinality, ColumnIndex, RowId, StackMergeOrder};
@@ -15,23 +17,149 @@ pub fn merge_column_index_stacked<'a>(
) -> SerializableColumnIndex<'a> {
match cardinality_after_merge {
Cardinality::Full => SerializableColumnIndex::Full,
Cardinality::Optional => SerializableColumnIndex::Optional {
Cardinality::Optional => SerializableColumnIndex::Optional(SerializableOptionalIndex {
non_null_row_ids: Box::new(StackedOptionalIndex {
columns,
stack_merge_order,
}),
num_rows: stack_merge_order.num_rows(),
},
}),
Cardinality::Multivalued => {
let stacked_multivalued_index = StackedMultivaluedIndex {
columns,
stack_merge_order,
};
SerializableColumnIndex::Multivalued(Box::new(stacked_multivalued_index))
let serializable_multivalue_index =
make_serializable_multivalued_index(columns, stack_merge_order);
SerializableColumnIndex::Multivalued(serializable_multivalue_index)
}
}
}
struct StackedDocIdsWithValues<'a> {
column_indexes: &'a [ColumnIndex],
stack_merge_order: &'a StackMergeOrder,
}
impl Iterable<u32> for StackedDocIdsWithValues<'_> {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
Box::new((0..self.column_indexes.len()).flat_map(|i| {
let column_index = &self.column_indexes[i];
let doc_range = self.stack_merge_order.columnar_range(i);
get_doc_ids_with_values(column_index, doc_range)
}))
}
}
fn get_doc_ids_with_values<'a>(
column_index: &'a ColumnIndex,
doc_range: Range<u32>,
) -> Box<dyn Iterator<Item = u32> + 'a> {
match column_index {
ColumnIndex::Empty { .. } => Box::new(0..0),
ColumnIndex::Full => Box::new(doc_range),
ColumnIndex::Optional(optional_index) => Box::new(
optional_index
.iter_rows()
.map(move |row| row + doc_range.start),
),
ColumnIndex::Multivalued(multivalued_index) => match multivalued_index {
MultiValueIndex::MultiValueIndexV1(multivalued_index) => {
Box::new((0..multivalued_index.num_docs()).filter_map(move |docid| {
let range = multivalued_index.range(docid);
if range.is_empty() {
None
} else {
Some(docid + doc_range.start)
}
}))
}
MultiValueIndex::MultiValueIndexV2(multivalued_index) => Box::new(
multivalued_index
.optional_index
.iter_rows()
.map(move |row| row + doc_range.start),
),
},
}
}
fn stack_doc_ids_with_values<'a>(
column_indexes: &'a [ColumnIndex],
stack_merge_order: &'a StackMergeOrder,
) -> SerializableOptionalIndex<'a> {
let num_rows = stack_merge_order.num_rows();
SerializableOptionalIndex {
non_null_row_ids: Box::new(StackedDocIdsWithValues {
column_indexes,
stack_merge_order,
}),
num_rows,
}
}
struct StackedStartOffsets<'a> {
column_indexes: &'a [ColumnIndex],
stack_merge_order: &'a StackMergeOrder,
}
fn get_num_values_iterator<'a>(
column_index: &'a ColumnIndex,
num_docs: u32,
) -> Box<dyn Iterator<Item = u32> + 'a> {
match column_index {
ColumnIndex::Empty { .. } => Box::new(std::iter::empty()),
ColumnIndex::Full => Box::new(std::iter::repeat(1u32).take(num_docs as usize)),
ColumnIndex::Optional(optional_index) => {
Box::new(std::iter::repeat(1u32).take(optional_index.num_non_nulls() as usize))
}
ColumnIndex::Multivalued(multivalued_index) => Box::new(
multivalued_index
.get_start_index_column()
.iter()
.scan(0u32, |previous_start_offset, current_start_offset| {
let num_vals = current_start_offset - *previous_start_offset;
*previous_start_offset = current_start_offset;
Some(num_vals)
})
.skip(1),
),
}
}
impl<'a> Iterable<u32> for StackedStartOffsets<'a> {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
let num_values_it = (0..self.column_indexes.len()).flat_map(|columnar_id| {
let num_docs = self.stack_merge_order.columnar_range(columnar_id).len() as u32;
let column_index = &self.column_indexes[columnar_id];
get_num_values_iterator(column_index, num_docs)
});
Box::new(std::iter::once(0u32).chain(num_values_it.into_iter().scan(
0u32,
|cumulated, el| {
*cumulated += el;
Some(*cumulated)
},
)))
}
}
fn stack_start_offsets<'a>(
column_indexes: &'a [ColumnIndex],
stack_merge_order: &'a StackMergeOrder,
) -> Box<dyn Iterable<u32> + 'a> {
Box::new(StackedStartOffsets {
column_indexes,
stack_merge_order,
})
}
fn make_serializable_multivalued_index<'a>(
columns: &'a [ColumnIndex],
stack_merge_order: &'a StackMergeOrder,
) -> SerializableMultivalueIndex<'a> {
SerializableMultivalueIndex {
doc_ids_with_values: stack_doc_ids_with_values(columns, stack_merge_order),
start_offsets: stack_start_offsets(columns, stack_merge_order),
}
}
struct StackedOptionalIndex<'a> {
columns: &'a [ColumnIndex],
stack_merge_order: &'a StackMergeOrder,
@@ -62,87 +190,3 @@ impl<'a> Iterable<RowId> for StackedOptionalIndex<'a> {
)
}
}
#[derive(Clone, Copy)]
struct StackedMultivaluedIndex<'a> {
columns: &'a [ColumnIndex],
stack_merge_order: &'a StackMergeOrder,
}
fn convert_column_opt_to_multivalued_index<'a>(
column_index_opt: &'a ColumnIndex,
num_rows: RowId,
) -> Box<dyn Iterator<Item = RowId> + 'a> {
match column_index_opt {
ColumnIndex::Empty { .. } => Box::new(iter::repeat(0u32).take(num_rows as usize + 1)),
ColumnIndex::Full => Box::new(0..num_rows + 1),
ColumnIndex::Optional(optional_index) => {
Box::new(
(0..num_rows)
// TODO optimize
.map(|row_id| optional_index.rank(row_id))
.chain(std::iter::once(optional_index.num_non_nulls())),
)
}
ColumnIndex::Multivalued(multivalued_index) => multivalued_index.start_index_column.iter(),
}
}
impl<'a> Iterable<RowId> for StackedMultivaluedIndex<'a> {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = RowId> + '_> {
let multivalued_indexes =
self.columns
.iter()
.enumerate()
.map(|(columnar_id, column_opt)| {
let num_rows =
self.stack_merge_order.columnar_range(columnar_id).len() as RowId;
convert_column_opt_to_multivalued_index(column_opt, num_rows)
});
stack_multivalued_indexes(multivalued_indexes)
}
}
// Refactor me
fn stack_multivalued_indexes<'a>(
mut multivalued_indexes: impl Iterator<Item = Box<dyn Iterator<Item = RowId> + 'a>> + 'a,
) -> Box<dyn Iterator<Item = RowId> + 'a> {
let mut offset = 0;
let mut last_row_id = 0;
let mut current_it = multivalued_indexes.next();
Box::new(std::iter::from_fn(move || loop {
if let Some(row_id) = current_it.as_mut()?.next() {
last_row_id = offset + row_id;
return Some(last_row_id);
}
offset = last_row_id;
loop {
current_it = multivalued_indexes.next();
if current_it.as_mut()?.next().is_some() {
break;
}
}
}))
}
#[cfg(test)]
mod tests {
use crate::RowId;
fn it<'a>(row_ids: &'a [RowId]) -> Box<dyn Iterator<Item = RowId> + 'a> {
Box::new(row_ids.iter().copied())
}
#[test]
fn test_stack() {
let columns = [
it(&[0u32, 0u32]),
it(&[0u32, 1u32, 1u32, 4u32]),
it(&[0u32, 3u32, 5u32]),
it(&[0u32, 4u32]),
]
.into_iter();
let start_offsets: Vec<RowId> = super::stack_multivalued_indexes(columns).collect();
assert_eq!(start_offsets, &[0, 0, 1, 1, 4, 7, 9, 13]);
}
}

View File

@@ -11,8 +11,11 @@ mod serialize;
use std::ops::Range;
pub use merge::merge_column_index;
pub(crate) use multivalued_index::SerializableMultivalueIndex;
pub use optional_index::{OptionalIndex, Set};
pub use serialize::{open_column_index, serialize_column_index, SerializableColumnIndex};
pub use serialize::{
open_column_index, serialize_column_index, SerializableColumnIndex, SerializableOptionalIndex,
};
use crate::column_index::multivalued_index::MultiValueIndex;
use crate::{Cardinality, DocId, RowId};
@@ -131,15 +134,41 @@ impl ColumnIndex {
let row_end = optional_index.rank(doc_id_range.end);
row_start..row_end
}
ColumnIndex::Multivalued(multivalued_index) => {
let end_docid = doc_id_range.end.min(multivalued_index.num_docs() - 1) + 1;
let start_docid = doc_id_range.start.min(end_docid);
ColumnIndex::Multivalued(multivalued_index) => match multivalued_index {
MultiValueIndex::MultiValueIndexV1(index) => {
let row_start = index.start_index_column.get_val(doc_id_range.start);
let row_end = index.start_index_column.get_val(doc_id_range.end);
row_start..row_end
}
MultiValueIndex::MultiValueIndexV2(index) => {
// In this case we will use the optional_index select the next values
// that are valid. There are different cases to consider:
// Not exists below means does not exist in the optional
// index, because it has no values.
// * doc_id_range may cover a range of docids which are non existent
// => rank
// will give us the next document outside the range with a value. They both
// get the same rank and therefore return a zero range
//
// * doc_id_range.start and doc_id_range.end may not exist, but docids in
// between may have values
// => rank will give us the next document outside the range with a value.
//
// * doc_id_range.start may be not existent but doc_id_range.end may exist
// * doc_id_range.start may exist but doc_id_range.end may not exist
// * doc_id_range.start and doc_id_range.end may exist
// => rank on doc_id_range.end will give us the next value, which matches
// how the `start_index_column` works, so we get the value start of the next
// docid which we use to create the exclusive range.
//
let rank_start = index.optional_index.rank(doc_id_range.start);
let row_start = index.start_index_column.get_val(rank_start);
let rank_end = index.optional_index.rank(doc_id_range.end);
let row_end = index.start_index_column.get_val(rank_end);
let row_start = multivalued_index.start_index_column.get_val(start_docid);
let row_end = multivalued_index.start_index_column.get_val(end_docid);
row_start..row_end
}
row_start..row_end
}
},
}
}

View File

@@ -3,64 +3,98 @@ use std::io::Write;
use std::ops::Range;
use std::sync::Arc;
use common::OwnedBytes;
use common::{CountingWriter, OwnedBytes};
use super::optional_index::{open_optional_index, serialize_optional_index};
use super::{OptionalIndex, SerializableOptionalIndex, Set};
use crate::column_values::{
load_u64_based_column_values, serialize_u64_based_column_values, CodecType, ColumnValues,
};
use crate::iterable::Iterable;
use crate::{DocId, RowId};
use crate::{DocId, RowId, Version};
pub struct SerializableMultivalueIndex<'a> {
pub doc_ids_with_values: SerializableOptionalIndex<'a>,
pub start_offsets: Box<dyn Iterable<u32> + 'a>,
}
pub fn serialize_multivalued_index(
multivalued_index: &dyn Iterable<RowId>,
multivalued_index: &SerializableMultivalueIndex,
output: &mut impl Write,
) -> io::Result<()> {
let SerializableMultivalueIndex {
doc_ids_with_values,
start_offsets,
} = multivalued_index;
let mut count_writer = CountingWriter::wrap(output);
let SerializableOptionalIndex {
non_null_row_ids,
num_rows,
} = doc_ids_with_values;
serialize_optional_index(&**non_null_row_ids, *num_rows, &mut count_writer)?;
let optional_len = count_writer.written_bytes() as u32;
let output = count_writer.finish();
serialize_u64_based_column_values(
multivalued_index,
&**start_offsets,
&[CodecType::Bitpacked, CodecType::Linear],
output,
)?;
output.write_all(&optional_len.to_le_bytes())?;
Ok(())
}
pub fn open_multivalued_index(bytes: OwnedBytes) -> io::Result<MultiValueIndex> {
let start_index_column: Arc<dyn ColumnValues<RowId>> = load_u64_based_column_values(bytes)?;
Ok(MultiValueIndex { start_index_column })
pub fn open_multivalued_index(
bytes: OwnedBytes,
format_version: Version,
) -> io::Result<MultiValueIndex> {
match format_version {
Version::V1 => {
let start_index_column: Arc<dyn ColumnValues<RowId>> =
load_u64_based_column_values(bytes)?;
Ok(MultiValueIndex::MultiValueIndexV1(MultiValueIndexV1 {
start_index_column,
}))
}
Version::V2 => {
let (body_bytes, optional_index_len) = bytes.rsplit(4);
let optional_index_len =
u32::from_le_bytes(optional_index_len.as_slice().try_into().unwrap());
let (optional_index_bytes, start_index_bytes) =
body_bytes.split(optional_index_len as usize);
let optional_index = open_optional_index(optional_index_bytes)?;
let start_index_column: Arc<dyn ColumnValues<RowId>> =
load_u64_based_column_values(start_index_bytes)?;
Ok(MultiValueIndex::MultiValueIndexV2(MultiValueIndexV2 {
optional_index,
start_index_column,
}))
}
}
}
#[derive(Clone)]
/// Index to resolve value range for given doc_id.
/// Starts at 0.
pub struct MultiValueIndex {
pub enum MultiValueIndex {
MultiValueIndexV1(MultiValueIndexV1),
MultiValueIndexV2(MultiValueIndexV2),
}
#[derive(Clone)]
/// Index to resolve value range for given doc_id.
/// Starts at 0.
pub struct MultiValueIndexV1 {
pub start_index_column: Arc<dyn crate::ColumnValues<RowId>>,
}
impl std::fmt::Debug for MultiValueIndex {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
f.debug_struct("MultiValuedIndex")
.field("num_rows", &self.start_index_column.num_vals())
.finish_non_exhaustive()
}
}
impl From<Arc<dyn ColumnValues<RowId>>> for MultiValueIndex {
fn from(start_index_column: Arc<dyn ColumnValues<RowId>>) -> Self {
MultiValueIndex { start_index_column }
}
}
impl MultiValueIndex {
pub fn for_test(start_offsets: &[RowId]) -> MultiValueIndex {
let mut buffer = Vec::new();
serialize_multivalued_index(&start_offsets, &mut buffer).unwrap();
let bytes = OwnedBytes::new(buffer);
open_multivalued_index(bytes).unwrap()
}
impl MultiValueIndexV1 {
/// Returns `[start, end)`, such that the values associated with
/// the given document are `start..end`.
#[inline]
pub(crate) fn range(&self, doc_id: DocId) -> Range<RowId> {
if doc_id >= self.num_docs() {
return 0..0;
}
let start = self.start_index_column.get_val(doc_id);
let end = self.start_index_column.get_val(doc_id + 1);
start..end
@@ -83,7 +117,6 @@ impl MultiValueIndex {
///
/// TODO: Instead of a linear scan we can employ a exponential search into binary search to
/// match a docid to its value position.
#[allow(clippy::bool_to_int_with_if)]
pub(crate) fn select_batch_in_place(&self, docid_start: DocId, ranks: &mut Vec<u32>) {
if ranks.is_empty() {
return;
@@ -111,11 +144,170 @@ impl MultiValueIndex {
}
}
#[derive(Clone)]
/// Index to resolve value range for given doc_id.
/// Starts at 0.
pub struct MultiValueIndexV2 {
pub optional_index: OptionalIndex,
pub start_index_column: Arc<dyn crate::ColumnValues<RowId>>,
}
impl std::fmt::Debug for MultiValueIndex {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
let index = match self {
MultiValueIndex::MultiValueIndexV1(idx) => &idx.start_index_column,
MultiValueIndex::MultiValueIndexV2(idx) => &idx.start_index_column,
};
f.debug_struct("MultiValuedIndex")
.field("num_rows", &index.num_vals())
.finish_non_exhaustive()
}
}
impl MultiValueIndex {
pub fn for_test(start_offsets: &[RowId]) -> MultiValueIndex {
assert!(!start_offsets.is_empty());
assert_eq!(start_offsets[0], 0);
let mut doc_with_values = Vec::new();
let mut compact_start_offsets: Vec<u32> = vec![0];
for doc in 0..start_offsets.len() - 1 {
if start_offsets[doc] < start_offsets[doc + 1] {
doc_with_values.push(doc as RowId);
compact_start_offsets.push(start_offsets[doc + 1]);
}
}
let serializable_multivalued_index = SerializableMultivalueIndex {
doc_ids_with_values: SerializableOptionalIndex {
non_null_row_ids: Box::new(&doc_with_values[..]),
num_rows: start_offsets.len() as u32 - 1,
},
start_offsets: Box::new(&compact_start_offsets[..]),
};
let mut buffer = Vec::new();
serialize_multivalued_index(&serializable_multivalued_index, &mut buffer).unwrap();
let bytes = OwnedBytes::new(buffer);
open_multivalued_index(bytes, Version::V2).unwrap()
}
pub fn get_start_index_column(&self) -> &Arc<dyn crate::ColumnValues<RowId>> {
match self {
MultiValueIndex::MultiValueIndexV1(idx) => &idx.start_index_column,
MultiValueIndex::MultiValueIndexV2(idx) => &idx.start_index_column,
}
}
/// Returns `[start, end)` values range, such that the values associated with
/// the given document are `start..end`.
#[inline]
pub(crate) fn range(&self, doc_id: DocId) -> Range<RowId> {
match self {
MultiValueIndex::MultiValueIndexV1(idx) => idx.range(doc_id),
MultiValueIndex::MultiValueIndexV2(idx) => idx.range(doc_id),
}
}
/// Returns the number of documents in the index.
#[inline]
pub fn num_docs(&self) -> u32 {
match self {
MultiValueIndex::MultiValueIndexV1(idx) => idx.start_index_column.num_vals() - 1,
MultiValueIndex::MultiValueIndexV2(idx) => idx.optional_index.num_docs(),
}
}
/// Converts a list of ranks (row ids of values) in a 1:n index to the corresponding list of
/// docids. Positions are converted inplace to docids.
///
/// Since there is no index for value pos -> docid, but docid -> value pos range, we scan the
/// index.
///
/// Correctness: positions needs to be sorted. idx_reader needs to contain monotonically
/// increasing positions.
///
/// TODO: Instead of a linear scan we can employ a exponential search into binary search to
/// match a docid to its value position.
pub(crate) fn select_batch_in_place(&self, docid_start: DocId, ranks: &mut Vec<u32>) {
match self {
MultiValueIndex::MultiValueIndexV1(idx) => {
idx.select_batch_in_place(docid_start, ranks)
}
MultiValueIndex::MultiValueIndexV2(idx) => {
idx.select_batch_in_place(docid_start, ranks)
}
}
}
}
impl MultiValueIndexV2 {
/// Returns `[start, end)`, such that the values associated with
/// the given document are `start..end`.
#[inline]
pub(crate) fn range(&self, doc_id: DocId) -> Range<RowId> {
let Some(rank) = self.optional_index.rank_if_exists(doc_id) else {
return 0..0;
};
let start = self.start_index_column.get_val(rank);
let end = self.start_index_column.get_val(rank + 1);
start..end
}
/// Returns the number of documents in the index.
#[inline]
pub fn num_docs(&self) -> u32 {
self.optional_index.num_docs()
}
/// Converts a list of ranks (row ids of values) in a 1:n index to the corresponding list of
/// doc ids. The ranks are converted in place to doc ids.
///
/// Since there is no index from value position to doc id, only from doc id to value position
/// range, we scan the index.
///
/// Correctness: `ranks` must be sorted, and the index must contain monotonically increasing
/// positions.
///
/// TODO: Instead of a linear scan we could use an exponential search followed by a binary
/// search to match a doc id to its value position.
pub(crate) fn select_batch_in_place(&self, docid_start: DocId, ranks: &mut Vec<u32>) {
if ranks.is_empty() {
return;
}
let mut cur_pos_in_idx = self.optional_index.rank(docid_start);
let mut last_doc = None;
assert!(cur_pos_in_idx <= ranks[0]);
let mut write_doc_pos = 0;
for i in 0..ranks.len() {
let pos = ranks[i];
loop {
let end = self.start_index_column.get_val(cur_pos_in_idx + 1);
if end > pos {
ranks[write_doc_pos] = cur_pos_in_idx;
write_doc_pos += if last_doc == Some(cur_pos_in_idx) {
0
} else {
1
};
last_doc = Some(cur_pos_in_idx);
break;
}
cur_pos_in_idx += 1;
}
}
ranks.truncate(write_doc_pos);
for rank in ranks.iter_mut() {
*rank = self.optional_index.select(*rank);
}
}
}
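To make the rank-to-doc-id conversion concrete, a crate-internal sketch reusing the hypothetical offsets from the sketch above:

#[test]
fn select_batch_in_place_sketch() {
    // Value rows 0..2 belong to doc 0, value rows 2..5 to doc 2; doc 1 is empty.
    let index = MultiValueIndex::for_test(&[0, 2, 2, 5]);
    let mut ranks = vec![1, 3, 4];
    index.select_batch_in_place(0, &mut ranks);
    // Value rows 1, 3 and 4 live in docs 0, 2 and 2; the duplicate doc is collapsed.
    assert_eq!(ranks, vec![0, 2]);
}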
#[cfg(test)]
mod tests {
use std::ops::Range;
use super::MultiValueIndex;
use crate::{ColumnarReader, DynamicColumn};
fn index_to_pos_helper(
index: &MultiValueIndex,
@@ -134,6 +326,7 @@ mod tests {
let positions = &[10u32, 11, 15, 20, 21, 22];
assert_eq!(index_to_pos_helper(&index, 0..5, positions), vec![1, 3, 4]);
assert_eq!(index_to_pos_helper(&index, 1..5, positions), vec![1, 3, 4]);
assert_eq!(index_to_pos_helper(&index, 0..5, &[9]), vec![0]);
assert_eq!(index_to_pos_helper(&index, 1..5, &[10]), vec![1]);
assert_eq!(index_to_pos_helper(&index, 1..5, &[11]), vec![1]);
@@ -141,4 +334,67 @@ mod tests {
assert_eq!(index_to_pos_helper(&index, 2..5, &[12, 14]), vec![2]);
assert_eq!(index_to_pos_helper(&index, 2..5, &[12, 14, 15]), vec![2, 3]);
}
#[test]
fn test_range_to_rowids() {
use crate::ColumnarWriter;
let mut columnar_writer = ColumnarWriter::default();
// This column gets coerced to u64
columnar_writer.record_numerical(1, "full", u64::MAX);
columnar_writer.record_numerical(1, "full", u64::MAX);
columnar_writer.record_numerical(5, "full", u64::MAX);
columnar_writer.record_numerical(5, "full", u64::MAX);
let mut wrt: Vec<u8> = Vec::new();
columnar_writer.serialize(7, &mut wrt).unwrap();
let reader = ColumnarReader::open(wrt).unwrap();
// Open the column as u64
let column = reader.read_columns("full").unwrap()[0]
.open()
.unwrap()
.coerce_numerical(crate::NumericalType::U64)
.unwrap();
let DynamicColumn::U64(column) = column else {
panic!();
};
let row_id_range = column.index.docid_range_to_rowids(1..2);
assert_eq!(row_id_range, 0..2);
let row_id_range = column.index.docid_range_to_rowids(0..2);
assert_eq!(row_id_range, 0..2);
let row_id_range = column.index.docid_range_to_rowids(0..4);
assert_eq!(row_id_range, 0..2);
let row_id_range = column.index.docid_range_to_rowids(3..4);
assert_eq!(row_id_range, 2..2);
let row_id_range = column.index.docid_range_to_rowids(1..6);
assert_eq!(row_id_range, 0..4);
let row_id_range = column.index.docid_range_to_rowids(3..6);
assert_eq!(row_id_range, 2..4);
let row_id_range = column.index.docid_range_to_rowids(0..6);
assert_eq!(row_id_range, 0..4);
let row_id_range = column.index.docid_range_to_rowids(0..6);
assert_eq!(row_id_range, 0..4);
let check = |range, expected| {
let full_range = 0..=u64::MAX;
let mut docids = Vec::new();
column.get_docids_for_value_range(full_range, range, &mut docids);
assert_eq!(docids, expected);
};
// check(0..1, vec![]);
// check(0..2, vec![1]);
check(1..2, vec![1]);
}
}

View File

@@ -86,8 +86,14 @@ pub struct OptionalIndex {
block_metas: Arc<[BlockMeta]>,
}
impl<'a> Iterable<u32> for &'a OptionalIndex {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
Box::new(self.iter_rows())
}
}
impl std::fmt::Debug for OptionalIndex {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
f.debug_struct("OptionalIndex")
.field("num_rows", &self.num_rows)
.field("num_non_null_rows", &self.num_non_null_rows)
@@ -196,6 +202,7 @@ impl Set<RowId> for OptionalIndex {
} = row_addr_from_row_id(doc_id);
let block_meta = self.block_metas[block_id as usize];
let block = self.block(block_meta);
let block_offset_row_id = match block {
Block::Dense(dense_block) => dense_block.rank(in_block_row_id),
Block::Sparse(sparse_block) => sparse_block.rank(in_block_row_id),

View File

@@ -28,10 +28,11 @@ pub trait Set<T> {
/// Returns true if the element is contained in the set
fn contains(&self, el: T) -> bool;
/// Returns the number of rows in the set that are < `el`
/// Returns the element's rank (its position in the set).
/// If the set does not contain the element, it will return the next existing element's rank.
fn rank(&self, el: T) -> T;
/// If the set contains `el` returns the element rank.
/// If the set contains `el`, returns the element's rank (its position in the set).
/// If the set does not contain the element, it returns `None`.
fn rank_if_exists(&self, el: T) -> Option<T>;
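To illustrate the contract, a sketch reusing the `DenseBlockCodec` test helpers exercised in the codec tests further below; the element values are hypothetical:

let mut buffer = Vec::new();
DenseBlockCodec::serialize([1u16, 10].iter().copied(), &mut buffer).unwrap();
let set = DenseBlockCodec::open(buffer.as_slice());
assert_eq!(set.rank(5), 1); // 5 is absent; the next existing element, 10, has rank 1
assert_eq!(set.rank_if_exists(5), None);
assert_eq!(set.rank_if_exists(10), Some(1));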

View File

@@ -22,8 +22,8 @@ fn test_set_helper<C: SetCodec<Item = u16>>(vals: &[u16]) -> usize {
vals.iter().cloned().take_while(|v| *v < val).count() as u16
);
}
for rank in 0..vals.len() {
assert_eq!(tested_set.select(rank as u16), vals[rank]);
for (rank, val) in vals.iter().enumerate() {
assert_eq!(tested_set.select(rank as u16), *val);
}
buffer.len()
}
@@ -107,3 +107,41 @@ fn test_simple_translate_codec_idx_to_original_idx_dense() {
assert_eq!(i, select_cursor.select(i));
}
}
#[test]
fn test_simple_translate_idx_to_value_idx_dense() {
let mut buffer = Vec::new();
DenseBlockCodec::serialize([1, 10].iter().copied(), &mut buffer).unwrap();
let tested_set = DenseBlockCodec::open(buffer.as_slice());
assert!(tested_set.contains(1));
assert!(!tested_set.contains(2));
assert_eq!(tested_set.rank(0), 0);
assert_eq!(tested_set.rank(1), 0);
for rank in 2..10 {
// elements not in the set: `rank` returns the next existing element's rank
assert_eq!(tested_set.rank_if_exists(rank), None);
assert_eq!(tested_set.rank(rank), 1);
}
assert_eq!(tested_set.rank(10), 1);
}
#[test]
fn test_simple_translate_idx_to_value_idx_sparse() {
let mut buffer = Vec::new();
SparseBlockCodec::serialize([1, 10].iter().copied(), &mut buffer).unwrap();
let tested_set = SparseBlockCodec::open(buffer.as_slice());
assert!(tested_set.contains(1));
assert!(!tested_set.contains(2));
assert_eq!(tested_set.rank(0), 0);
assert_eq!(tested_set.select(tested_set.rank(0)), 1);
assert_eq!(tested_set.rank(1), 0);
assert_eq!(tested_set.select(tested_set.rank(1)), 1);
for rank in 2..10 {
// elements not in the set: `rank` returns the next existing element's rank, so select() yields 10
assert_eq!(tested_set.rank_if_exists(rank), None);
assert_eq!(tested_set.rank(rank), 1);
assert_eq!(tested_set.select(tested_set.rank(rank)), 10);
}
assert_eq!(tested_set.rank(10), 1);
assert_eq!(tested_set.select(tested_set.rank(10)), 10);
}

View File

@@ -15,9 +15,7 @@ fn test_optional_index_with_num_docs(num_docs: u32) {
let mut dataframe_writer = ColumnarWriter::default();
dataframe_writer.record_numerical(100, "score", 80i64);
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer
.serialize(num_docs, None, &mut buffer)
.unwrap();
dataframe_writer.serialize(num_docs, &mut buffer).unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("score").unwrap();

View File

@@ -3,28 +3,39 @@ use std::io::Write;
use common::{CountingWriter, OwnedBytes};
use super::multivalued_index::SerializableMultivalueIndex;
use super::OptionalIndex;
use crate::column_index::multivalued_index::serialize_multivalued_index;
use crate::column_index::optional_index::serialize_optional_index;
use crate::column_index::ColumnIndex;
use crate::iterable::Iterable;
use crate::{Cardinality, RowId};
use crate::{Cardinality, RowId, Version};
pub struct SerializableOptionalIndex<'a> {
pub non_null_row_ids: Box<dyn Iterable<RowId> + 'a>,
pub num_rows: RowId,
}
impl<'a> From<&'a OptionalIndex> for SerializableOptionalIndex<'a> {
fn from(optional_index: &'a OptionalIndex) -> Self {
SerializableOptionalIndex {
non_null_row_ids: Box::new(optional_index),
num_rows: optional_index.num_docs(),
}
}
}
pub enum SerializableColumnIndex<'a> {
Full,
Optional {
non_null_row_ids: Box<dyn Iterable<RowId> + 'a>,
num_rows: RowId,
},
// TODO remove the Arc<dyn> apart from serialization this is not
// dynamic at all.
Multivalued(Box<dyn Iterable<RowId> + 'a>),
Optional(SerializableOptionalIndex<'a>),
Multivalued(SerializableMultivalueIndex<'a>),
}
impl<'a> SerializableColumnIndex<'a> {
pub fn get_cardinality(&self) -> Cardinality {
match self {
SerializableColumnIndex::Full => Cardinality::Full,
SerializableColumnIndex::Optional { .. } => Cardinality::Optional,
SerializableColumnIndex::Optional(_) => Cardinality::Optional,
SerializableColumnIndex::Multivalued(_) => Cardinality::Multivalued,
}
}
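A sketch of how the new `Optional` variant is assembled (hypothetical row ids; `&[RowId]` implements `Iterable<RowId>`):

let non_null_rows: Vec<RowId> = vec![0, 2, 5];
let column_index = SerializableColumnIndex::Optional(SerializableOptionalIndex {
    non_null_row_ids: Box::new(&non_null_rows[..]),
    num_rows: 6,
});
assert_eq!(column_index.get_cardinality(), Cardinality::Optional);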
@@ -40,12 +51,12 @@ pub fn serialize_column_index(
output.write_all(&[cardinality])?;
match column_index {
SerializableColumnIndex::Full => {}
SerializableColumnIndex::Optional {
SerializableColumnIndex::Optional(SerializableOptionalIndex {
non_null_row_ids,
num_rows,
} => serialize_optional_index(non_null_row_ids.as_ref(), num_rows, &mut output)?,
}) => serialize_optional_index(non_null_row_ids.as_ref(), num_rows, &mut output)?,
SerializableColumnIndex::Multivalued(multivalued_index) => {
serialize_multivalued_index(&*multivalued_index, &mut output)?
serialize_multivalued_index(&multivalued_index, &mut output)?
}
}
let column_index_num_bytes = output.written_bytes() as u32;
@@ -53,7 +64,10 @@ pub fn serialize_column_index(
}
/// Open a serialized column index.
pub fn open_column_index(mut bytes: OwnedBytes) -> io::Result<ColumnIndex> {
pub fn open_column_index(
mut bytes: OwnedBytes,
format_version: Version,
) -> io::Result<ColumnIndex> {
if bytes.is_empty() {
return Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
@@ -70,7 +84,8 @@ pub fn open_column_index(mut bytes: OwnedBytes) -> io::Result<ColumnIndex> {
Ok(ColumnIndex::Optional(optional_index))
}
Cardinality::Multivalued => {
let multivalue_index = super::multivalued_index::open_multivalued_index(bytes)?;
let multivalue_index =
super::multivalued_index::open_multivalued_index(bytes, format_version)?;
Ok(ColumnIndex::Multivalued(multivalue_index))
}
}
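Callers now thread the footer's format version through when opening; a minimal sketch, assuming `buffer` holds bytes written by `serialize_column_index` in the current format:

let column_index: ColumnIndex = open_column_index(OwnedBytes::new(buffer), Version::V2)?;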

View File

@@ -34,6 +34,7 @@ fn compute_stats(vals: impl Iterator<Item = u64>) -> ColumnStats {
fn value_iter() -> impl Iterator<Item = u64> {
0..20_000
}
fn get_reader_for_bench<Codec: ColumnCodec>(data: &[u64]) -> Codec::ColumnValues {
let mut bytes = Vec::new();
let stats = compute_stats(data.iter().cloned());
@@ -41,10 +42,13 @@ fn get_reader_for_bench<Codec: ColumnCodec>(data: &[u64]) -> Codec::ColumnValues
for val in data {
codec_serializer.collect(*val);
}
codec_serializer.serialize(&stats, Box::new(data.iter().copied()).as_mut(), &mut bytes);
codec_serializer
.serialize(&stats, Box::new(data.iter().copied()).as_mut(), &mut bytes)
.unwrap();
Codec::load(OwnedBytes::new(bytes)).unwrap()
}
fn bench_get<Codec: ColumnCodec>(b: &mut Bencher, data: &[u64]) {
let col = get_reader_for_bench::<Codec>(data);
b.iter(|| {

View File

@@ -1,3 +1,6 @@
use core::fmt;
use std::fmt::{Display, Formatter};
use crate::InvalidData;
pub const VERSION_FOOTER_NUM_BYTES: usize = MAGIC_BYTES.len() + std::mem::size_of::<u32>();
@@ -8,7 +11,7 @@ const MAGIC_BYTES: [u8; 4] = [2, 113, 119, 66];
pub fn footer() -> [u8; VERSION_FOOTER_NUM_BYTES] {
let mut footer_bytes = [0u8; VERSION_FOOTER_NUM_BYTES];
footer_bytes[0..4].copy_from_slice(&Version::V1.to_bytes());
footer_bytes[0..4].copy_from_slice(&CURRENT_VERSION.to_bytes());
footer_bytes[4..8].copy_from_slice(&MAGIC_BYTES[..]);
footer_bytes
}
@@ -20,10 +23,22 @@ pub fn parse_footer(footer_bytes: [u8; VERSION_FOOTER_NUM_BYTES]) -> Result<Vers
Version::try_from_bytes(footer_bytes[0..4].try_into().unwrap())
}
pub const CURRENT_VERSION: Version = Version::V2;
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
#[repr(u32)]
pub enum Version {
V1 = 1u32,
V2 = 2u32,
}
impl Display for Version {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
match self {
Version::V1 => write!(f, "v1"),
Version::V2 => write!(f, "v2"),
}
}
}
impl Version {
@@ -35,6 +50,7 @@ impl Version {
let code = u32::from_le_bytes(bytes);
match code {
1u32 => Ok(Version::V1),
2u32 => Ok(Version::V2),
_ => Err(InvalidData),
}
}
@@ -47,9 +63,9 @@ mod tests {
use super::*;
#[test]
fn test_footer_dserialization() {
fn test_footer_deserialization() {
let parsed_version: Version = parse_footer(footer()).unwrap();
assert_eq!(Version::V1, parsed_version);
assert_eq!(Version::V2, parsed_version);
}
#[test]
@@ -63,11 +79,10 @@ mod tests {
for &i in &version_to_tests {
let version_res = Version::try_from_bytes(i.to_le_bytes());
if let Ok(version) = version_res {
assert_eq!(version, Version::V1);
assert_eq!(version.to_bytes(), i.to_le_bytes());
valid_versions.insert(i);
}
}
assert_eq!(valid_versions.len(), 1);
assert_eq!(valid_versions.len(), 2);
}
}
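For reference, a sketch of the footer layout implied by the code above (a little-endian version code followed by the magic bytes):

let footer_bytes = footer();
assert_eq!(&footer_bytes[0..4], &2u32.to_le_bytes()); // CURRENT_VERSION == Version::V2
assert_eq!(&footer_bytes[4..8], &[2, 113, 119, 66]); // MAGIC_BYTES
assert_eq!(parse_footer(footer_bytes).unwrap(), Version::V2);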

View File

@@ -7,7 +7,6 @@ use std::io;
use std::net::Ipv6Addr;
use std::sync::Arc;
use itertools::Itertools;
pub use merge_mapping::{MergeRowOrder, ShuffleMergeOrder, StackMergeOrder};
use super::writer::ColumnarSerializer;
@@ -371,20 +370,8 @@ fn is_empty_after_merge(
true
}
ColumnIndex::Multivalued(multivalued_index) => {
for (doc_id, (start_index, end_index)) in multivalued_index
.start_index_column
.iter()
.tuple_windows()
.enumerate()
{
let doc_id = doc_id as u32;
if start_index == end_index {
// There are no values in this document
continue;
}
// The document contains values and is present in the alive bitset.
// The column is therefore not empty.
if alive_bitset.contains(doc_id) {
for alive_docid in alive_bitset.iter() {
if !multivalued_index.range(alive_docid).is_empty() {
return false;
}
}

View File

@@ -1,3 +1,5 @@
use itertools::Itertools;
use super::*;
use crate::{Cardinality, ColumnarWriter, HasAssociatedColumnType, RowId};
@@ -12,7 +14,7 @@ fn make_columnar<T: Into<NumericalValue> + HasAssociatedColumnType + Copy>(
}
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer
.serialize(vals.len() as RowId, None, &mut buffer)
.serialize(vals.len() as RowId, &mut buffer)
.unwrap();
ColumnarReader::open(buffer).unwrap()
}
@@ -157,9 +159,7 @@ fn make_numerical_columnar_multiple_columns(
.max()
.unwrap_or(0u32);
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer
.serialize(num_rows, None, &mut buffer)
.unwrap();
dataframe_writer.serialize(num_rows, &mut buffer).unwrap();
ColumnarReader::open(buffer).unwrap()
}
@@ -182,9 +182,7 @@ fn make_byte_columnar_multiple_columns(
}
}
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer
.serialize(num_rows, None, &mut buffer)
.unwrap();
dataframe_writer.serialize(num_rows, &mut buffer).unwrap();
ColumnarReader::open(buffer).unwrap()
}
@@ -203,9 +201,7 @@ fn make_text_columnar_multiple_columns(columns: &[(&str, &[&[&str]])]) -> Column
.max()
.unwrap_or(0u32);
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer
.serialize(num_rows, None, &mut buffer)
.unwrap();
dataframe_writer.serialize(num_rows, &mut buffer).unwrap();
ColumnarReader::open(buffer).unwrap()
}

View File

@@ -5,6 +5,7 @@ mod reader;
mod writer;
pub use column_type::{ColumnType, HasAssociatedColumnType};
pub use format_version::{Version, CURRENT_VERSION};
#[cfg(test)]
pub(crate) use merge::ColumnTypeCategory;
pub use merge::{merge_columnar, MergeRowOrder, ShuffleMergeOrder, StackMergeOrder};

View File

@@ -6,7 +6,7 @@ use sstable::{Dictionary, RangeSSTable};
use crate::columnar::{format_version, ColumnType};
use crate::dynamic_column::DynamicColumnHandle;
use crate::RowId;
use crate::{RowId, Version};
fn io_invalid_data(msg: String) -> io::Error {
io::Error::new(io::ErrorKind::InvalidData, msg)
@@ -19,6 +19,7 @@ pub struct ColumnarReader {
column_dictionary: Dictionary<RangeSSTable>,
column_data: FileSlice,
num_rows: RowId,
format_version: Version,
}
impl fmt::Debug for ColumnarReader {
@@ -53,6 +54,7 @@ impl fmt::Debug for ColumnarReader {
fn read_all_columns_in_stream(
mut stream: sstable::Streamer<'_, RangeSSTable>,
column_data: &FileSlice,
format_version: Version,
) -> io::Result<Vec<DynamicColumnHandle>> {
let mut results = Vec::new();
while stream.advance() {
@@ -67,6 +69,7 @@ fn read_all_columns_in_stream(
let dynamic_column_handle = DynamicColumnHandle {
file_slice,
column_type,
format_version,
};
results.push(dynamic_column_handle);
}
@@ -88,7 +91,7 @@ impl ColumnarReader {
let num_rows = u32::deserialize(&mut &footer_bytes[8..12])?;
let version_footer_bytes: [u8; format_version::VERSION_FOOTER_NUM_BYTES] =
footer_bytes[12..].try_into().unwrap();
let _version = format_version::parse_footer(version_footer_bytes)?;
let format_version = format_version::parse_footer(version_footer_bytes)?;
let (column_data, sstable) =
file_slice_without_sstable_len.split_from_end(sstable_len as usize);
let column_dictionary = Dictionary::open(sstable)?;
@@ -96,6 +99,7 @@ impl ColumnarReader {
column_dictionary,
column_data,
num_rows,
format_version,
})
}
@@ -126,6 +130,7 @@ impl ColumnarReader {
let column_handle = DynamicColumnHandle {
file_slice,
column_type,
format_version: self.format_version,
};
Some((column_name, column_handle))
} else {
@@ -167,7 +172,7 @@ impl ColumnarReader {
.stream_for_column_range(column_name)
.into_stream_async()
.await?;
read_all_columns_in_stream(stream, &self.column_data)
read_all_columns_in_stream(stream, &self.column_data, self.format_version)
}
/// Get all columns for the given column name.
@@ -176,7 +181,7 @@ impl ColumnarReader {
/// different types.
pub fn read_columns(&self, column_name: &str) -> io::Result<Vec<DynamicColumnHandle>> {
let stream = self.stream_for_column_range(column_name).into_stream()?;
read_all_columns_in_stream(stream, &self.column_data)
read_all_columns_in_stream(stream, &self.column_data, self.format_version)
}
/// Return the number of columns in the columnar.
@@ -195,7 +200,7 @@ mod tests {
columnar_writer.record_column_type("col1", ColumnType::Str, false);
columnar_writer.record_column_type("col2", ColumnType::U64, false);
let mut buffer = Vec::new();
columnar_writer.serialize(1, None, &mut buffer).unwrap();
columnar_writer.serialize(1, &mut buffer).unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();
let columns = columnar.list_columns().unwrap();
assert_eq!(columns.len(), 2);
@@ -211,7 +216,7 @@ mod tests {
columnar_writer.record_column_type("count", ColumnType::U64, false);
columnar_writer.record_numerical(1, "count", 1u64);
let mut buffer = Vec::new();
columnar_writer.serialize(2, None, &mut buffer).unwrap();
columnar_writer.serialize(2, &mut buffer).unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();
let columns = columnar.list_columns().unwrap();
assert_eq!(columns.len(), 1);

View File

@@ -41,31 +41,10 @@ impl ColumnWriter {
pub(super) fn operation_iterator<'a, V: SymbolValue>(
&self,
arena: &MemoryArena,
old_to_new_ids_opt: Option<&[RowId]>,
buffer: &'a mut Vec<u8>,
) -> impl Iterator<Item = ColumnOperation<V>> + 'a {
buffer.clear();
self.values.read_to_end(arena, buffer);
if let Some(old_to_new_ids) = old_to_new_ids_opt {
// TODO avoid the extra deserialization / serialization.
let mut sorted_ops: Vec<(RowId, ColumnOperation<V>)> = Vec::new();
let mut new_doc = 0u32;
let mut cursor = &buffer[..];
for op in std::iter::from_fn(|| ColumnOperation::<V>::deserialize(&mut cursor)) {
if let ColumnOperation::NewDoc(doc) = &op {
new_doc = old_to_new_ids[*doc as usize];
sorted_ops.push((new_doc, ColumnOperation::NewDoc(new_doc)));
} else {
sorted_ops.push((new_doc, op));
}
}
// stable sort is crucial here.
sorted_ops.sort_by_key(|(new_doc_id, _)| *new_doc_id);
buffer.clear();
for (_, op) in sorted_ops {
buffer.extend_from_slice(op.serialize().as_ref());
}
}
let mut cursor: &[u8] = &buffer[..];
std::iter::from_fn(move || ColumnOperation::deserialize(&mut cursor))
}
@@ -231,11 +210,9 @@ impl NumericalColumnWriter {
pub(super) fn operation_iterator<'a>(
self,
arena: &MemoryArena,
old_to_new_ids: Option<&[RowId]>,
buffer: &'a mut Vec<u8>,
) -> impl Iterator<Item = ColumnOperation<NumericalValue>> + 'a {
self.column_writer
.operation_iterator(arena, old_to_new_ids, buffer)
self.column_writer.operation_iterator(arena, buffer)
}
}
@@ -277,11 +254,9 @@ impl StrOrBytesColumnWriter {
pub(super) fn operation_iterator<'a>(
&self,
arena: &MemoryArena,
old_to_new_ids: Option<&[RowId]>,
byte_buffer: &'a mut Vec<u8>,
) -> impl Iterator<Item = ColumnOperation<UnorderedId>> + 'a {
self.column_writer
.operation_iterator(arena, old_to_new_ids, byte_buffer)
self.column_writer.operation_iterator(arena, byte_buffer)
}
}

View File

@@ -8,11 +8,12 @@ use std::net::Ipv6Addr;
use column_operation::ColumnOperation;
pub(crate) use column_writers::CompatibleNumericalTypes;
use common::json_path_writer::JSON_END_OF_PATH;
use common::CountingWriter;
pub(crate) use serializer::ColumnarSerializer;
use stacker::{Addr, ArenaHashMap, MemoryArena};
use crate::column_index::SerializableColumnIndex;
use crate::column_index::{SerializableColumnIndex, SerializableOptionalIndex};
use crate::column_values::{MonotonicallyMappableToU128, MonotonicallyMappableToU64};
use crate::columnar::column_type::ColumnType;
use crate::columnar::writer::column_writers::{
@@ -43,7 +44,7 @@ struct SpareBuffers {
/// columnar_writer.record_str(1u32 /* doc id */, "product_name", "Apple");
/// columnar_writer.record_numerical(0u32 /* doc id */, "price", 10.5f64); //< uh oh we ended up mixing integers and floats.
/// let mut wrt: Vec<u8> = Vec::new();
/// columnar_writer.serialize(2u32, None, &mut wrt).unwrap();
/// columnar_writer.serialize(2u32, &mut wrt).unwrap();
/// ```
#[derive(Default)]
pub struct ColumnarWriter {
@@ -75,63 +76,6 @@ impl ColumnarWriter {
.sum::<usize>()
}
/// Returns the list of doc ids from 0..num_docs sorted by the `sort_field`
/// column.
///
/// If the column is multivalued, use the first value for scoring.
/// If no value is associated to a specific row, the document is assigned
/// the lowest possible score.
///
/// The sort applied is stable.
pub fn sort_order(&self, sort_field: &str, num_docs: RowId, reversed: bool) -> Vec<u32> {
let Some(numerical_col_writer) = self
.numerical_field_hash_map
.get::<NumericalColumnWriter>(sort_field.as_bytes())
.or_else(|| {
self.datetime_field_hash_map
.get::<NumericalColumnWriter>(sort_field.as_bytes())
})
else {
return Vec::new();
};
let mut symbols_buffer = Vec::new();
let mut values = Vec::new();
let mut start_doc_check_fill = 0;
let mut current_doc_opt: Option<RowId> = None;
// Assumption: NewDoc will never call the same doc twice and is strictly increasing between
// calls
for op in numerical_col_writer.operation_iterator(&self.arena, None, &mut symbols_buffer) {
match op {
ColumnOperation::NewDoc(doc) => {
current_doc_opt = Some(doc);
}
ColumnOperation::Value(numerical_value) => {
if let Some(current_doc) = current_doc_opt {
// Fill up with 0.0 since last doc
values.extend((start_doc_check_fill..current_doc).map(|doc| (0.0, doc)));
start_doc_check_fill = current_doc + 1;
// handle multi values
current_doc_opt = None;
let score: f32 = f64::coerce(numerical_value) as f32;
values.push((score, current_doc));
}
}
}
}
for doc in values.len() as u32..num_docs {
values.push((0.0f32, doc));
}
values.sort_by(|(left_score, _), (right_score, _)| {
if reversed {
right_score.total_cmp(left_score)
} else {
left_score.total_cmp(right_score)
}
});
values.into_iter().map(|(_score, doc)| doc).collect()
}
/// Records a column type. This is useful to bypass the coercion process,
/// to make sure the column is present in the resulting columnar even if it is empty, or
/// to set `sort_values_within_row`.
@@ -302,13 +246,9 @@ impl ColumnarWriter {
},
);
}
pub fn serialize(
&mut self,
num_docs: RowId,
old_to_new_row_ids: Option<&[RowId]>,
wrt: &mut dyn io::Write,
) -> io::Result<()> {
pub fn serialize(&mut self, num_docs: RowId, wrt: &mut dyn io::Write) -> io::Result<()> {
let mut serializer = ColumnarSerializer::new(wrt);
let mut columns: Vec<(&[u8], ColumnType, Addr)> = self
.numerical_field_hash_map
.iter()
@@ -322,7 +262,7 @@ impl ColumnarWriter {
columns.extend(
self.bytes_field_hash_map
.iter()
.map(|(term, addr)| (term, ColumnType::Bytes, addr)),
.map(|(column_name, addr)| (column_name, ColumnType::Bytes, addr)),
);
columns.extend(
self.str_field_hash_map
@@ -349,6 +289,12 @@ impl ColumnarWriter {
let (arena, buffers, dictionaries) = (&self.arena, &mut self.buffers, &self.dictionaries);
let mut symbol_byte_buffer: Vec<u8> = Vec::new();
for (column_name, column_type, addr) in columns {
if column_name.contains(&JSON_END_OF_PATH) {
// Tantivy uses the 0 byte (`JSON_END_OF_PATH`) as a separator for nested fields in
// JSON. Column names containing this byte are not supported and are simply ignored
// by the columnar (and the inverted index).
continue;
}
match column_type {
ColumnType::Bool => {
let column_writer: ColumnWriter = self.bool_field_hash_map.read(addr);
@@ -358,11 +304,7 @@ impl ColumnarWriter {
serialize_bool_column(
cardinality,
num_docs,
column_writer.operation_iterator(
arena,
old_to_new_row_ids,
&mut symbol_byte_buffer,
),
column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
buffers,
&mut column_serializer,
)?;
@@ -376,11 +318,7 @@ impl ColumnarWriter {
serialize_ip_addr_column(
cardinality,
num_docs,
column_writer.operation_iterator(
arena,
old_to_new_row_ids,
&mut symbol_byte_buffer,
),
column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
buffers,
&mut column_serializer,
)?;
@@ -405,11 +343,8 @@ impl ColumnarWriter {
num_docs,
str_or_bytes_column_writer.sort_values_within_row,
dictionary_builder,
str_or_bytes_column_writer.operation_iterator(
arena,
old_to_new_row_ids,
&mut symbol_byte_buffer,
),
str_or_bytes_column_writer
.operation_iterator(arena, &mut symbol_byte_buffer),
buffers,
&self.arena,
&mut column_serializer,
@@ -427,11 +362,7 @@ impl ColumnarWriter {
cardinality,
num_docs,
numerical_type,
numerical_column_writer.operation_iterator(
arena,
old_to_new_row_ids,
&mut symbol_byte_buffer,
),
numerical_column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
buffers,
&mut column_serializer,
)?;
@@ -446,11 +377,7 @@ impl ColumnarWriter {
cardinality,
num_docs,
NumericalType::I64,
column_writer.operation_iterator(
arena,
old_to_new_row_ids,
&mut symbol_byte_buffer,
),
column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
buffers,
&mut column_serializer,
)?;
@@ -635,16 +562,16 @@ fn send_to_serialize_column_mappable_to_u128<
let optional_index_builder = value_index_builders.borrow_optional_index_builder();
consume_operation_iterator(op_iterator, optional_index_builder, values);
let optional_index = optional_index_builder.finish(num_rows);
SerializableColumnIndex::Optional {
SerializableColumnIndex::Optional(SerializableOptionalIndex {
num_rows,
non_null_row_ids: Box::new(optional_index),
}
})
}
Cardinality::Multivalued => {
let multivalued_index_builder = value_index_builders.borrow_multivalued_index_builder();
consume_operation_iterator(op_iterator, multivalued_index_builder, values);
let multivalued_index = multivalued_index_builder.finish(num_rows);
SerializableColumnIndex::Multivalued(Box::new(multivalued_index))
let serializable_multivalued_index = multivalued_index_builder.finish(num_rows);
SerializableColumnIndex::Multivalued(serializable_multivalued_index)
}
};
crate::column::serialize_column_mappable_to_u128(
@@ -655,15 +582,6 @@ fn send_to_serialize_column_mappable_to_u128<
Ok(())
}
fn sort_values_within_row_in_place(multivalued_index: &[RowId], values: &mut [u64]) {
let mut start_index: usize = 0;
for end_index in multivalued_index.iter().copied() {
let end_index = end_index as usize;
values[start_index..end_index].sort_unstable();
start_index = end_index;
}
}
fn send_to_serialize_column_mappable_to_u64(
op_iterator: impl Iterator<Item = ColumnOperation<u64>>,
cardinality: Cardinality,
@@ -687,19 +605,22 @@ fn send_to_serialize_column_mappable_to_u64(
let optional_index_builder = value_index_builders.borrow_optional_index_builder();
consume_operation_iterator(op_iterator, optional_index_builder, values);
let optional_index = optional_index_builder.finish(num_rows);
SerializableColumnIndex::Optional {
SerializableColumnIndex::Optional(SerializableOptionalIndex {
non_null_row_ids: Box::new(optional_index),
num_rows,
}
})
}
Cardinality::Multivalued => {
let multivalued_index_builder = value_index_builders.borrow_multivalued_index_builder();
consume_operation_iterator(op_iterator, multivalued_index_builder, values);
let multivalued_index = multivalued_index_builder.finish(num_rows);
let serializable_multivalued_index = multivalued_index_builder.finish(num_rows);
if sort_values_within_row {
sort_values_within_row_in_place(multivalued_index, values);
sort_values_within_row_in_place(
serializable_multivalued_index.start_offsets.boxed_iter(),
values,
);
}
SerializableColumnIndex::Multivalued(Box::new(multivalued_index))
SerializableColumnIndex::Multivalued(serializable_multivalued_index)
}
};
crate::column::serialize_column_mappable_to_u64(
@@ -710,6 +631,18 @@ fn send_to_serialize_column_mappable_to_u64(
Ok(())
}
fn sort_values_within_row_in_place(
multivalued_index: impl Iterator<Item = RowId>,
values: &mut [u64],
) {
let mut start_index: usize = 0;
for end_index in multivalued_index {
let end_index = end_index as usize;
values[start_index..end_index].sort_unstable();
start_index = end_index;
}
}
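The builder now hands this function an iterator over row end offsets; a sketch with hypothetical offsets:

// End offsets [2, 3, 5] delimit the rows [0..2), [2..3) and [3..5).
let mut values = vec![3u64, 1, 2, 5, 4];
sort_values_within_row_in_place([2u32, 3, 5].into_iter(), &mut values);
assert_eq!(values, vec![1, 3, 2, 4, 5]); // each row is sorted independently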
fn coerce_numerical_symbol<T>(
operation_iterator: impl Iterator<Item = ColumnOperation<NumericalValue>>,
) -> impl Iterator<Item = ColumnOperation<u64>>
@@ -757,7 +690,7 @@ mod tests {
assert_eq!(column_writer.get_cardinality(3), Cardinality::Full);
let mut buffer = Vec::new();
let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
.operation_iterator(&arena, None, &mut buffer)
.operation_iterator(&arena, &mut buffer)
.collect();
assert_eq!(symbols.len(), 6);
assert!(matches!(symbols[0], ColumnOperation::NewDoc(0u32)));
@@ -786,7 +719,7 @@ mod tests {
assert_eq!(column_writer.get_cardinality(3), Cardinality::Optional);
let mut buffer = Vec::new();
let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
.operation_iterator(&arena, None, &mut buffer)
.operation_iterator(&arena, &mut buffer)
.collect();
assert_eq!(symbols.len(), 4);
assert!(matches!(symbols[0], ColumnOperation::NewDoc(1u32)));
@@ -809,7 +742,7 @@ mod tests {
assert_eq!(column_writer.get_cardinality(2), Cardinality::Optional);
let mut buffer = Vec::new();
let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
.operation_iterator(&arena, None, &mut buffer)
.operation_iterator(&arena, &mut buffer)
.collect();
assert_eq!(symbols.len(), 2);
assert!(matches!(symbols[0], ColumnOperation::NewDoc(0u32)));
@@ -828,7 +761,7 @@ mod tests {
assert_eq!(column_writer.get_cardinality(1), Cardinality::Multivalued);
let mut buffer = Vec::new();
let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
.operation_iterator(&arena, None, &mut buffer)
.operation_iterator(&arena, &mut buffer)
.collect();
assert_eq!(symbols.len(), 3);
assert!(matches!(symbols[0], ColumnOperation::NewDoc(0u32)));

View File

@@ -1,6 +1,7 @@
use std::io;
use std::io::Write;
use common::json_path_writer::JSON_END_OF_PATH;
use common::{BinarySerializable, CountingWriter};
use sstable::value::RangeValueWriter;
use sstable::RangeSSTable;
@@ -18,13 +19,8 @@ pub struct ColumnarSerializer<W: io::Write> {
/// code.
fn prepare_key(key: &[u8], column_type: ColumnType, buffer: &mut Vec<u8>) {
buffer.clear();
// Convert 0 bytes to '0' string, as 0 bytes are reserved for the end of the path.
if key.contains(&0u8) {
buffer.extend(key.iter().map(|&b| if b == 0 { b'0' } else { b }));
} else {
buffer.extend_from_slice(key);
}
buffer.push(0u8);
buffer.extend_from_slice(key);
buffer.push(JSON_END_OF_PATH);
buffer.push(column_type.to_code());
}
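With the zero-byte escaping gone (such column names are now skipped upstream in `ColumnarWriter::serialize`), the key layout is simply the column name, the `JSON_END_OF_PATH` byte, and the type code; a sketch with a hypothetical column name:

let mut buffer = Vec::new();
prepare_key(b"price", ColumnType::U64, &mut buffer);
assert_eq!(&buffer[..5], b"price");
assert_eq!(buffer[5], JSON_END_OF_PATH); // the 0 byte terminating the path
assert_eq!(buffer[6], ColumnType::U64.to_code());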
@@ -97,18 +93,3 @@ impl<'a, W: io::Write> io::Write for ColumnSerializer<'a, W> {
self.columnar_serializer.wrt.write_all(buf)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_prepare_key_bytes() {
let mut buffer: Vec<u8> = b"somegarbage".to_vec();
prepare_key(b"root\0child", ColumnType::Str, &mut buffer);
assert_eq!(buffer.len(), 12);
assert_eq!(&buffer[..10], b"root0child");
assert_eq!(buffer[10], 0u8);
assert_eq!(buffer[11], ColumnType::Str.to_code());
}
}

View File

@@ -1,3 +1,4 @@
use crate::column_index::{SerializableMultivalueIndex, SerializableOptionalIndex};
use crate::iterable::Iterable;
use crate::RowId;
@@ -59,31 +60,47 @@ impl IndexBuilder for OptionalIndexBuilder {
#[derive(Default)]
pub struct MultivaluedIndexBuilder {
start_offsets: Vec<RowId>,
doc_with_values: Vec<RowId>,
start_offsets: Vec<u32>,
total_num_vals_seen: u32,
current_row: RowId,
current_row_has_value: bool,
}
impl MultivaluedIndexBuilder {
pub fn finish(&mut self, num_docs: RowId) -> &[u32] {
self.start_offsets
.resize(num_docs as usize + 1, self.total_num_vals_seen);
&self.start_offsets[..]
pub fn finish(&mut self, num_docs: RowId) -> SerializableMultivalueIndex<'_> {
self.start_offsets.push(self.total_num_vals_seen);
let non_null_row_ids: Box<dyn Iterable<RowId>> = Box::new(&self.doc_with_values[..]);
SerializableMultivalueIndex {
doc_ids_with_values: SerializableOptionalIndex {
non_null_row_ids,
num_rows: num_docs,
},
start_offsets: Box::new(&self.start_offsets[..]),
}
}
fn reset(&mut self) {
self.doc_with_values.clear();
self.start_offsets.clear();
self.start_offsets.push(0u32);
self.total_num_vals_seen = 0;
self.current_row = 0;
self.current_row_has_value = false;
}
}
impl IndexBuilder for MultivaluedIndexBuilder {
fn record_row(&mut self, row_id: RowId) {
self.start_offsets
.resize(row_id as usize + 1, self.total_num_vals_seen);
self.current_row = row_id;
self.current_row_has_value = false;
}
fn record_value(&mut self) {
if !self.current_row_has_value {
self.current_row_has_value = true;
self.doc_with_values.push(self.current_row);
self.start_offsets.push(self.total_num_vals_seen);
}
self.total_num_vals_seen += 1;
}
}
@@ -141,6 +158,32 @@ mod tests {
);
}
#[test]
fn test_multivalued_value_index_builder_simple() {
let mut multivalued_value_index_builder = MultivaluedIndexBuilder::default();
{
multivalued_value_index_builder.record_row(0u32);
multivalued_value_index_builder.record_value();
multivalued_value_index_builder.record_value();
let serialized_multivalue_index = multivalued_value_index_builder.finish(1u32);
let start_offsets: Vec<u32> = serialized_multivalue_index
.start_offsets
.boxed_iter()
.collect();
assert_eq!(&start_offsets, &[0, 2]);
}
multivalued_value_index_builder.reset();
multivalued_value_index_builder.record_row(0u32);
multivalued_value_index_builder.record_value();
multivalued_value_index_builder.record_value();
let serialized_multivalue_index = multivalued_value_index_builder.finish(1u32);
let start_offsets: Vec<u32> = serialized_multivalue_index
.start_offsets
.boxed_iter()
.collect();
assert_eq!(&start_offsets, &[0, 2]);
}
#[test]
fn test_multivalued_value_index_builder() {
let mut multivalued_value_index_builder = MultivaluedIndexBuilder::default();
@@ -149,17 +192,15 @@ mod tests {
multivalued_value_index_builder.record_value();
multivalued_value_index_builder.record_row(2u32);
multivalued_value_index_builder.record_value();
assert_eq!(
multivalued_value_index_builder.finish(4u32).to_vec(),
vec![0, 0, 2, 3, 3]
);
multivalued_value_index_builder.reset();
multivalued_value_index_builder.record_row(2u32);
multivalued_value_index_builder.record_value();
multivalued_value_index_builder.record_value();
assert_eq!(
multivalued_value_index_builder.finish(4u32).to_vec(),
vec![0, 0, 0, 2, 2]
);
let SerializableMultivalueIndex {
doc_ids_with_values,
start_offsets,
} = multivalued_value_index_builder.finish(4u32);
assert_eq!(doc_ids_with_values.num_rows, 4u32);
let doc_ids_with_values: Vec<u32> =
doc_ids_with_values.non_null_row_ids.boxed_iter().collect();
assert_eq!(&doc_ids_with_values, &[1u32, 2u32]);
let start_offsets: Vec<u32> = start_offsets.boxed_iter().collect();
assert_eq!(&start_offsets[..], &[0, 2, 3]);
}
}

View File

@@ -0,0 +1,183 @@
use std::path::PathBuf;
use itertools::Itertools;
use crate::{
merge_columnar, Cardinality, Column, ColumnarReader, DynamicColumn, StackMergeOrder,
CURRENT_VERSION,
};
const NUM_DOCS: u32 = u16::MAX as u32;
fn generate_columnar(num_docs: u32, value_offset: u64) -> Vec<u8> {
use crate::ColumnarWriter;
let mut columnar_writer = ColumnarWriter::default();
for i in 0..num_docs {
if i % 100 == 0 {
columnar_writer.record_numerical(i, "sparse", value_offset + i as u64);
}
if i % 5 == 0 {
columnar_writer.record_numerical(i, "dense", value_offset + i as u64);
}
columnar_writer.record_numerical(i, "full", value_offset + i as u64);
columnar_writer.record_numerical(i, "multi", value_offset + i as u64);
columnar_writer.record_numerical(i, "multi", value_offset + i as u64);
}
let mut wrt: Vec<u8> = Vec::new();
columnar_writer.serialize(num_docs, &mut wrt).unwrap();
wrt
}
#[test]
/// Writes a columnar for the CURRENT_VERSION to disk.
fn create_format() {
let version = CURRENT_VERSION.to_string();
let file_path = path_for_version(&version);
if PathBuf::from(file_path.clone()).exists() {
return;
}
let columnar = generate_columnar(NUM_DOCS, 0);
std::fs::write(file_path, columnar).unwrap();
}
fn path_for_version(version: &str) -> String {
format!("./compat_tests_data/{}.columnar", version)
}
#[test]
fn test_format_v1() {
let path = path_for_version("v1");
test_format(&path);
}
#[test]
fn test_format_v2() {
let path = path_for_version("v2");
test_format(&path);
}
fn test_format(path: &str) {
let file_content = std::fs::read(path).unwrap();
let reader = ColumnarReader::open(file_content).unwrap();
check_columns(&reader);
// Test merge
let reader2 = ColumnarReader::open(generate_columnar(NUM_DOCS, NUM_DOCS as u64)).unwrap();
let columnar_readers = vec![&reader, &reader2];
let merge_row_order = StackMergeOrder::stack(&columnar_readers[..]);
let mut out = Vec::new();
merge_columnar(&columnar_readers, &[], merge_row_order.into(), &mut out).unwrap();
let reader = ColumnarReader::open(out).unwrap();
check_columns(&reader);
}
fn check_columns(reader: &ColumnarReader) {
let column = open_column(reader, "full");
check_column(&column, |doc_id| vec![(doc_id, doc_id as u64).into()]);
assert_eq!(column.get_cardinality(), Cardinality::Full);
let column = open_column(reader, "multi");
check_column(&column, |doc_id| {
vec![
(doc_id * 2, doc_id as u64).into(),
(doc_id * 2 + 1, doc_id as u64).into(),
]
});
assert_eq!(column.get_cardinality(), Cardinality::Multivalued);
let column = open_column(reader, "sparse");
check_column(&column, |doc_id| {
if doc_id % 100 == 0 {
vec![(doc_id / 100, doc_id as u64).into()]
} else {
vec![]
}
});
assert_eq!(column.get_cardinality(), Cardinality::Optional);
let column = open_column(reader, "dense");
check_column(&column, |doc_id| {
if doc_id % 5 == 0 {
vec![(doc_id / 5, doc_id as u64).into()]
} else {
vec![]
}
});
assert_eq!(column.get_cardinality(), Cardinality::Optional);
}
struct RowIdAndValue {
row_id: u32,
value: u64,
}
impl From<(u32, u64)> for RowIdAndValue {
fn from((row_id, value): (u32, u64)) -> Self {
Self { row_id, value }
}
}
fn check_column<F: Fn(u32) -> Vec<RowIdAndValue>>(column: &Column<u64>, expected: F) {
let num_docs = column.num_docs();
let test_doc = |doc: u32| {
if expected(doc).is_empty() {
assert_eq!(column.first(doc), None);
} else {
assert_eq!(column.first(doc), Some(expected(doc)[0].value));
}
let values = column.values_for_doc(doc).collect_vec();
assert_eq!(values, expected(doc).iter().map(|x| x.value).collect_vec());
let mut row_ids = Vec::new();
column.row_ids_for_docs(&[doc], &mut vec![], &mut row_ids);
assert_eq!(
row_ids,
expected(doc).iter().map(|x| x.row_id).collect_vec()
);
let values = column.values_for_doc(doc).collect_vec();
assert_eq!(values, expected(doc).iter().map(|x| x.value).collect_vec());
// Doc id to row id conversion
let mut row_ids = Vec::new();
let safe_next_doc = |doc: u32| (doc + 1).min(num_docs - 1);
column
.index
.docids_to_rowids(&[doc, safe_next_doc(doc)], &mut vec![], &mut row_ids);
let expected_rowids = expected(doc)
.iter()
.map(|x| x.row_id)
.chain(expected(safe_next_doc(doc)).iter().map(|x| x.row_id))
.collect_vec();
assert_eq!(row_ids, expected_rowids);
let rowid_range = column
.index
.docid_range_to_rowids(doc..safe_next_doc(doc) + 1);
if expected_rowids.is_empty() {
assert!(rowid_range.is_empty());
} else {
assert_eq!(
rowid_range,
expected_rowids[0]..expected_rowids.last().unwrap() + 1
);
}
};
test_doc(0);
test_doc(num_docs - 1);
test_doc(num_docs - 2);
test_doc(65000);
}
fn open_column(reader: &ColumnarReader, name: &str) -> Column<u64> {
let column = reader.read_columns(name).unwrap()[0]
.open()
.unwrap()
.coerce_numerical(crate::NumericalType::U64)
.unwrap();
let DynamicColumn::U64(column) = column else {
panic!();
};
column
}

View File

@@ -8,7 +8,7 @@ use common::{ByteCount, DateTime, HasLen, OwnedBytes};
use crate::column::{BytesColumn, Column, StrColumn};
use crate::column_values::{monotonic_map_column, StrictlyMonotonicFn};
use crate::columnar::ColumnType;
use crate::{Cardinality, ColumnIndex, ColumnValues, NumericalType};
use crate::{Cardinality, ColumnIndex, ColumnValues, NumericalType, Version};
#[derive(Clone)]
pub enum DynamicColumn {
@@ -232,6 +232,7 @@ static_dynamic_conversions!(Column<Ipv6Addr>, IpAddr);
pub struct DynamicColumnHandle {
pub(crate) file_slice: FileSlice,
pub(crate) column_type: ColumnType,
pub(crate) format_version: Version,
}
impl DynamicColumnHandle {
@@ -260,11 +261,15 @@ impl DynamicColumnHandle {
let column_bytes = self.file_slice.read_bytes()?;
match self.column_type {
ColumnType::Str | ColumnType::Bytes => {
let column: BytesColumn = crate::column::open_column_bytes(column_bytes)?;
let column: BytesColumn =
crate::column::open_column_bytes(column_bytes, self.format_version)?;
Ok(Some(column.term_ord_column))
}
ColumnType::IpAddr => {
let column = crate::column::open_column_u128_as_compact_u64(column_bytes)?;
let column = crate::column::open_column_u128_as_compact_u64(
column_bytes,
self.format_version,
)?;
Ok(Some(column))
}
ColumnType::Bool
@@ -272,7 +277,8 @@ impl DynamicColumnHandle {
| ColumnType::U64
| ColumnType::F64
| ColumnType::DateTime => {
let column = crate::column::open_column_u64::<u64>(column_bytes)?;
let column =
crate::column::open_column_u64::<u64>(column_bytes, self.format_version)?;
Ok(Some(column))
}
}
@@ -280,15 +286,31 @@ impl DynamicColumnHandle {
fn open_internal(&self, column_bytes: OwnedBytes) -> io::Result<DynamicColumn> {
let dynamic_column: DynamicColumn = match self.column_type {
ColumnType::Bytes => crate::column::open_column_bytes(column_bytes)?.into(),
ColumnType::Str => crate::column::open_column_str(column_bytes)?.into(),
ColumnType::I64 => crate::column::open_column_u64::<i64>(column_bytes)?.into(),
ColumnType::U64 => crate::column::open_column_u64::<u64>(column_bytes)?.into(),
ColumnType::F64 => crate::column::open_column_u64::<f64>(column_bytes)?.into(),
ColumnType::Bool => crate::column::open_column_u64::<bool>(column_bytes)?.into(),
ColumnType::IpAddr => crate::column::open_column_u128::<Ipv6Addr>(column_bytes)?.into(),
ColumnType::Bytes => {
crate::column::open_column_bytes(column_bytes, self.format_version)?.into()
}
ColumnType::Str => {
crate::column::open_column_str(column_bytes, self.format_version)?.into()
}
ColumnType::I64 => {
crate::column::open_column_u64::<i64>(column_bytes, self.format_version)?.into()
}
ColumnType::U64 => {
crate::column::open_column_u64::<u64>(column_bytes, self.format_version)?.into()
}
ColumnType::F64 => {
crate::column::open_column_u64::<f64>(column_bytes, self.format_version)?.into()
}
ColumnType::Bool => {
crate::column::open_column_u64::<bool>(column_bytes, self.format_version)?.into()
}
ColumnType::IpAddr => {
crate::column::open_column_u128::<Ipv6Addr>(column_bytes, self.format_version)?
.into()
}
ColumnType::DateTime => {
crate::column::open_column_u64::<DateTime>(column_bytes)?.into()
crate::column::open_column_u64::<DateTime>(column_bytes, self.format_version)?
.into()
}
};
Ok(dynamic_column)

View File

@@ -1,4 +1,7 @@
use std::ops::Range;
use std::sync::Arc;
use crate::{ColumnValues, RowId};
pub trait Iterable<T = u64> {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_>;
@@ -17,3 +20,9 @@ where Range<T>: Iterator<Item = T>
Box::new(self.clone())
}
}
impl Iterable for Arc<dyn crate::ColumnValues<RowId>> {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
Box::new(self.iter().map(|row_id| row_id as u64))
}
}

View File

@@ -48,7 +48,7 @@ pub use column_values::{
};
pub use columnar::{
merge_columnar, ColumnType, ColumnarReader, ColumnarWriter, HasAssociatedColumnType,
MergeRowOrder, ShuffleMergeOrder, StackMergeOrder,
MergeRowOrder, ShuffleMergeOrder, StackMergeOrder, Version, CURRENT_VERSION,
};
use sstable::VoidSSTable;
pub use value::{NumericalType, NumericalValue};
@@ -131,3 +131,6 @@ impl Cardinality {
#[cfg(test)]
mod tests;
#[cfg(test)]
mod compat_tests;

View File

@@ -21,7 +21,7 @@ fn test_dataframe_writer_str() {
dataframe_writer.record_str(1u32, "my_string", "hello");
dataframe_writer.record_str(3u32, "my_string", "helloeee");
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer.serialize(5, None, &mut buffer).unwrap();
dataframe_writer.serialize(5, &mut buffer).unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
@@ -35,7 +35,7 @@ fn test_dataframe_writer_bytes() {
dataframe_writer.record_bytes(1u32, "my_string", b"hello");
dataframe_writer.record_bytes(3u32, "my_string", b"helloeee");
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer.serialize(5, None, &mut buffer).unwrap();
dataframe_writer.serialize(5, &mut buffer).unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
@@ -49,7 +49,7 @@ fn test_dataframe_writer_bool() {
dataframe_writer.record_bool(1u32, "bool.value", false);
dataframe_writer.record_bool(3u32, "bool.value", true);
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer.serialize(5, None, &mut buffer).unwrap();
dataframe_writer.serialize(5, &mut buffer).unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("bool.value").unwrap();
@@ -74,12 +74,12 @@ fn test_dataframe_writer_u64_multivalued() {
dataframe_writer.record_numerical(6u32, "divisor", 2u64);
dataframe_writer.record_numerical(6u32, "divisor", 3u64);
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer.serialize(7, None, &mut buffer).unwrap();
dataframe_writer.serialize(7, &mut buffer).unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("divisor").unwrap();
assert_eq!(cols.len(), 1);
assert_eq!(cols[0].num_bytes(), 29);
assert_eq!(cols[0].num_bytes(), 50);
let dyn_i64_col = cols[0].open().unwrap();
let DynamicColumn::I64(divisor_col) = dyn_i64_col else {
panic!();
@@ -97,7 +97,7 @@ fn test_dataframe_writer_ip_addr() {
dataframe_writer.record_ip_addr(1, "ip_addr", Ipv6Addr::from_u128(1001));
dataframe_writer.record_ip_addr(3, "ip_addr", Ipv6Addr::from_u128(1050));
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer.serialize(5, None, &mut buffer).unwrap();
dataframe_writer.serialize(5, &mut buffer).unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("ip_addr").unwrap();
@@ -128,7 +128,7 @@ fn test_dataframe_writer_numerical() {
dataframe_writer.record_numerical(2u32, "srical.value", NumericalValue::U64(13u64));
dataframe_writer.record_numerical(4u32, "srical.value", NumericalValue::U64(15u64));
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer.serialize(6, None, &mut buffer).unwrap();
dataframe_writer.serialize(6, &mut buffer).unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("srical.value").unwrap();
@@ -153,46 +153,6 @@ fn test_dataframe_writer_numerical() {
assert_eq!(column_i64.first(6), None); //< we can change the spec for that one.
}
#[test]
fn test_dataframe_sort_by_full() {
let mut dataframe_writer = ColumnarWriter::default();
dataframe_writer.record_numerical(0u32, "value", NumericalValue::U64(1));
dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(2));
let data = dataframe_writer.sort_order("value", 2, false);
assert_eq!(data, vec![0, 1]);
}
#[test]
fn test_dataframe_sort_by_opt() {
let mut dataframe_writer = ColumnarWriter::default();
dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(3));
dataframe_writer.record_numerical(3u32, "value", NumericalValue::U64(2));
let data = dataframe_writer.sort_order("value", 5, false);
// 0, 2, 4 is 0.0
assert_eq!(data, vec![0, 2, 4, 3, 1]);
let data = dataframe_writer.sort_order("value", 5, true);
assert_eq!(
data,
vec![4, 2, 0, 3, 1].into_iter().rev().collect::<Vec<_>>()
);
}
#[test]
fn test_dataframe_sort_by_multi() {
let mut dataframe_writer = ColumnarWriter::default();
// valid for sort
dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(2));
// those are ignored for sort
dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(4));
dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(4));
// valid for sort
dataframe_writer.record_numerical(3u32, "value", NumericalValue::U64(3));
// ignored, would change sort order
dataframe_writer.record_numerical(3u32, "value", NumericalValue::U64(1));
let data = dataframe_writer.sort_order("value", 4, false);
assert_eq!(data, vec![0, 2, 1, 3]);
}
#[test]
fn test_dictionary_encoded_str() {
let mut buffer = Vec::new();
@@ -201,7 +161,7 @@ fn test_dictionary_encoded_str() {
columnar_writer.record_str(3, "my.column", "c");
columnar_writer.record_str(3, "my.column2", "different_column!");
columnar_writer.record_str(4, "my.column", "b");
columnar_writer.serialize(5, None, &mut buffer).unwrap();
columnar_writer.serialize(5, &mut buffer).unwrap();
let columnar_reader = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar_reader.num_columns(), 2);
let col_handles = columnar_reader.read_columns("my.column").unwrap();
@@ -235,7 +195,7 @@ fn test_dictionary_encoded_bytes() {
columnar_writer.record_bytes(3, "my.column", b"c");
columnar_writer.record_bytes(3, "my.column2", b"different_column!");
columnar_writer.record_bytes(4, "my.column", b"b");
columnar_writer.serialize(5, None, &mut buffer).unwrap();
columnar_writer.serialize(5, &mut buffer).unwrap();
let columnar_reader = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar_reader.num_columns(), 2);
let col_handles = columnar_reader.read_columns("my.column").unwrap();
@@ -344,7 +304,7 @@ fn column_value_strategy() -> impl Strategy<Value = ColumnValue> {
ip_addr_byte
))),
1 => any::<bool>().prop_map(ColumnValue::Bool),
1 => (0_679_723_993i64..1_679_723_995i64)
1 => (679_723_993i64..1_679_723_995i64)
.prop_map(|val| { ColumnValue::DateTime(DateTime::from_timestamp_secs(val)) })
]
}
@@ -369,26 +329,12 @@ fn columnar_docs_strategy() -> impl Strategy<Value = Vec<Vec<(&'static str, Colu
.prop_flat_map(|num_docs| proptest::collection::vec(doc_strategy(), num_docs))
}
fn columnar_docs_and_mapping_strategy(
) -> impl Strategy<Value = (Vec<Vec<(&'static str, ColumnValue)>>, Vec<RowId>)> {
columnar_docs_strategy().prop_flat_map(|docs| {
permutation_strategy(docs.len()).prop_map(move |permutation| (docs.clone(), permutation))
})
}
fn permutation_strategy(n: usize) -> impl Strategy<Value = Vec<RowId>> {
Just((0u32..n as RowId).collect()).prop_shuffle()
}
fn permutation_and_subset_strategy(n: usize) -> impl Strategy<Value = Vec<usize>> {
let vals: Vec<usize> = (0..n).collect();
subsequence(vals, 0..=n).prop_shuffle()
}
fn build_columnar_with_mapping(
docs: &[Vec<(&'static str, ColumnValue)>],
old_to_new_row_ids_opt: Option<&[RowId]>,
) -> ColumnarReader {
fn build_columnar_with_mapping(docs: &[Vec<(&'static str, ColumnValue)>]) -> ColumnarReader {
let num_docs = docs.len() as u32;
let mut buffer = Vec::new();
let mut columnar_writer = ColumnarWriter::default();
@@ -416,15 +362,13 @@ fn build_columnar_with_mapping(
}
}
}
columnar_writer
.serialize(num_docs, old_to_new_row_ids_opt, &mut buffer)
.unwrap();
columnar_writer.serialize(num_docs, &mut buffer).unwrap();
ColumnarReader::open(buffer).unwrap()
}
fn build_columnar(docs: &[Vec<(&'static str, ColumnValue)>]) -> ColumnarReader {
build_columnar_with_mapping(docs, None)
build_columnar_with_mapping(docs)
}
fn assert_columnar_eq_strict(left: &ColumnarReader, right: &ColumnarReader) {
@@ -448,6 +392,7 @@ fn assert_columnar_eq(
}
}
#[track_caller]
fn assert_column_eq<T: Copy + PartialOrd + Debug + Send + Sync + 'static>(
left: &Column<T>,
right: &Column<T>,
@@ -683,54 +628,6 @@ proptest! {
}
}
// Same as `test_single_columnar_builder_proptest` but with a shuffling mapping.
proptest! {
#![proptest_config(ProptestConfig::with_cases(500))]
#[test]
fn test_single_columnar_builder_with_shuffle_proptest((docs, mapping) in columnar_docs_and_mapping_strategy()) {
let columnar = build_columnar_with_mapping(&docs[..], Some(&mapping));
assert_eq!(columnar.num_rows() as usize, docs.len());
let mut expected_columns: HashMap<(&str, ColumnTypeCategory), HashMap<u32, Vec<&ColumnValue>> > = Default::default();
for (doc_id, doc_vals) in docs.iter().enumerate() {
for (col_name, col_val) in doc_vals {
expected_columns
.entry((col_name, col_val.column_type_category()))
.or_default()
.entry(mapping[doc_id])
.or_default()
.push(col_val);
}
}
let column_list = columnar.list_columns().unwrap();
assert_eq!(expected_columns.len(), column_list.len());
for (column_name, column) in column_list {
let dynamic_column = column.open().unwrap();
let col_category: ColumnTypeCategory = dynamic_column.column_type().into();
let expected_col_values: &HashMap<u32, Vec<&ColumnValue>> = expected_columns.get(&(column_name.as_str(), col_category)).unwrap();
for _doc_id in 0..columnar.num_rows() {
match &dynamic_column {
DynamicColumn::Bool(col) =>
assert_column_values(col, expected_col_values),
DynamicColumn::I64(col) =>
assert_column_values(col, expected_col_values),
DynamicColumn::U64(col) =>
assert_column_values(col, expected_col_values),
DynamicColumn::F64(col) =>
assert_column_values(col, expected_col_values),
DynamicColumn::IpAddr(col) =>
assert_column_values(col, expected_col_values),
DynamicColumn::DateTime(col) =>
assert_column_values(col, expected_col_values),
DynamicColumn::Bytes(col) =>
assert_bytes_column_values(col, expected_col_values, false),
DynamicColumn::Str(col) =>
assert_bytes_column_values(col, expected_col_values, true),
}
}
}
}
}
// This test creates 2 or 3 random small columnars and attempts to merge them.
// It compares the resulting merged dataframe with what would have been obtained by building the
// dataframe from the concatenated rows to begin with.
@@ -844,24 +741,68 @@ fn columnar_docs_and_remap(
proptest! {
#![proptest_config(ProptestConfig::with_cases(1000))]
#[test]
fn test_columnar_merge_and_remap_proptest((columnar_docs, shuffle_merge_order) in columnar_docs_and_remap()) {
let shuffled_rows: Vec<Vec<(&'static str, ColumnValue)>> = shuffle_merge_order.iter()
.map(|row_addr| columnar_docs[row_addr.segment_ord as usize][row_addr.row_id as usize].clone())
.collect();
let expected_merged_columnar = build_columnar(&shuffled_rows[..]);
let columnar_readers: Vec<ColumnarReader> = columnar_docs.iter()
.map(|docs| build_columnar(&docs[..]))
.collect::<Vec<_>>();
let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
let mut output: Vec<u8> = Vec::new();
let segment_num_rows: Vec<RowId> = columnar_docs.iter().map(|docs| docs.len() as RowId).collect();
let shuffle_merge_order = ShuffleMergeOrder::for_test(&segment_num_rows, shuffle_merge_order);
crate::merge_columnar(&columnar_readers_arr[..], &[], shuffle_merge_order.into(), &mut output).unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
assert_columnar_eq(&merged_columnar, &expected_merged_columnar, true);
fn test_columnar_merge_and_remap_proptest((columnar_docs, shuffle_merge_order) in
columnar_docs_and_remap()) {
test_columnar_merge_and_remap(columnar_docs, shuffle_merge_order);
}
}
fn test_columnar_merge_and_remap(
columnar_docs: Vec<Vec<Vec<(&'static str, ColumnValue)>>>,
shuffle_merge_order: Vec<RowAddr>,
) {
let shuffled_rows: Vec<Vec<(&'static str, ColumnValue)>> = shuffle_merge_order
.iter()
.map(|row_addr| {
columnar_docs[row_addr.segment_ord as usize][row_addr.row_id as usize].clone()
})
.collect();
let expected_merged_columnar = build_columnar(&shuffled_rows[..]);
let columnar_readers: Vec<ColumnarReader> = columnar_docs
.iter()
.map(|docs| build_columnar(&docs[..]))
.collect::<Vec<_>>();
let columnar_readers_ref: Vec<&ColumnarReader> = columnar_readers.iter().collect();
let mut output: Vec<u8> = Vec::new();
let segment_num_rows: Vec<RowId> = columnar_docs
.iter()
.map(|docs| docs.len() as RowId)
.collect();
let shuffle_merge_order = ShuffleMergeOrder::for_test(&segment_num_rows, shuffle_merge_order);
crate::merge_columnar(
&columnar_readers_ref[..],
&[],
shuffle_merge_order.into(),
&mut output,
)
.unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
assert_columnar_eq(&merged_columnar, &expected_merged_columnar, true);
}
#[test]
fn test_columnar_merge_and_remap_bug_1() {
let columnar_docs = vec![vec![
vec![
("c1", ColumnValue::Numerical(NumericalValue::U64(0))),
("c1", ColumnValue::Numerical(NumericalValue::U64(0))),
],
vec![],
]];
let shuffle_merge_order: Vec<RowAddr> = vec![
RowAddr {
segment_ord: 0,
row_id: 1,
},
RowAddr {
segment_ord: 0,
row_id: 0,
},
];
test_columnar_merge_and_remap(columnar_docs, shuffle_merge_order);
}
#[test]
fn test_columnar_merge_empty() {
let columnar_reader_1 = build_columnar(&[]);


@@ -9,7 +9,6 @@ documentation = "https://docs.rs/tantivy_common/"
homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
@@ -20,5 +19,7 @@ time = { version = "0.3.10", features = ["serde-well-known"] }
serde = { version = "1.0.136", features = ["derive"] }
[dev-dependencies]
binggan = "0.8.1"
proptest = "1.0.0"
rand = "0.8.4"


@@ -1,39 +1,64 @@
#![feature(test)]
use binggan::{black_box, BenchRunner};
use rand::seq::IteratorRandom;
use rand::thread_rng;
use tantivy_common::{serialize_vint_u32, BitSet, TinySet};
extern crate test;
fn bench_vint() {
let mut runner = BenchRunner::new();
#[cfg(test)]
mod tests {
use rand::seq::IteratorRandom;
use rand::thread_rng;
use tantivy_common::serialize_vint_u32;
use test::Bencher;
let vals: Vec<u32> = (0..20_000).collect();
runner.bench_function("bench_vint", move |_| {
let mut out = 0u64;
for val in vals.iter().cloned() {
let mut buf = [0u8; 8];
serialize_vint_u32(val, &mut buf);
out += u64::from(buf[0]);
}
black_box(out);
});
#[bench]
fn bench_vint(b: &mut Bencher) {
let vals: Vec<u32> = (0..20_000).collect();
b.iter(|| {
let mut out = 0u64;
for val in vals.iter().cloned() {
let mut buf = [0u8; 8];
serialize_vint_u32(val, &mut buf);
out += u64::from(buf[0]);
}
out
});
}
#[bench]
fn bench_vint_rand(b: &mut Bencher) {
let vals: Vec<u32> = (0..20_000).choose_multiple(&mut thread_rng(), 100_000);
b.iter(|| {
let mut out = 0u64;
for val in vals.iter().cloned() {
let mut buf = [0u8; 8];
serialize_vint_u32(val, &mut buf);
out += u64::from(buf[0]);
}
out
});
}
let vals: Vec<u32> = (0..20_000).choose_multiple(&mut thread_rng(), 100_000);
runner.bench_function("bench_vint_rand", move |_| {
let mut out = 0u64;
for val in vals.iter().cloned() {
let mut buf = [0u8; 8];
serialize_vint_u32(val, &mut buf);
out += u64::from(buf[0]);
}
black_box(out);
});
}
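For context on what this benchmark measures: `serialize_vint_u32` encodes a `u32` as a variable-length integer. A minimal sketch of the classic 7-bit (LEB128-style) scheme follows; the exact wire format tantivy uses is an assumption here, and `vint_encode` is a hypothetical helper:

```rust
/// Encode `val` as a LEB128-style varint: 7 payload bits per byte,
/// high bit set on every byte except the last. Returns bytes written.
fn vint_encode(mut val: u32, buf: &mut [u8; 5]) -> usize {
    let mut i = 0;
    loop {
        let byte = (val & 0x7F) as u8;
        val >>= 7;
        if val == 0 {
            buf[i] = byte; // last byte: continuation bit clear
            return i + 1;
        }
        buf[i] = byte | 0x80; // more bytes follow
        i += 1;
    }
}

fn main() {
    let mut buf = [0u8; 5];
    assert_eq!(vint_encode(127, &mut buf), 1); // fits in one byte
    assert_eq!(vint_encode(128, &mut buf), 2); // spills into a second byte
    assert_eq!(buf[..2], [0x80, 0x01]);
}
```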
fn bench_bitset() {
let mut runner = BenchRunner::new();
runner.bench_function("bench_tinyset_pop", move |_| {
let mut tinyset = TinySet::singleton(black_box(31u32));
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
black_box(tinyset);
});
let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
runner.bench_function("bench_tinyset_sum", move |_| {
assert_eq!(black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
});
let v = [10u32, 14u32, 21u32];
runner.bench_function("bench_tinyarr_sum", move |_| {
black_box(v.iter().cloned().sum::<u32>());
});
runner.bench_function("bench_bitset_initialize", move |_| {
black_box(BitSet::with_max_value(1_000_000));
});
}
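The TinySet benchmarks above hammer `pop_lowest` on a 64-bit set. The usual implementation of that operation isolates the lowest set bit with `trailing_zeros` and clears it; a sketch of that standard technique (`Tiny64` is a hypothetical stand-in, not necessarily tantivy's `TinySet`):

```rust
/// A 64-bit set of small integers, mirroring the shape of TinySet.
#[derive(Clone, Copy)]
struct Tiny64(u64);

impl Tiny64 {
    fn insert(self, el: u32) -> Self {
        Tiny64(self.0 | (1u64 << el))
    }

    /// Remove and return the lowest element, if any.
    fn pop_lowest(&mut self) -> Option<u32> {
        if self.0 == 0 {
            return None;
        }
        let lowest = self.0.trailing_zeros();
        self.0 &= self.0 - 1; // clear the lowest set bit
        Some(lowest)
    }
}

fn main() {
    let mut set = Tiny64(0).insert(10).insert(14).insert(21);
    assert_eq!(set.pop_lowest(), Some(10));
    assert_eq!(set.pop_lowest(), Some(14));
    assert_eq!(set.pop_lowest(), Some(21));
    assert_eq!(set.pop_lowest(), None);
}
```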
fn main() {
bench_vint();
bench_bitset();
}


@@ -696,43 +696,3 @@ mod tests {
}
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use test;
use super::{BitSet, TinySet};
#[bench]
fn bench_tinyset_pop(b: &mut test::Bencher) {
b.iter(|| {
let mut tinyset = TinySet::singleton(test::black_box(31u32));
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
});
}
#[bench]
fn bench_tinyset_sum(b: &mut test::Bencher) {
let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
b.iter(|| {
assert_eq!(test::black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
});
}
#[bench]
fn bench_tinyarr_sum(b: &mut test::Bencher) {
let v = [10u32, 14u32, 21u32];
b.iter(|| test::black_box(v).iter().cloned().sum::<u32>());
}
#[bench]
fn bench_bitset_initialize(b: &mut test::Bencher) {
b.iter(|| BitSet::with_max_value(1_000_000));
}
}

Binary file not shown (image, 30 KiB).


@@ -7,6 +7,11 @@
- [Other](#other)
- [Usage](#usage)
# Index Sorting has been removed!
More info here:
https://github.com/quickwit-oss/tantivy/issues/2352
# Index Sorting
Tantivy allows you to sort the index according to a property.


@@ -19,14 +19,13 @@ use tantivy::{doc, Index, IndexWriter, ReloadPolicy};
use tempfile::TempDir;
fn main() -> tantivy::Result<()> {
// Normally you would use `MmapDirectory` instead to persist data on disk.
// https://docs.rs/tantivy/latest/tantivy/directory/struct.MmapDirectory.html
// But for this example, we will use a temporary directory `TempDir`.
// Let's create a temporary directory for the
// sake of this example
let index_path = TempDir::new()?;
// # Defining the schema
//
// The Tantivy index requires a schema.
// The Tantivy index requires a very strict schema.
// The schema declares which fields are in the index,
// and for each field, its type and "the way it should
// be indexed".


@@ -1,3 +1,5 @@
use std::ops::Bound;
// # Searching a range on an indexed int field.
//
// Below is an example of creating an indexed integer field in your schema
@@ -5,7 +7,7 @@
use tantivy::collector::Count;
use tantivy::query::RangeQuery;
use tantivy::schema::{Schema, INDEXED};
use tantivy::{doc, Index, IndexWriter, Result};
use tantivy::{doc, Index, IndexWriter, Result, Term};
fn main() -> Result<()> {
// For the sake of simplicity, this schema will only have 1 field
@@ -27,7 +29,10 @@ fn main() -> Result<()> {
reader.reload()?;
let searcher = reader.searcher();
// The end is excluded, i.e. here we are searching up to 1969
let docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960..1970);
let docs_in_the_sixties = RangeQuery::new(
Bound::Included(Term::from_field_u64(year_field, 1960)),
Bound::Excluded(Term::from_field_u64(year_field, 1970)),
);
// Uses a Count collector to sum the total number of docs in the range
let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?;
assert_eq!(num_60s_books, 10);


@@ -1,3 +1,4 @@
use std::borrow::Cow;
use std::iter::once;
use nom::branch::alt;
@@ -19,7 +20,7 @@ use crate::Occur;
// Note: the '-' char is only forbidden at the beginning of a field name; it would be clearer to
// add it to the special characters.
const SPECIAL_CHARS: &[char] = &[
'+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '!', '\\', '*', ' ',
'+', '^', '`', ':', '{', '}', '"', '\'', '[', ']', '(', ')', '!', '\\', '*', ' ',
];
/// consume a field name followed by colon. Return the field name with escape sequence
@@ -41,36 +42,92 @@ fn field_name(inp: &str) -> IResult<&str, String> {
)(inp)
}
const ESCAPE_IN_WORD: &[char] = &['^', '`', ':', '{', '}', '"', '\'', '[', ']', '(', ')', '\\'];
fn interpret_escape(source: &str) -> String {
let mut res = String::with_capacity(source.len());
let mut in_escape = false;
let require_escape = |c: char| c.is_whitespace() || ESCAPE_IN_WORD.contains(&c) || c == '-';
for c in source.chars() {
if in_escape {
if !require_escape(c) {
// we re-add the escape sequence
res.push('\\');
}
res.push(c);
in_escape = false;
} else if c == '\\' {
in_escape = true;
} else {
res.push(c);
}
}
res
}
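Concretely, `interpret_escape` only consumes a backslash when the next character genuinely requires escaping, and re-emits the backslash otherwise. A self-contained check, duplicating the function above together with its escape set:

```rust
const ESCAPE_IN_WORD: &[char] = &['^', '`', ':', '{', '}', '"', '\'', '[', ']', '(', ')', '\\'];

fn interpret_escape(source: &str) -> String {
    let mut res = String::with_capacity(source.len());
    let mut in_escape = false;
    let require_escape = |c: char| c.is_whitespace() || ESCAPE_IN_WORD.contains(&c) || c == '-';
    for c in source.chars() {
        if in_escape {
            if !require_escape(c) {
                res.push('\\'); // re-add the unnecessary escape
            }
            res.push(c);
            in_escape = false;
        } else if c == '\\' {
            in_escape = true;
        } else {
            res.push(c);
        }
    }
    res
}

fn main() {
    assert_eq!(interpret_escape(r"abc\:def"), "abc:def"); // ':' needs escaping: consumed
    assert_eq!(interpret_escape(r"abc\*"), r"abc\*");     // '*' does not: backslash kept
    assert_eq!(interpret_escape(r"a\ b"), "a b");         // escaped whitespace unescaped
}
```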
/// Consume a word outside of any context.
// TODO should support escape sequences
fn word(inp: &str) -> IResult<&str, &str> {
fn word(inp: &str) -> IResult<&str, Cow<str>> {
map_res(
recognize(tuple((
satisfy(|c| {
!c.is_whitespace()
&& !['-', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')'].contains(&c)
}),
many0(satisfy(|c: char| {
!c.is_whitespace() && ![':', '^', '{', '}', '"', '[', ']', '(', ')'].contains(&c)
})),
alt((
preceded(char('\\'), anychar),
satisfy(|c| !c.is_whitespace() && !ESCAPE_IN_WORD.contains(&c) && c != '-'),
)),
many0(alt((
preceded(char('\\'), anychar),
satisfy(|c: char| !c.is_whitespace() && !ESCAPE_IN_WORD.contains(&c)),
))),
))),
|s| match s {
"OR" | "AND" | "NOT" | "IN" => Err(Error::new(inp, ErrorKind::Tag)),
_ => Ok(s),
s if s.contains('\\') => Ok(Cow::Owned(interpret_escape(s))),
s => Ok(Cow::Borrowed(s)),
},
)(inp)
}
fn word_infallible(delimiter: &str) -> impl Fn(&str) -> JResult<&str, Option<&str>> + '_ {
|inp| {
opt_i_err(
preceded(
multispace0,
recognize(many1(satisfy(|c| {
!c.is_whitespace() && !delimiter.contains(c)
}))),
fn word_infallible(
delimiter: &str,
emit_error: bool,
) -> impl Fn(&str) -> JResult<&str, Option<Cow<str>>> + '_ {
// `emit_error` controls whether receiving an unescaped `:` should emit an error
move |inp| {
map(
opt_i_err(
preceded(
multispace0,
recognize(many1(alt((
preceded(char::<&str, _>('\\'), anychar),
satisfy(|c| !c.is_whitespace() && !delimiter.contains(c)),
)))),
),
"expected word",
),
"expected word",
|(opt_s, mut errors)| match opt_s {
Some(s) => {
if emit_error
&& (s
.as_bytes()
.windows(2)
.any(|window| window[0] != b'\\' && window[1] == b':')
|| s.starts_with(':'))
{
errors.push(LenientErrorInternal {
pos: inp.len(),
message: "parsed possible invalid field as term".to_string(),
});
}
if s.contains('\\') {
(Some(Cow::Owned(interpret_escape(s))), errors)
} else {
(Some(Cow::Borrowed(s)), errors)
}
}
None => (None, errors),
},
)(inp)
}
}
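The `windows(2)` scan above flags words containing an unescaped `:` so the lenient parser can warn about a possibly malformed field. The predicate, distilled into a runnable snippet:

```rust
/// True if `s` contains a ':' that is not preceded by a backslash
/// (or starts with one), matching the check in word_infallible.
fn has_unescaped_colon(s: &str) -> bool {
    s.starts_with(':')
        || s.as_bytes()
            .windows(2)
            .any(|w| w[0] != b'\\' && w[1] == b':')
}

fn main() {
    assert!(has_unescaped_colon("abc:def"));   // warns: looks like a field
    assert!(has_unescaped_colon(":abc"));      // warns: leading colon
    assert!(!has_unescaped_colon(r"abc\:def")); // escaped colon: no warning
}
```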
@@ -159,7 +216,7 @@ fn simple_term_infallible(
(value((), char('\'')), simple_quotes),
),
// numbers are parsed with words in this case, as we allow strings starting with a -
map(word_infallible(delimiter), |(text, errors)| {
map(word_infallible(delimiter, true), |(text, errors)| {
(text.map(|text| (Delimiter::None, text.to_string())), errors)
}),
)(inp)
@@ -322,15 +379,6 @@ fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>>
|((field_name, _, leaf), mut errors)| {
(
leaf.map(|leaf| {
if matches!(&leaf, UserInputLeaf::Literal(literal)
if literal.phrase.contains(':') && literal.delimiter == Delimiter::None)
&& field_name.is_none()
{
errors.push(LenientErrorInternal {
pos: inp.len(),
message: "parsed possible invalid field as term".to_string(),
});
}
if matches!(&leaf, UserInputLeaf::Literal(literal)
if literal.phrase == "NOT" && literal.delimiter == Delimiter::None)
&& field_name.is_none()
@@ -449,20 +497,20 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
tuple_infallible((
opt_i(anychar),
space0_infallible,
word_infallible("]}"),
word_infallible("]}", false),
space1_infallible,
opt_i_err(
terminated(tag("TO"), alt((value((), multispace1), value((), eof)))),
"missing keyword TO",
),
word_infallible("]}"),
word_infallible("]}", false),
opt_i_err(one_of("]}"), "missing range delimiter"),
)),
|(
(lower_bound_kind, _multispace0, lower, _multispace1, to, upper, upper_bound_kind),
errs,
)| {
let lower_bound = match (lower_bound_kind, lower) {
let lower_bound = match (lower_bound_kind, lower.as_deref()) {
(_, Some("*")) => UserInputBound::Unbounded,
(_, None) => UserInputBound::Unbounded,
// if it is some, TO was actually the bound (i.e. [TO TO something])
@@ -471,7 +519,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
(Some('{'), Some(bound)) => UserInputBound::Exclusive(bound.to_string()),
_ => unreachable!("precondition failed, range did not start with [ or {{"),
};
let upper_bound = match (upper_bound_kind, upper) {
let upper_bound = match (upper_bound_kind, upper.as_deref()) {
(_, Some("*")) => UserInputBound::Unbounded,
(_, None) => UserInputBound::Unbounded,
(Some(']'), Some(bound)) => UserInputBound::Inclusive(bound.to_string()),
@@ -488,7 +536,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
(
(
value((), tag(">=")),
map(word_infallible(""), |(bound, err)| {
map(word_infallible("", false), |(bound, err)| {
(
(
bound
@@ -502,7 +550,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
),
(
value((), tag("<=")),
map(word_infallible(""), |(bound, err)| {
map(word_infallible("", false), |(bound, err)| {
(
(
UserInputBound::Unbounded,
@@ -516,7 +564,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
),
(
value((), tag(">")),
map(word_infallible(""), |(bound, err)| {
map(word_infallible("", false), |(bound, err)| {
(
(
bound
@@ -530,7 +578,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
),
(
value((), tag("<")),
map(word_infallible(""), |(bound, err)| {
map(word_infallible("", false), |(bound, err)| {
(
(
UserInputBound::Unbounded,
@@ -1157,6 +1205,12 @@ mod test {
test_parse_query_to_ast_helper("weight: <= 70", "\"weight\":{\"*\" TO \"70\"]");
test_parse_query_to_ast_helper("weight: <= 70.5", "\"weight\":{\"*\" TO \"70.5\"]");
test_parse_query_to_ast_helper(">a", "{\"a\" TO \"*\"}");
test_parse_query_to_ast_helper(">=a", "[\"a\" TO \"*\"}");
test_parse_query_to_ast_helper("<a", "{\"*\" TO \"a\"}");
test_parse_query_to_ast_helper("<=a", "{\"*\" TO \"a\"]");
test_parse_query_to_ast_helper("<=bsd", "{\"*\" TO \"bsd\"]");
}
#[test]
@@ -1590,5 +1644,21 @@ mod test {
r#"myfield:'hello\"happy\'tax'"#,
r#""myfield":'hello"happy'tax'"#,
);
// we don't process escape sequences for chars which don't require them
test_parse_query_to_ast_helper(r#"abc\*"#, r#"abc\*"#);
}
#[test]
fn test_queries_with_colons() {
test_parse_query_to_ast_helper(r#""abc:def""#, r#""abc:def""#);
test_parse_query_to_ast_helper(r#"'abc:def'"#, r#"'abc:def'"#);
test_parse_query_to_ast_helper(r#"abc\:def"#, r#"abc:def"#);
test_parse_query_to_ast_helper(r#""abc\:def""#, r#""abc:def""#);
test_parse_query_to_ast_helper(r#"'abc\:def'"#, r#"'abc:def'"#);
}
#[test]
fn test_invalid_field() {
test_is_parse_err(r#"!bc:def"#, "!bc:def");
}
}


@@ -34,8 +34,9 @@ use super::bucket::{
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
};
use super::metric::{
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation,
PercentilesAggregationReq, StatsAggregation, SumAggregation, TopHitsAggregation,
AverageAggregation, CardinalityAggregationReq, CountAggregation, ExtendedStatsAggregation,
MaxAggregation, MinAggregation, PercentilesAggregationReq, StatsAggregation, SumAggregation,
TopHitsAggregationReq,
};
/// The top-level aggregation request structure, which contains [`Aggregation`] and their user
@@ -146,6 +147,11 @@ pub enum AggregationVariants {
/// extracted values.
#[serde(rename = "stats")]
Stats(StatsAggregation),
/// Computes a collection of extended statistics (`min`, `max`, `sum`, `count`, `avg`,
/// `sum_of_squares`, `variance`, `variance_sampling`, `std_deviation`,
/// `std_deviation_sampling`) over the extracted values.
#[serde(rename = "extended_stats")]
ExtendedStats(ExtendedStatsAggregation),
/// Computes the sum of the extracted values.
#[serde(rename = "sum")]
Sum(SumAggregation),
@@ -154,7 +160,10 @@ pub enum AggregationVariants {
Percentiles(PercentilesAggregationReq),
/// Finds the top k values matching some order
#[serde(rename = "top_hits")]
TopHits(TopHitsAggregation),
TopHits(TopHitsAggregationReq),
/// Computes an estimate of the number of unique values
#[serde(rename = "cardinality")]
Cardinality(CardinalityAggregationReq),
}
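The new `extended_stats` variant reports variance and standard deviation derived from streaming accumulators (count, sum, sum of squares). For reference, a sketch of the standard formulas in plain Rust; the array layout is illustrative, and tantivy's handling of the `sigma` bounds parameter is not shown:

```rust
/// Derive extended statistics from the three streaming accumulators.
/// Population variance is E[x^2] - E[x]^2; the sampling variant divides by n - 1.
fn extended_stats(count: u64, sum: f64, sum_of_squares: f64) -> [f64; 5] {
    let n = count as f64;
    let avg = sum / n;
    let variance = sum_of_squares / n - avg * avg;
    let variance_sampling = (sum_of_squares - sum * sum / n) / (n - 1.0);
    [
        avg,
        variance,
        variance.sqrt(),          // std_deviation
        variance_sampling,
        variance_sampling.sqrt(), // std_deviation_sampling
    ]
}

fn main() {
    // Values 1, 2, 3: mean 2, population variance 2/3, sample variance 1.
    let [avg, var, _, var_s, _] = extended_stats(3, 6.0, 14.0);
    assert!((avg - 2.0).abs() < 1e-9);
    assert!((var - 2.0 / 3.0).abs() < 1e-9);
    assert!((var_s - 1.0).abs() < 1e-9);
}
```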
impl AggregationVariants {
@@ -170,9 +179,11 @@ impl AggregationVariants {
AggregationVariants::Max(max) => vec![max.field_name()],
AggregationVariants::Min(min) => vec![min.field_name()],
AggregationVariants::Stats(stats) => vec![stats.field_name()],
AggregationVariants::ExtendedStats(extended_stats) => vec![extended_stats.field_name()],
AggregationVariants::Sum(sum) => vec![sum.field_name()],
AggregationVariants::Percentiles(per) => vec![per.field_name()],
AggregationVariants::TopHits(top_hits) => top_hits.field_names(),
AggregationVariants::Cardinality(per) => vec![per.field_name()],
}
}
@@ -197,6 +208,12 @@ impl AggregationVariants {
_ => None,
}
}
pub(crate) fn as_top_hits(&self) -> Option<&TopHitsAggregationReq> {
match &self {
AggregationVariants::TopHits(top_hits) => Some(top_hits),
_ => None,
}
}
pub(crate) fn as_percentile(&self) -> Option<&PercentilesAggregationReq> {
match &self {


@@ -11,8 +11,8 @@ use super::bucket::{
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
};
use super::metric::{
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation, StatsAggregation,
SumAggregation,
AverageAggregation, CardinalityAggregationReq, CountAggregation, ExtendedStatsAggregation,
MaxAggregation, MinAggregation, StatsAggregation, SumAggregation,
};
use super::segment_agg_result::AggregationLimits;
use super::VecWithNames;
@@ -162,6 +162,11 @@ impl AggregationWithAccessor {
field: ref field_name,
ref missing,
..
})
| Cardinality(CardinalityAggregationReq {
field: ref field_name,
ref missing,
..
}) => {
let str_dict_column = reader.fast_fields().str(field_name)?;
let allowed_column_types = [
@@ -276,6 +281,10 @@ impl AggregationWithAccessor {
field: ref field_name,
..
})
| ExtendedStats(ExtendedStatsAggregation {
field: ref field_name,
..
})
| Sum(SumAggregation {
field: ref field_name,
..


@@ -8,7 +8,9 @@ use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
use super::bucket::GetDocCount;
use super::metric::{PercentilesMetricResult, SingleMetricResult, Stats, TopHitsMetricResult};
use super::metric::{
ExtendedStats, PercentilesMetricResult, SingleMetricResult, Stats, TopHitsMetricResult,
};
use super::{AggregationError, Key};
use crate::TantivyError;
@@ -88,12 +90,16 @@ pub enum MetricResult {
Min(SingleMetricResult),
/// Stats metric result.
Stats(Stats),
/// ExtendedStats metric result.
ExtendedStats(Box<ExtendedStats>),
/// Sum metric result.
Sum(SingleMetricResult),
/// Percentiles metric result.
Percentiles(PercentilesMetricResult),
/// Top hits metric result
TopHits(TopHitsMetricResult),
/// Cardinality metric result
Cardinality(SingleMetricResult),
}
impl MetricResult {
@@ -104,6 +110,7 @@ impl MetricResult {
MetricResult::Max(max) => Ok(max.value),
MetricResult::Min(min) => Ok(min.value),
MetricResult::Stats(stats) => stats.get_value(agg_property),
MetricResult::ExtendedStats(extended_stats) => extended_stats.get_value(agg_property),
MetricResult::Sum(sum) => Ok(sum.value),
MetricResult::Percentiles(_) => Err(TantivyError::AggregationError(
AggregationError::InvalidRequest("percentiles can't be used to order".to_string()),
@@ -111,6 +118,7 @@ impl MetricResult {
MetricResult::TopHits(_) => Err(TantivyError::AggregationError(
AggregationError::InvalidRequest("top_hits can't be used to order".to_string()),
)),
MetricResult::Cardinality(card) => Ok(card.value),
}
}
}


@@ -110,6 +110,16 @@ fn test_aggregation_flushing(
}
}
}
},
"cardinality_string_id":{
"cardinality": {
"field": "string_id"
}
},
"cardinality_score":{
"cardinality": {
"field": "score"
}
}
});
@@ -212,6 +222,9 @@ fn test_aggregation_flushing(
)
);
assert_eq!(res["cardinality_string_id"]["value"], 2.0);
assert_eq!(res["cardinality_score"]["value"], 80.0);
Ok(())
}
@@ -926,10 +939,10 @@ fn test_aggregation_on_json_object_mixed_types() {
},
"termagg": {
"buckets": [
{ "doc_count": 1, "key": 10.0, "min_price": { "value": 10.0 } },
{ "doc_count": 1, "key": 10.0, "key_as_string": "10", "min_price": { "value": 10.0 } },
{ "doc_count": 3, "key": "blue", "min_price": { "value": 5.0 } },
{ "doc_count": 2, "key": "red", "min_price": { "value": 1.0 } },
{ "doc_count": 1, "key": -20.5, "min_price": { "value": -20.5 } },
{ "doc_count": 1, "key": -20.5, "key_as_string": "-20.5", "min_price": { "value": -20.5 } },
{ "doc_count": 2, "key": 1.0, "key_as_string": "true", "min_price": { "value": null } },
],
"sum_other_doc_count": 0


@@ -1,10 +1,9 @@
use std::fmt::Debug;
use std::io;
use std::net::Ipv6Addr;
use columnar::column_values::CompactSpaceU64Accessor;
use columnar::{
BytesColumn, ColumnType, MonotonicallyMappableToU128, MonotonicallyMappableToU64, StrColumn,
};
use columnar::{ColumnType, Dictionary, MonotonicallyMappableToU128, MonotonicallyMappableToU64};
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
@@ -466,49 +465,66 @@ impl SegmentTermCollector {
};
if self.column_type == ColumnType::Str {
let fallback_dict = Dictionary::empty();
let term_dict = agg_with_accessor
.str_dict_column
.as_ref()
.cloned()
.unwrap_or_else(|| {
StrColumn::wrap(BytesColumn::empty(agg_with_accessor.accessor.num_docs()))
});
let mut buffer = String::new();
for (term_id, doc_count) in entries {
let intermediate_entry = into_intermediate_bucket_entry(term_id, doc_count)?;
// Special case for missing key
if term_id == u64::MAX {
let missing_key = self
.req
.missing
.as_ref()
.expect("Found placeholder term_id but `missing` is None");
match missing_key {
Key::Str(missing) => {
buffer.clear();
buffer.push_str(missing);
dict.insert(
IntermediateKey::Str(buffer.to_string()),
intermediate_entry,
);
}
Key::F64(val) => {
buffer.push_str(&val.to_string());
dict.insert(IntermediateKey::F64(*val), intermediate_entry);
}
.map(|el| el.dictionary())
.unwrap_or_else(|| &fallback_dict);
let mut buffer = Vec::new();
// special case for missing key
if let Some(index) = entries.iter().position(|value| value.0 == u64::MAX) {
let entry = entries[index];
let intermediate_entry = into_intermediate_bucket_entry(entry.0, entry.1)?;
let missing_key = self
.req
.missing
.as_ref()
.expect("Found placeholder term_id but `missing` is None");
match missing_key {
Key::Str(missing) => {
buffer.clear();
buffer.extend_from_slice(missing.as_bytes());
dict.insert(
IntermediateKey::Str(
String::from_utf8(buffer.to_vec())
.expect("could not convert to String"),
),
intermediate_entry,
);
}
} else {
if !term_dict.ord_to_str(term_id, &mut buffer)? {
return Err(TantivyError::InternalError(format!(
"Couldn't find term_id {term_id} in dict"
)));
Key::F64(val) => {
dict.insert(IntermediateKey::F64(*val), intermediate_entry);
}
dict.insert(IntermediateKey::Str(buffer.to_string()), intermediate_entry);
}
entries.swap_remove(index);
}
// Sort by term ord
entries.sort_unstable_by_key(|bucket| bucket.0);
let mut idx = 0;
term_dict.sorted_ords_to_term_cb(
entries.iter().map(|(term_id, _)| *term_id),
|term| {
let entry = entries[idx];
let intermediate_entry = into_intermediate_bucket_entry(entry.0, entry.1)
.map_err(|err| io::Error::new(io::ErrorKind::Other, err))?;
dict.insert(
IntermediateKey::Str(
String::from_utf8(term.to_vec()).expect("could not convert to String"),
),
intermediate_entry,
);
idx += 1;
Ok(())
},
)?;
if self.req.min_doc_count == 0 {
// TODO: Handle rev streaming for descending sorting by keys
let mut stream = term_dict.dictionary().stream()?;
let mut stream = term_dict.stream()?;
let empty_sub_aggregation = IntermediateAggregationResults::empty_from_req(
agg_with_accessor.agg.sub_aggregation(),
);
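The core of this hunk: instead of one random-access `ord_to_str` probe per bucket, the entries are first sorted by term ordinal so a single forward pass over the dictionary (`sorted_ords_to_term_cb`) can resolve them all. A schematic sketch of the access pattern with a stand-in dictionary that only supports forward seeks (hypothetical types, not tantivy's API):

```rust
/// Stand-in for a sorted term dictionary that only supports forward streaming.
struct DictStream<'a> {
    terms: &'a [&'a str],
    pos: usize,
}

impl<'a> DictStream<'a> {
    /// Advance to `ord`; valid only for non-decreasing ordinals.
    fn seek(&mut self, ord: usize) -> &'a str {
        assert!(ord >= self.pos, "stream can only move forward");
        self.pos = ord;
        self.terms[ord]
    }
}

/// Resolve (term_ordinal, doc_count) buckets in one streaming pass.
fn resolve_terms(dict: &mut DictStream, entries: &mut Vec<(u64, u64)>) -> Vec<(String, u64)> {
    // Sorting by ordinal lets a single forward pass over the dictionary
    // resolve every bucket, instead of one random probe per bucket.
    entries.sort_unstable_by_key(|&(ord, _)| ord);
    entries
        .iter()
        .map(|&(ord, count)| (dict.seek(ord as usize).to_string(), count))
        .collect()
}

fn main() {
    let terms = ["apple", "banana", "cherry"];
    let mut dict = DictStream { terms: &terms, pos: 0 };
    let mut entries = vec![(2u64, 5u64), (0, 3)];
    let resolved = resolve_terms(&mut dict, &mut entries);
    assert_eq!(resolved[0], ("apple".to_string(), 3));
    assert_eq!(resolved[1], ("cherry".to_string(), 5));
}
```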


@@ -19,13 +19,14 @@ use super::bucket::{
GetDocCount, Order, OrderTarget, RangeAggregation, TermsAggregation,
};
use super::metric::{
IntermediateAverage, IntermediateCount, IntermediateMax, IntermediateMin, IntermediateStats,
IntermediateSum, PercentilesCollector, TopHitsTopNComputer,
IntermediateAverage, IntermediateCount, IntermediateExtendedStats, IntermediateMax,
IntermediateMin, IntermediateStats, IntermediateSum, PercentilesCollector, TopHitsTopNComputer,
};
use super::segment_agg_result::AggregationLimits;
use super::{format_date, AggregationError, Key, SerializedKey};
use crate::aggregation::agg_result::{AggregationResults, BucketEntries, BucketEntry};
use crate::aggregation::bucket::TermsAggregationInternal;
use crate::aggregation::metric::CardinalityCollector;
use crate::TantivyError;
/// Contains the intermediate aggregation result, which is optimized to be merged with other
@@ -215,6 +216,9 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
Stats(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Stats(
IntermediateStats::default(),
)),
ExtendedStats(_) => IntermediateAggregationResult::Metric(
IntermediateMetricResult::ExtendedStats(IntermediateExtendedStats::default()),
),
Sum(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Sum(
IntermediateSum::default(),
)),
@@ -222,7 +226,10 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
IntermediateMetricResult::Percentiles(PercentilesCollector::default()),
),
TopHits(ref req) => IntermediateAggregationResult::Metric(
IntermediateMetricResult::TopHits(TopHitsTopNComputer::new(req.clone())),
IntermediateMetricResult::TopHits(TopHitsTopNComputer::new(req)),
),
Cardinality(_) => IntermediateAggregationResult::Metric(
IntermediateMetricResult::Cardinality(CardinalityCollector::default()),
),
}
}
@@ -282,10 +289,14 @@ pub enum IntermediateMetricResult {
Min(IntermediateMin),
/// Intermediate stats result.
Stats(IntermediateStats),
/// Intermediate extended stats result.
ExtendedStats(IntermediateExtendedStats),
/// Intermediate sum result.
Sum(IntermediateSum),
/// Intermediate top_hits result
TopHits(TopHitsTopNComputer),
/// Intermediate cardinality result
Cardinality(CardinalityCollector),
}
impl IntermediateMetricResult {
@@ -306,6 +317,9 @@ impl IntermediateMetricResult {
IntermediateMetricResult::Stats(intermediate_stats) => {
MetricResult::Stats(intermediate_stats.finalize())
}
IntermediateMetricResult::ExtendedStats(intermediate_stats) => {
MetricResult::ExtendedStats(intermediate_stats.finalize())
}
IntermediateMetricResult::Sum(intermediate_sum) => {
MetricResult::Sum(intermediate_sum.finalize().into())
}
@@ -316,6 +330,9 @@ impl IntermediateMetricResult {
IntermediateMetricResult::TopHits(top_hits) => {
MetricResult::TopHits(top_hits.into_final_result())
}
IntermediateMetricResult::Cardinality(cardinality) => {
MetricResult::Cardinality(cardinality.finalize().into())
}
}
}
@@ -346,6 +363,12 @@ impl IntermediateMetricResult {
) => {
stats_left.merge_fruits(stats_right);
}
(
IntermediateMetricResult::ExtendedStats(extended_stats_left),
IntermediateMetricResult::ExtendedStats(extended_stats_right),
) => {
extended_stats_left.merge_fruits(extended_stats_right);
}
(IntermediateMetricResult::Sum(sum_left), IntermediateMetricResult::Sum(sum_right)) => {
sum_left.merge_fruits(sum_right);
}
@@ -358,6 +381,12 @@ impl IntermediateMetricResult {
(IntermediateMetricResult::TopHits(left), IntermediateMetricResult::TopHits(right)) => {
left.merge_fruits(right)?;
}
(
IntermediateMetricResult::Cardinality(left),
IntermediateMetricResult::Cardinality(right),
) => {
left.merge_fruits(right)?;
}
_ => {
panic!("incompatible fruit types in tree or missing merge_fruits handler");
}
@@ -570,6 +599,7 @@ impl IntermediateTermBucketResult {
let val = if key { "true" } else { "false" };
Some(val.to_string())
}
IntermediateKey::F64(val) => Some(val.to_string()),
_ => None,
};
Ok(BucketEntry {


@@ -0,0 +1,466 @@
use std::collections::hash_map::DefaultHasher;
use std::hash::{BuildHasher, Hasher};
use columnar::column_values::CompactSpaceU64Accessor;
use columnar::Dictionary;
use common::f64_to_u64;
use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
use rustc_hash::FxHashSet;
use serde::{Deserialize, Serialize};
use crate::aggregation::agg_req_with_accessor::{
AggregationWithAccessor, AggregationsWithAccessor,
};
use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateMetricResult,
};
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::*;
use crate::TantivyError;
#[derive(Clone, Debug, Serialize, Deserialize)]
struct BuildSaltedHasher {
salt: u8,
}
impl BuildHasher for BuildSaltedHasher {
type Hasher = DefaultHasher;
fn build_hasher(&self) -> Self::Hasher {
let mut hasher = DefaultHasher::new();
hasher.write_u8(self.salt);
hasher
}
}
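The hasher above is seeded with a salt byte before anything else is hashed; `SegmentCardinalityCollector::from_req` passes the column type as the salt, so identical raw values from different column types land in different HyperLogLog registers. A self-contained illustration (duplicating the small builder from the diff; `salted_hash` is a hypothetical helper):

```rust
use std::collections::hash_map::DefaultHasher;
use std::hash::{BuildHasher, Hash, Hasher};

struct BuildSaltedHasher {
    salt: u8,
}

impl BuildHasher for BuildSaltedHasher {
    type Hasher = DefaultHasher;

    fn build_hasher(&self) -> DefaultHasher {
        let mut hasher = DefaultHasher::new();
        hasher.write_u8(self.salt); // salt first, then the value
        hasher
    }
}

fn salted_hash(salt: u8, val: u64) -> u64 {
    let mut hasher = BuildSaltedHasher { salt }.build_hasher();
    val.hash(&mut hasher);
    hasher.finish()
}

fn main() {
    // Same value, different salts (e.g. column types) => different hashes.
    assert_ne!(salted_hash(0, 42), salted_hash(1, 42));
}
```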
/// # Cardinality
///
/// The cardinality aggregation allows for computing an estimate
/// of the number of different values in a data set based on the
/// HyperLogLog++ algorithm. This is particularly useful for understanding the
/// uniqueness of values in a large dataset where counting each unique value
/// individually would be computationally expensive.
///
/// For example, you might use a cardinality aggregation to estimate the number
/// of unique visitors to a website by aggregating on a field that contains
/// user IDs or session IDs.
///
/// To use the cardinality aggregation, you'll need to provide a field to
/// aggregate on. The following example demonstrates a request for the cardinality
/// of the "user_id" field:
///
/// ```JSON
/// {
/// "cardinality": {
/// "field": "user_id"
/// }
/// }
/// ```
///
/// This request will return an estimate of the number of unique values in the
/// "user_id" field.
///
/// ## Missing Values
///
/// The `missing` parameter defines how documents that are missing a value should be treated.
/// By default, documents without a value for the specified field are ignored. However, you can
/// specify a default value for these documents using the `missing` parameter. This can be useful
/// when you want to include documents with missing values in the aggregation.
///
/// For example, the following request treats documents with missing values in the "user_id"
/// field as if they had a value of "unknown":
///
/// ```JSON
/// {
/// "cardinality": {
/// "field": "user_id",
/// "missing": "unknown"
/// }
/// }
/// ```
///
/// # Estimation Accuracy
///
/// The cardinality aggregation provides an approximate count, which is usually
/// accurate within a small error range. This trade-off allows for efficient
/// computation even on very large datasets.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct CardinalityAggregationReq {
/// The field name to compute the cardinality on.
pub field: String,
/// The missing parameter defines how documents that are missing a value should be treated.
/// By default they will be ignored but it is also possible to treat them as if they had a
/// value. Examples in JSON format:
/// { "field": "my_numbers", "missing": "10.0" }
#[serde(skip_serializing_if = "Option::is_none", default)]
pub missing: Option<Key>,
}
impl CardinalityAggregationReq {
/// Creates a new [`CardinalityAggregationReq`] instance from a field name.
pub fn from_field_name(field_name: String) -> Self {
Self {
field: field_name,
missing: None,
}
}
/// Returns the field name the aggregation is computed on.
pub fn field_name(&self) -> &str {
&self.field
}
}
#[derive(Clone, Debug, PartialEq)]
pub(crate) struct SegmentCardinalityCollector {
cardinality: CardinalityCollector,
entries: FxHashSet<u64>,
column_type: ColumnType,
accessor_idx: usize,
missing: Option<Key>,
}
impl SegmentCardinalityCollector {
pub fn from_req(column_type: ColumnType, accessor_idx: usize, missing: &Option<Key>) -> Self {
Self {
cardinality: CardinalityCollector::new(column_type as u8),
entries: Default::default(),
column_type,
accessor_idx,
missing: missing.clone(),
}
}
fn fetch_block_with_field(
&mut self,
docs: &[crate::DocId],
agg_accessor: &mut AggregationWithAccessor,
) {
if let Some(missing) = agg_accessor.missing_value_for_accessor {
agg_accessor.column_block_accessor.fetch_block_with_missing(
docs,
&agg_accessor.accessor,
missing,
);
} else {
agg_accessor
.column_block_accessor
.fetch_block(docs, &agg_accessor.accessor);
}
}
fn into_intermediate_metric_result(
mut self,
agg_with_accessor: &AggregationWithAccessor,
) -> crate::Result<IntermediateMetricResult> {
if self.column_type == ColumnType::Str {
let fallback_dict = Dictionary::empty();
let dict = agg_with_accessor
.str_dict_column
.as_ref()
.map(|el| el.dictionary())
.unwrap_or_else(|| &fallback_dict);
let mut has_missing = false;
// TODO: replace FxHashSet with something that allows iterating in order
// (e.g. sparse bitvec)
let mut term_ids = Vec::new();
for term_ord in self.entries.into_iter() {
if term_ord == u64::MAX {
has_missing = true;
} else {
// we can reasonably exclude values above u32::MAX
term_ids.push(term_ord as u32);
}
}
term_ids.sort_unstable();
dict.sorted_ords_to_term_cb(term_ids.iter().map(|term| *term as u64), |term| {
self.cardinality.sketch.insert_any(&term);
Ok(())
})?;
if has_missing {
let missing_key = self
.missing
.as_ref()
.expect("Found placeholder term_ord but `missing` is None");
match missing_key {
Key::Str(missing) => {
self.cardinality.sketch.insert_any(&missing);
}
Key::F64(val) => {
let val = f64_to_u64(*val);
self.cardinality.sketch.insert_any(&val);
}
}
}
}
Ok(IntermediateMetricResult::Cardinality(self.cardinality))
}
}
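`common::f64_to_u64`, used above for the missing key, maps floats to integers before hashing. The standard order-preserving trick flips the sign bit of positive values and all bits of negative ones; a sketch of that common technique (assumed to match `common`'s implementation, which this diff does not show):

```rust
/// Order-preserving f64 -> u64: flip the sign bit of positives, all bits of
/// negatives, so unsigned integer order matches float order.
fn f64_to_u64_sketch(val: f64) -> u64 {
    let bits = val.to_bits();
    if bits & (1 << 63) == 0 {
        bits ^ (1 << 63) // positive: flip sign bit
    } else {
        !bits // negative: flip everything
    }
}

fn main() {
    assert!(f64_to_u64_sketch(-1.5) < f64_to_u64_sketch(0.0));
    assert!(f64_to_u64_sketch(0.0) < f64_to_u64_sketch(2.5));
}
```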
impl SegmentAggregationCollector for SegmentCardinalityCollector {
fn add_intermediate_aggregation_result(
self: Box<Self>,
agg_with_accessor: &AggregationsWithAccessor,
results: &mut IntermediateAggregationResults,
) -> crate::Result<()> {
let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
let agg_with_accessor = &agg_with_accessor.aggs.values[self.accessor_idx];
let intermediate_result = self.into_intermediate_metric_result(agg_with_accessor)?;
results.push(
name,
IntermediateAggregationResult::Metric(intermediate_result),
)?;
Ok(())
}
fn collect(
&mut self,
doc: crate::DocId,
agg_with_accessor: &mut AggregationsWithAccessor,
) -> crate::Result<()> {
self.collect_block(&[doc], agg_with_accessor)
}
fn collect_block(
&mut self,
docs: &[crate::DocId],
agg_with_accessor: &mut AggregationsWithAccessor,
) -> crate::Result<()> {
let bucket_agg_accessor = &mut agg_with_accessor.aggs.values[self.accessor_idx];
self.fetch_block_with_field(docs, bucket_agg_accessor);
let col_block_accessor = &bucket_agg_accessor.column_block_accessor;
if self.column_type == ColumnType::Str {
for term_ord in col_block_accessor.iter_vals() {
self.entries.insert(term_ord);
}
} else if self.column_type == ColumnType::IpAddr {
let compact_space_accessor = bucket_agg_accessor
.accessor
.values
.clone()
.downcast_arc::<CompactSpaceU64Accessor>()
.map_err(|_| {
TantivyError::AggregationError(
crate::aggregation::AggregationError::InternalError(
"Type mismatch: Could not downcast to CompactSpaceU64Accessor"
.to_string(),
),
)
})?;
for val in col_block_accessor.iter_vals() {
let val: u128 = compact_space_accessor.compact_to_u128(val as u32);
self.cardinality.sketch.insert_any(&val);
}
} else {
for val in col_block_accessor.iter_vals() {
self.cardinality.sketch.insert_any(&val);
}
}
Ok(())
}
}
#[derive(Clone, Debug, Serialize, Deserialize)]
/// The cardinality collector used during segment collection and for merging results.
pub struct CardinalityCollector {
sketch: HyperLogLogPlus<u64, BuildSaltedHasher>,
}
impl Default for CardinalityCollector {
fn default() -> Self {
Self::new(0)
}
}
impl PartialEq for CardinalityCollector {
fn eq(&self, _other: &Self) -> bool {
false
}
}
impl CardinalityCollector {
/// Compute the final cardinality estimate.
pub fn finalize(self) -> Option<f64> {
Some(self.sketch.clone().count().trunc())
}
fn new(salt: u8) -> Self {
Self {
sketch: HyperLogLogPlus::new(16, BuildSaltedHasher { salt }).unwrap(),
}
}
pub(crate) fn merge_fruits(&mut self, right: CardinalityCollector) -> crate::Result<()> {
self.sketch.merge(&right.sketch).map_err(|err| {
TantivyError::AggregationError(AggregationError::InternalError(format!(
"Error while merging cardinality {err:?}"
)))
})?;
Ok(())
}
}
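`CardinalityCollector::new` fixes the HyperLogLog++ precision at 16, i.e. 2^16 registers per sketch. The classic HLL accuracy estimate is a relative error of about 1.04/√m, which works out to roughly 0.4% here; a back-of-the-envelope check:

```rust
fn main() {
    let precision: u32 = 16;            // as in HyperLogLogPlus::new(16, ..)
    let m = (1u64 << precision) as f64; // number of registers
    let rel_err = 1.04 / m.sqrt();      // classic HLL error estimate
    println!("typical relative error: {:.2}%", rel_err * 100.0); // ~0.41%
}
```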
#[cfg(test)]
mod tests {
use std::net::IpAddr;
use std::str::FromStr;
use columnar::MonotonicallyMappableToU64;
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::tests::{exec_request, get_test_index_from_terms};
use crate::schema::{IntoIpv6Addr, Schema, FAST};
use crate::Index;
#[test]
fn cardinality_aggregation_test_empty_index() -> crate::Result<()> {
let values = vec![];
let index = get_test_index_from_terms(false, &values)?;
let agg_req: Aggregations = serde_json::from_value(json!({
"cardinality": {
"cardinality": {
"field": "string_id",
}
},
}))
.unwrap();
let res = exec_request(agg_req, &index)?;
assert_eq!(res["cardinality"]["value"], 0.0);
Ok(())
}
#[test]
fn cardinality_aggregation_test_single_segment() -> crate::Result<()> {
cardinality_aggregation_test_merge_segment(true)
}
#[test]
fn cardinality_aggregation_test() -> crate::Result<()> {
cardinality_aggregation_test_merge_segment(false)
}
fn cardinality_aggregation_test_merge_segment(merge_segments: bool) -> crate::Result<()> {
let segment_and_terms = vec![
vec!["terma"],
vec!["termb"],
vec!["termc"],
vec!["terma"],
vec!["terma"],
vec!["terma"],
vec!["termb"],
vec!["terma"],
];
let index = get_test_index_from_terms(merge_segments, &segment_and_terms)?;
let agg_req: Aggregations = serde_json::from_value(json!({
"cardinality": {
"cardinality": {
"field": "string_id",
}
},
}))
.unwrap();
let res = exec_request(agg_req, &index)?;
assert_eq!(res["cardinality"]["value"], 3.0);
Ok(())
}
#[test]
fn cardinality_aggregation_u64() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let id_field = schema_builder.add_u64_field("id", FAST);
let index = Index::create_in_ram(schema_builder.build());
{
let mut writer = index.writer_for_tests()?;
writer.add_document(doc!(id_field => 1u64))?;
writer.add_document(doc!(id_field => 2u64))?;
writer.add_document(doc!(id_field => 3u64))?;
writer.add_document(doc!())?;
writer.commit()?;
}
let agg_req: Aggregations = serde_json::from_value(json!({
"cardinality": {
"cardinality": {
"field": "id",
"missing": 0u64
},
}
}))
.unwrap();
let res = exec_request(agg_req, &index)?;
assert_eq!(res["cardinality"]["value"], 4.0);
Ok(())
}
#[test]
fn cardinality_aggregation_ip_addr() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_ip_addr_field("ip_field", FAST);
let index = Index::create_in_ram(schema_builder.build());
{
let mut writer = index.writer_for_tests()?;
// IpV6 loopback
writer.add_document(doc!(field=>IpAddr::from_str("::1").unwrap().into_ipv6_addr()))?;
writer.add_document(doc!(field=>IpAddr::from_str("::1").unwrap().into_ipv6_addr()))?;
// IpV4
writer.add_document(
doc!(field=>IpAddr::from_str("127.0.0.1").unwrap().into_ipv6_addr()),
)?;
writer.commit()?;
}
let agg_req: Aggregations = serde_json::from_value(json!({
"cardinality": {
"cardinality": {
"field": "ip_field"
},
}
}))
.unwrap();
let res = exec_request(agg_req, &index)?;
assert_eq!(res["cardinality"]["value"], 2.0);
Ok(())
}
#[test]
fn cardinality_aggregation_json() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_json_field("json", FAST);
let index = Index::create_in_ram(schema_builder.build());
{
let mut writer = index.writer_for_tests()?;
writer.add_document(doc!(field => json!({"value": false})))?;
writer.add_document(doc!(field => json!({"value": true})))?;
writer.add_document(doc!(field => json!({"value": i64::from_u64(0u64)})))?;
writer.add_document(doc!(field => json!({"value": i64::from_u64(1u64)})))?;
writer.commit()?;
}
let agg_req: Aggregations = serde_json::from_value(json!({
"cardinality": {
"cardinality": {
"field": "json.value"
},
}
}))
.unwrap();
let res = exec_request(agg_req, &index)?;
assert_eq!(res["cardinality"]["value"], 4.0);
Ok(())
}
}

File diff suppressed because it is too large.

@@ -17,7 +17,9 @@
//! - [Percentiles](PercentilesAggregationReq)
mod average;
mod cardinality;
mod count;
mod extended_stats;
mod max;
mod min;
mod percentiles;
@@ -28,7 +30,9 @@ mod top_hits;
use std::collections::HashMap;
pub use average::*;
pub use cardinality::*;
pub use count::*;
pub use extended_stats::*;
pub use max::*;
pub use min::*;
pub use percentiles::*;


@@ -1,3 +1,5 @@
use std::fmt::Debug;
use serde::{Deserialize, Serialize};
use super::*;
@@ -85,13 +87,15 @@ impl Stats {
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct IntermediateStats {
/// The number of extracted values.
count: u64,
pub(crate) count: u64,
/// The sum of the extracted values.
sum: f64,
pub(crate) sum: f64,
/// delta for sum needed for [Kahan algorithm for summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm)
pub(crate) delta: f64,
/// The min value.
min: f64,
pub(crate) min: f64,
/// The max value.
max: f64,
pub(crate) max: f64,
}
impl Default for IntermediateStats {
@@ -99,6 +103,7 @@ impl Default for IntermediateStats {
Self {
count: 0,
sum: 0.0,
delta: 0.0,
min: f64::MAX,
max: f64::MIN,
}
@@ -109,7 +114,13 @@ impl IntermediateStats {
/// Merges the other stats intermediate result into self.
pub fn merge_fruits(&mut self, other: IntermediateStats) {
self.count += other.count;
self.sum += other.sum;
// kahan algorithm for sum
let y = other.sum - (self.delta + other.delta);
let t = self.sum + y;
self.delta = (t - self.sum) - y;
self.sum = t;
self.min = self.min.min(other.min);
self.max = self.max.max(other.max);
}
@@ -141,9 +152,15 @@ impl IntermediateStats {
}
#[inline]
fn collect(&mut self, value: f64) {
pub(in crate::aggregation::metric) fn collect(&mut self, value: f64) {
self.count += 1;
self.sum += value;
// kahan algorithm for sum
let y = value - self.delta;
let t = self.sum + y;
self.delta = (t - self.sum) - y;
self.sum = t;
self.min = self.min.min(value);
self.max = self.max.max(value);
}
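Both `merge_fruits` and `collect` above now use Kahan (compensated) summation: `delta` accumulates the low-order bits that a plain `sum += value` would round away. A standalone sketch of the algorithm, with an example of the drift it prevents:

```rust
/// Kahan compensated summation, mirroring the delta bookkeeping added above.
fn kahan_sum(values: impl Iterator<Item = f64>) -> f64 {
    let (mut sum, mut delta) = (0.0f64, 0.0f64);
    for value in values {
        let y = value - delta;
        let t = sum + y;
        delta = (t - sum) - y; // the part of y that did not make it into t
        sum = t;
    }
    sum
}

fn main() {
    // A large base plus many tiny increments: naive summation drifts,
    // compensated summation stays close to 1e9 + 1e6.
    let values = std::iter::once(1e9).chain(std::iter::repeat(0.1).take(10_000_000));
    let kahan = kahan_sum(values.clone());
    let naive: f64 = values.sum();
    println!("naive: {naive}, kahan: {kahan}");
}
```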
@@ -288,7 +305,6 @@ impl SegmentAggregationCollector for SegmentStatsCollector {
#[cfg(test)]
mod tests {
use serde_json::Value;
use crate::aggregation::agg_req::{Aggregation, Aggregations};


@@ -1,7 +1,7 @@
use std::collections::HashMap;
use std::net::Ipv6Addr;
use columnar::{ColumnarReader, DynamicColumn};
use columnar::{Column, ColumnType, ColumnarReader, DynamicColumn};
use common::json_path_writer::JSON_PATH_SEGMENT_SEP_STR;
use common::DateTime;
use regex::Regex;
@@ -89,7 +89,7 @@ use crate::{DocAddress, DocId, SegmentOrdinal};
/// }
/// ```
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
pub struct TopHitsAggregation {
pub struct TopHitsAggregationReq {
sort: Vec<KeyOrder>,
size: usize,
from: Option<usize>,
@@ -164,7 +164,7 @@ fn unsupported_err(parameter: &str) -> crate::Result<()> {
))
}
impl TopHitsAggregation {
impl TopHitsAggregationReq {
/// Validate and resolve field retrieval parameters
pub fn validate_and_resolve_field_names(
&mut self,
@@ -431,7 +431,7 @@ impl Eq for DocSortValuesAndFields {}
/// The TopHitsCollector used for collecting over segments and merging results.
#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct TopHitsTopNComputer {
req: TopHitsAggregation,
req: TopHitsAggregationReq,
top_n: TopNComputer<DocSortValuesAndFields, DocAddress, false>,
}
@@ -443,10 +443,10 @@ impl std::cmp::PartialEq for TopHitsTopNComputer {
impl TopHitsTopNComputer {
/// Create a new TopHitsCollector
pub fn new(req: TopHitsAggregation) -> Self {
pub fn new(req: &TopHitsAggregationReq) -> Self {
Self {
top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
req,
req: req.clone(),
}
}
@@ -491,18 +491,16 @@ impl TopHitsTopNComputer {
pub(crate) struct TopHitsSegmentCollector {
segment_ordinal: SegmentOrdinal,
accessor_idx: usize,
req: TopHitsAggregation,
top_n: TopNComputer<Vec<DocValueAndOrder>, DocAddress, false>,
}
impl TopHitsSegmentCollector {
pub fn from_req(
req: &TopHitsAggregation,
req: &TopHitsAggregationReq,
accessor_idx: usize,
segment_ordinal: SegmentOrdinal,
) -> Self {
Self {
req: req.clone(),
top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
segment_ordinal,
accessor_idx,
@@ -511,14 +509,13 @@ impl TopHitsSegmentCollector {
fn into_top_hits_collector(
self,
value_accessors: &HashMap<String, Vec<DynamicColumn>>,
req: &TopHitsAggregationReq,
) -> TopHitsTopNComputer {
let mut top_hits_computer = TopHitsTopNComputer::new(self.req.clone());
let mut top_hits_computer = TopHitsTopNComputer::new(req);
let top_results = self.top_n.into_vec();
for res in top_results {
let doc_value_fields = self
.req
.get_document_field_data(value_accessors, res.doc.doc_id);
let doc_value_fields = req.get_document_field_data(value_accessors, res.doc.doc_id);
top_hits_computer.collect(
DocSortValuesAndFields {
sorts: res.feature,
@@ -530,34 +527,15 @@ impl TopHitsSegmentCollector {
top_hits_computer
}
}
impl SegmentAggregationCollector for TopHitsSegmentCollector {
fn add_intermediate_aggregation_result(
self: Box<Self>,
agg_with_accessor: &crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
results: &mut crate::aggregation::intermediate_agg_result::IntermediateAggregationResults,
) -> crate::Result<()> {
let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
let value_accessors = &agg_with_accessor.aggs.values[self.accessor_idx].value_accessors;
let intermediate_result =
IntermediateMetricResult::TopHits(self.into_top_hits_collector(value_accessors));
results.push(
name,
IntermediateAggregationResult::Metric(intermediate_result),
)
}
fn collect(
/// TODO add a specialized variant for a single sort field
fn collect_with(
&mut self,
doc_id: crate::DocId,
agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
req: &TopHitsAggregationReq,
accessors: &[(Column<u64>, ColumnType)],
) -> crate::Result<()> {
let accessors = &agg_with_accessor.aggs.values[self.accessor_idx].accessors;
let sorts: Vec<DocValueAndOrder> = self
.req
let sorts: Vec<DocValueAndOrder> = req
.sort
.iter()
.enumerate()
@@ -582,15 +560,62 @@ impl SegmentAggregationCollector for TopHitsSegmentCollector {
);
Ok(())
}
}
impl SegmentAggregationCollector for TopHitsSegmentCollector {
fn add_intermediate_aggregation_result(
self: Box<Self>,
agg_with_accessor: &crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
results: &mut crate::aggregation::intermediate_agg_result::IntermediateAggregationResults,
) -> crate::Result<()> {
let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
let value_accessors = &agg_with_accessor.aggs.values[self.accessor_idx].value_accessors;
let tophits_req = &agg_with_accessor.aggs.values[self.accessor_idx]
.agg
.agg
.as_top_hits()
.expect("aggregation request must be of type top hits");
let intermediate_result = IntermediateMetricResult::TopHits(
self.into_top_hits_collector(value_accessors, tophits_req),
);
results.push(
name,
IntermediateAggregationResult::Metric(intermediate_result),
)
}
/// TODO: Consider a caching layer to reduce the call overhead
fn collect(
&mut self,
doc_id: crate::DocId,
agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
) -> crate::Result<()> {
let tophits_req = &agg_with_accessor.aggs.values[self.accessor_idx]
.agg
.agg
.as_top_hits()
.expect("aggregation request must be of type top hits");
let accessors = &agg_with_accessor.aggs.values[self.accessor_idx].accessors;
self.collect_with(doc_id, tophits_req, accessors)?;
Ok(())
}
fn collect_block(
&mut self,
docs: &[crate::DocId],
agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
) -> crate::Result<()> {
let tophits_req = &agg_with_accessor.aggs.values[self.accessor_idx]
.agg
.agg
.as_top_hits()
.expect("aggregation request must be of type top hits");
let accessors = &agg_with_accessor.aggs.values[self.accessor_idx].accessors;
// TODO: Consider getting fields with the column block accessor.
for doc in docs {
self.collect(*doc, agg_with_accessor)?;
self.collect_with(*doc, tophits_req, accessors)?;
}
Ok(())
}


@@ -44,11 +44,14 @@
//! - [Metric](metric)
//! - [Average](metric::AverageAggregation)
//! - [Stats](metric::StatsAggregation)
//! - [ExtendedStats](metric::ExtendedStatsAggregation)
//! - [Min](metric::MinAggregation)
//! - [Max](metric::MaxAggregation)
//! - [Sum](metric::SumAggregation)
//! - [Count](metric::CountAggregation)
//! - [Percentiles](metric::PercentilesAggregationReq)
//! - [Cardinality](metric::CardinalityAggregationReq)
//! - [TopHits](metric::TopHitsAggregationReq)
//!
//! # Example
//! Compute the average metric, by building [`agg_req::Aggregations`], which is built from an


@@ -11,12 +11,15 @@ use super::agg_req_with_accessor::{AggregationWithAccessor, AggregationsWithAcce
use super::bucket::{SegmentHistogramCollector, SegmentRangeCollector, SegmentTermCollector};
use super::intermediate_agg_result::IntermediateAggregationResults;
use super::metric::{
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation,
AverageAggregation, CountAggregation, ExtendedStatsAggregation, MaxAggregation, MinAggregation,
SegmentPercentilesCollector, SegmentStatsCollector, SegmentStatsType, StatsAggregation,
SumAggregation,
};
use crate::aggregation::bucket::TermMissingAgg;
use crate::aggregation::metric::TopHitsSegmentCollector;
use crate::aggregation::metric::{
CardinalityAggregationReq, SegmentCardinalityCollector, SegmentExtendedStatsCollector,
TopHitsSegmentCollector,
};
pub(crate) trait SegmentAggregationCollector: CollectorClone + Debug {
fn add_intermediate_aggregation_result(
@@ -148,6 +151,9 @@ pub(crate) fn build_single_agg_segment_collector(
accessor_idx,
*missing,
))),
ExtendedStats(ExtendedStatsAggregation { missing, sigma, .. }) => Ok(Box::new(
SegmentExtendedStatsCollector::from_req(req.field_type, *sigma, accessor_idx, *missing),
)),
Sum(SumAggregation { missing, .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
req.field_type,
SegmentStatsType::Sum,
@@ -166,6 +172,9 @@ pub(crate) fn build_single_agg_segment_collector(
accessor_idx,
req.segment_ordinal,
))),
Cardinality(CardinalityAggregationReq { missing, .. }) => Ok(Box::new(
SegmentCardinalityCollector::from_req(req.field_type, accessor_idx, missing),
)),
}
}


@@ -871,7 +871,10 @@ mod tests {
use crate::schema::{Field, Schema, FAST, STORED, TEXT};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
use crate::{DateTime, DocAddress, DocId, Index, IndexWriter, Order, Score, SegmentReader};
use crate::{
assert_nearly_equals, DateTime, DocAddress, DocId, Index, IndexWriter, Order, Score,
SegmentReader,
};
fn make_index() -> crate::Result<Index> {
let mut schema_builder = Schema::builder();


@@ -195,7 +195,7 @@ mod tests {
let (tx, rx) = crossbeam_channel::bounded::<()>(0);
let rx = Arc::new(rx);
let executor = Executor::multi_thread(3, "search-test").unwrap();
for i in 0..1000 {
for _ in 0..1000 {
let counter_clone: Arc<AtomicU64> = counter.clone();
let other_counter_clone: Arc<AtomicU64> = other_counter.clone();
@@ -203,18 +203,18 @@ mod tests {
let rx_clone2 = rx.clone();
let fut = executor.spawn_blocking(move || {
counter_clone.fetch_add(1, Ordering::SeqCst);
let () = rx_clone.recv().unwrap();
let _ = rx_clone.recv();
});
futures.push(fut);
let other_fut = executor.spawn_blocking(move || {
other_counter_clone.fetch_add(1, Ordering::SeqCst);
let () = rx_clone2.recv().unwrap();
let _ = rx_clone2.recv();
});
other_futures.push(other_fut);
}
// We execute 100 futures.
for i in 0..100 {
for _ in 0..100 {
tx.send(()).unwrap();
}
@@ -226,7 +226,7 @@ mod tests {
drop(other_futures);
// We execute 100 futures.
for i in 0..100 {
for _ in 0..100 {
tx.send(()).unwrap();
}


@@ -1,4 +1,4 @@
use common::json_path_writer::JSON_PATH_SEGMENT_SEP;
use common::json_path_writer::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP};
use common::{replace_in_place, JsonPathWriter};
use rustc_hash::FxHashMap;
@@ -83,6 +83,9 @@ fn index_json_object<'a, V: Value<'a>>(
positions_per_path: &mut IndexingPositionsPerPath,
) {
for (json_path_segment, json_value_visitor) in json_visitor {
if json_path_segment.as_bytes().contains(&JSON_END_OF_PATH) {
continue;
}
json_path_writer.push(json_path_segment);
index_json_value(
doc,


@@ -127,7 +127,7 @@ mod tests {
fast_field_writers
.add_document(&doc!(*FIELD=>2u64))
.unwrap();
fast_field_writers.serialize(&mut write, None).unwrap();
fast_field_writers.serialize(&mut write).unwrap();
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
@@ -178,7 +178,7 @@ mod tests {
fast_field_writers
.add_document(&doc!(*FIELD=>215u64))
.unwrap();
fast_field_writers.serialize(&mut write, None).unwrap();
fast_field_writers.serialize(&mut write).unwrap();
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
@@ -211,7 +211,7 @@ mod tests {
.add_document(&doc!(*FIELD=>100_000u64))
.unwrap();
}
fast_field_writers.serialize(&mut write, None).unwrap();
fast_field_writers.serialize(&mut write).unwrap();
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
@@ -243,7 +243,7 @@ mod tests {
.add_document(&doc!(*FIELD=>5_000_000_000_000_000_000u64 + doc_id))
.unwrap();
}
fast_field_writers.serialize(&mut write, None).unwrap();
fast_field_writers.serialize(&mut write).unwrap();
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
@@ -276,7 +276,7 @@ mod tests {
doc.add_i64(i64_field, i);
fast_field_writers.add_document(&doc).unwrap();
}
fast_field_writers.serialize(&mut write, None).unwrap();
fast_field_writers.serialize(&mut write).unwrap();
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
@@ -315,7 +315,7 @@ mod tests {
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
let doc = TantivyDocument::default();
fast_field_writers.add_document(&doc).unwrap();
fast_field_writers.serialize(&mut write, None).unwrap();
fast_field_writers.serialize(&mut write).unwrap();
write.terminate().unwrap();
}
@@ -348,7 +348,7 @@ mod tests {
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
let doc = TantivyDocument::default();
fast_field_writers.add_document(&doc).unwrap();
fast_field_writers.serialize(&mut write, None).unwrap();
fast_field_writers.serialize(&mut write).unwrap();
write.terminate().unwrap();
}
@@ -385,7 +385,7 @@ mod tests {
for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x)).unwrap();
}
fast_field_writers.serialize(&mut write, None).unwrap();
fast_field_writers.serialize(&mut write).unwrap();
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
@@ -770,7 +770,7 @@ mod tests {
fast_field_writers
.add_document(&doc!(field=>false))
.unwrap();
fast_field_writers.serialize(&mut write, None).unwrap();
fast_field_writers.serialize(&mut write).unwrap();
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
@@ -802,7 +802,7 @@ mod tests {
.add_document(&doc!(field=>false))
.unwrap();
}
fast_field_writers.serialize(&mut write, None).unwrap();
fast_field_writers.serialize(&mut write).unwrap();
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
@@ -827,7 +827,7 @@ mod tests {
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
let doc = TantivyDocument::default();
fast_field_writers.add_document(&doc).unwrap();
fast_field_writers.serialize(&mut write, None).unwrap();
fast_field_writers.serialize(&mut write).unwrap();
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
@@ -855,7 +855,7 @@ mod tests {
for doc in docs {
fast_field_writers.add_document(doc).unwrap();
}
fast_field_writers.serialize(&mut write, None).unwrap();
fast_field_writers.serialize(&mut write).unwrap();
write.terminate().unwrap();
}
Ok(directory)


@@ -4,7 +4,6 @@ use columnar::{ColumnarWriter, NumericalValue};
use common::{DateTimePrecision, JsonPathWriter};
use tokenizer_api::Token;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::schema::document::{Document, ReferenceValue, ReferenceValueLeaf, Value};
use crate::schema::{value_type_to_column_type, Field, FieldType, Schema, Type};
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
@@ -106,16 +105,6 @@ impl FastFieldsWriter {
self.columnar_writer.mem_usage()
}
pub(crate) fn sort_order(
&self,
sort_field: &str,
num_docs: DocId,
reversed: bool,
) -> Vec<DocId> {
self.columnar_writer
.sort_order(sort_field, num_docs, reversed)
}
/// Indexes all of the fastfields of a new document.
pub fn add_document<D: Document>(&mut self, doc: &D) -> crate::Result<()> {
let doc_id = self.num_docs;
@@ -233,16 +222,9 @@ impl FastFieldsWriter {
/// Serializes all of the `FastFieldWriter`s by pushing them in
/// order to the fast field serializer.
pub fn serialize(
mut self,
wrt: &mut dyn io::Write,
doc_id_map_opt: Option<&DocIdMapping>,
) -> io::Result<()> {
pub fn serialize(mut self, wrt: &mut dyn io::Write) -> io::Result<()> {
let num_docs = self.num_docs;
let old_to_new_row_ids =
doc_id_map_opt.map(|doc_id_mapping| doc_id_mapping.old_to_new_ids());
self.columnar_writer
.serialize(num_docs, old_to_new_row_ids, wrt)?;
self.columnar_writer.serialize(num_docs, wrt)?;
Ok(())
}
}
@@ -392,7 +374,7 @@ mod tests {
}
let mut buffer = Vec::new();
columnar_writer
.serialize(json_docs.len() as DocId, None, &mut buffer)
.serialize(json_docs.len() as DocId, &mut buffer)
.unwrap();
ColumnarReader::open(buffer).unwrap()
}
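
For reference, a minimal sketch of the updated call shape, modeled on the crate-internal tests above (same `schema` and `write` setup as in those tests):

let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
fast_field_writers.add_document(&TantivyDocument::default()).unwrap();
// The Option<&DocIdMapping> parameter is gone: fast fields are now always
// serialized in insertion (doc id) order.
fast_field_writers.serialize(&mut write).unwrap();
write.terminate().unwrap();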

View File

@@ -77,7 +77,7 @@ mod tests {
let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA);
fieldnorm_writers.record(2u32, *TXT_FIELD, 5);
fieldnorm_writers.record(3u32, *TXT_FIELD, 3);
fieldnorm_writers.serialize(serializer, None)?;
fieldnorm_writers.serialize(serializer)?;
}
let file = directory.open_read(path)?;
{

View File

@@ -2,7 +2,6 @@ use std::cmp::Ordering;
use std::{io, iter};
use super::{fieldnorm_to_id, FieldNormsSerializer};
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::schema::{Field, Schema};
use crate::DocId;
@@ -92,11 +91,7 @@ impl FieldNormsWriter {
}
/// Serialize the seen fieldnorm values to the serializer for all fields.
pub fn serialize(
&self,
mut fieldnorms_serializer: FieldNormsSerializer,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
pub fn serialize(&self, mut fieldnorms_serializer: FieldNormsSerializer) -> io::Result<()> {
for (field, fieldnorms_buffer) in self.fieldnorms_buffers.iter().enumerate().filter_map(
|(field_id, fieldnorms_buffer_opt)| {
fieldnorms_buffer_opt.as_ref().map(|fieldnorms_buffer| {
@@ -104,12 +99,7 @@ impl FieldNormsWriter {
})
},
) {
if let Some(doc_id_map) = doc_id_map {
let remapped_fieldnorm_buffer = doc_id_map.remap(fieldnorms_buffer);
fieldnorms_serializer.serialize_field(field, &remapped_fieldnorm_buffer)?;
} else {
fieldnorms_serializer.serialize_field(field, fieldnorms_buffer)?;
}
fieldnorms_serializer.serialize_field(field, fieldnorms_buffer)?;
}
fieldnorms_serializer.close()?;
Ok(())
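
The fieldnorm writer gets the same simplification; a minimal sketch of the new call, modeled on the test shown earlier in this diff (`SCHEMA`, `TXT_FIELD` and `serializer` as set up there):

let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA);
fieldnorm_writers.record(2u32, *TXT_FIELD, 5);
// No Option<&DocIdMapping> branch anymore: each field's buffer is written as-is.
fieldnorm_writers.serialize(serializer)?;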

View File

@@ -7,7 +7,7 @@ use rand::{thread_rng, Rng};
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
use crate::schema::*;
#[allow(deprecated)]
use crate::{doc, schema, Index, IndexSettings, IndexSortByField, IndexWriter, Order, Searcher};
use crate::{doc, schema, Index, IndexWriter, Searcher};
fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> {
assert!(searcher.segment_readers().len() < 20);
@@ -65,71 +65,6 @@ fn get_num_iterations() -> usize {
.map(|str| str.parse().unwrap())
.unwrap_or(2000)
}
#[test]
#[ignore]
fn test_functional_indexing_sorted() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let id_field = schema_builder.add_u64_field("id", INDEXED | FAST);
let multiples_field = schema_builder.add_u64_field("multiples", INDEXED);
let text_field_options = TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
)
.set_stored();
let text_field = schema_builder.add_text_field("text_field", text_field_options);
let schema = schema_builder.build();
let mut index_builder = Index::builder().schema(schema);
index_builder = index_builder.settings(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "id".to_string(),
order: Order::Desc,
}),
..Default::default()
});
let index = index_builder.create_from_tempdir().unwrap();
let reader = index.reader()?;
let mut rng = thread_rng();
let mut index_writer: IndexWriter =
index.writer_with_num_threads(3, 3 * MEMORY_BUDGET_NUM_BYTES_MIN)?;
let mut committed_docs: HashSet<u64> = HashSet::new();
let mut uncommitted_docs: HashSet<u64> = HashSet::new();
for _ in 0..get_num_iterations() {
let random_val = rng.gen_range(0..20);
if random_val == 0 {
index_writer.commit()?;
committed_docs.extend(&uncommitted_docs);
uncommitted_docs.clear();
reader.reload()?;
let searcher = reader.searcher();
// check that everything is correct.
check_index_content(
&searcher,
&committed_docs.iter().cloned().collect::<Vec<u64>>(),
)?;
} else if committed_docs.remove(&random_val) || uncommitted_docs.remove(&random_val) {
let doc_id_term = Term::from_field_u64(id_field, random_val);
index_writer.delete_term(doc_id_term);
} else {
uncommitted_docs.insert(random_val);
let mut doc = TantivyDocument::new();
doc.add_u64(id_field, random_val);
for i in 1u64..10u64 {
doc.add_u64(multiples_field, random_val * i);
}
doc.add_text(text_field, get_text());
index_writer.add_document(doc)?;
}
}
Ok(())
}
const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod \
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, \

View File

@@ -20,7 +20,7 @@ use crate::indexer::segment_updater::save_metas;
use crate::indexer::{IndexWriter, SingleSegmentIndexWriter};
use crate::reader::{IndexReader, IndexReaderBuilder};
use crate::schema::document::Document;
use crate::schema::{Field, FieldType, Schema, Type};
use crate::schema::{Field, FieldType, Schema};
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::SegmentReader;
@@ -232,31 +232,7 @@ impl IndexBuilder {
}
fn validate(&self) -> crate::Result<()> {
if let Some(schema) = self.schema.as_ref() {
if let Some(sort_by_field) = self.index_settings.sort_by_field.as_ref() {
let schema_field = schema.get_field(&sort_by_field.field).map_err(|_| {
TantivyError::InvalidArgument(format!(
"Field to sort index {} not found in schema",
sort_by_field.field
))
})?;
let entry = schema.get_field_entry(schema_field);
if !entry.is_fast() {
return Err(TantivyError::InvalidArgument(format!(
"Field {} is no fast field. Field needs to be a single value fast field \
to be used to sort an index",
sort_by_field.field
)));
}
let supported_field_types = [Type::I64, Type::U64, Type::F64, Type::Date];
let field_type = entry.field_type().value_type();
if !supported_field_types.contains(&field_type) {
return Err(TantivyError::InvalidArgument(format!(
"Unsupported field type in sort_by_field: {field_type:?}. Supported field \
types: {supported_field_types:?} ",
)));
}
}
if let Some(_schema) = self.schema.as_ref() {
Ok(())
} else {
Err(TantivyError::InvalidArgument(

View File

@@ -249,10 +249,6 @@ fn is_true(val: &bool) -> bool {
/// index, like presort documents.
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub struct IndexSettings {
/// Sorts the documents by information
/// provided in `IndexSortByField`
#[serde(skip_serializing_if = "Option::is_none")]
pub sort_by_field: Option<IndexSortByField>,
/// The `Compressor` used to compress the doc store.
#[serde(default)]
pub docstore_compression: Compressor,
@@ -275,7 +271,6 @@ fn default_docstore_blocksize() -> usize {
impl Default for IndexSettings {
fn default() -> Self {
Self {
sort_by_field: None,
docstore_compression: Compressor::default(),
docstore_blocksize: default_docstore_blocksize(),
docstore_compress_dedicated_thread: true,
@@ -283,22 +278,6 @@ impl Default for IndexSettings {
}
}
/// Settings to presort the documents in an index
///
/// Presorting documents can greatly improve performance
/// in some scenarios, by applying top n
/// optimizations.
#[deprecated(
since = "0.22.0",
note = "We plan to remove index sorting in `0.23`. If you need index sorting, please comment on the related issue https://github.com/quickwit-oss/tantivy/issues/2352 and explain your use case."
)]
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub struct IndexSortByField {
/// The field to sort the documents by
pub field: String,
/// The order to sort the documents by
pub order: Order,
}
/// The order to sort by
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub enum Order {
@@ -417,7 +396,7 @@ mod tests {
use crate::store::Compressor;
#[cfg(feature = "zstd-compression")]
use crate::store::ZstdCompressor;
use crate::{IndexSettings, IndexSortByField, Order};
use crate::IndexSettings;
#[test]
fn test_serialize_metas() {
@@ -427,13 +406,7 @@ mod tests {
schema_builder.build()
};
let index_metas = IndexMeta {
index_settings: IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "text".to_string(),
order: Order::Asc,
}),
..Default::default()
},
index_settings: IndexSettings::default(),
segments: Vec::new(),
schema,
opstamp: 0u64,
@@ -442,7 +415,7 @@ mod tests {
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
assert_eq!(
json,
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4","docstore_blocksize":16384},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
r#"{"index_settings":{"docstore_compression":"lz4","docstore_blocksize":16384},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
);
let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap();
@@ -461,10 +434,6 @@ mod tests {
};
let index_metas = IndexMeta {
index_settings: IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "text".to_string(),
order: Order::Asc,
}),
docstore_compression: crate::store::Compressor::Zstd(ZstdCompressor {
compression_level: Some(4),
}),
@@ -479,7 +448,7 @@ mod tests {
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
assert_eq!(
json,
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zstd(compression_level=4)","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
r#"{"index_settings":{"docstore_compression":"zstd(compression_level=4)","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
);
let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap();
@@ -491,35 +460,35 @@ mod tests {
#[test]
#[cfg(all(feature = "lz4-compression", feature = "zstd-compression"))]
fn test_serialize_metas_invalid_comp() {
let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zsstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
let json = r#"{"index_settings":{"docstore_compression":"zsstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
assert_eq!(
err.to_string(),
"unknown variant `zsstd`, expected one of `none`, `lz4`, `zstd`, \
`zstd(compression_level=5)` at line 1 column 96"
`zstd(compression_level=5)` at line 1 column 49"
.to_string()
);
let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zstd(bla=10)","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
let json = r#"{"index_settings":{"docstore_compression":"zstd(bla=10)","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
assert_eq!(
err.to_string(),
"unknown zstd option \"bla\" at line 1 column 103".to_string()
"unknown zstd option \"bla\" at line 1 column 56".to_string()
);
}
#[test]
#[cfg(not(feature = "zstd-compression"))]
fn test_serialize_metas_unsupported_comp() {
let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
let json = r#"{"index_settings":{"docstore_compression":"zstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
assert_eq!(
err.to_string(),
"unsupported variant `zstd`, please enable Tantivy's `zstd-compression` feature at \
line 1 column 95"
line 1 column 48"
.to_string()
);
}
@@ -531,7 +500,6 @@ mod tests {
assert_eq!(
index_settings,
IndexSettings {
sort_by_field: None,
docstore_compression: Compressor::default(),
docstore_compress_dedicated_thread: true,
docstore_blocksize: 16_384
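
With `sort_by_field` gone, an `IndexSettings` value only configures the doc store. A minimal sketch of constructing one after this change (values mirror the defaults shown above; `schema` is assumed):

let settings = IndexSettings {
    docstore_compression: Compressor::default(),
    docstore_blocksize: 16_384,
    ..Default::default()
};
let index = Index::builder()
    .schema(schema)
    .settings(settings)
    .create_in_ram()?;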

View File

@@ -12,7 +12,7 @@ mod segment_reader;
pub use self::index::{Index, IndexBuilder};
pub(crate) use self::index_meta::SegmentMetaInventory;
pub use self::index_meta::{IndexMeta, IndexSettings, IndexSortByField, Order, SegmentMeta};
pub use self::index_meta::{IndexMeta, IndexSettings, Order, SegmentMeta};
pub use self::inverted_index_reader::InvertedIndexReader;
pub use self::segment::Segment;
pub use self::segment_component::SegmentComponent;

View File

@@ -3,15 +3,12 @@
use common::ReadOnlyBitSet;
use super::SegmentWriter;
use crate::schema::{Field, Schema};
use crate::{DocAddress, DocId, IndexSortByField, TantivyError};
use crate::DocAddress;
#[derive(Copy, Clone, Eq, PartialEq)]
pub enum MappingType {
Stacked,
StackedWithDeletes,
Shuffled,
}
/// Struct to provide mapping from new doc_id to old doc_id and segment.
@@ -46,537 +43,4 @@ impl SegmentDocIdMapping {
pub(crate) fn iter_old_doc_addrs(&self) -> impl Iterator<Item = DocAddress> + '_ {
self.new_doc_id_to_old_doc_addr.iter().copied()
}
/// This flag means the segments are simply stacked in the order of their ordinal.
/// e.g. [(0, 1), .. (n, 1), (0, 2)..., (m, 2)]
///
/// The different segments may contain deletes, in which case they are expressed by skipping a
/// `DocId`. [(0, 1), (0, 3)] <--- here doc_id=0 and doc_id=1 have been deleted
///
/// Being trivial is equivalent to having the `new_doc_id_to_old_doc_addr` array sorted.
///
/// This allows for some optimization.
pub(crate) fn is_trivial(&self) -> bool {
match self.mapping_type {
MappingType::Stacked | MappingType::StackedWithDeletes => true,
MappingType::Shuffled => false,
}
}
}
/// Struct to provide mapping from old doc_id to new doc_id and vice versa within a segment.
pub struct DocIdMapping {
new_doc_id_to_old: Vec<DocId>,
old_doc_id_to_new: Vec<DocId>,
}
impl DocIdMapping {
pub fn from_new_id_to_old_id(new_doc_id_to_old: Vec<DocId>) -> Self {
let max_doc = new_doc_id_to_old.len();
let old_max_doc = new_doc_id_to_old
.iter()
.cloned()
.max()
.map(|n| n + 1)
.unwrap_or(0);
let mut old_doc_id_to_new = vec![0; old_max_doc as usize];
for i in 0..max_doc {
old_doc_id_to_new[new_doc_id_to_old[i] as usize] = i as DocId;
}
DocIdMapping {
new_doc_id_to_old,
old_doc_id_to_new,
}
}
/// returns the new doc_id for the old doc_id
pub fn get_new_doc_id(&self, doc_id: DocId) -> DocId {
self.old_doc_id_to_new[doc_id as usize]
}
/// returns the old doc_id for the new doc_id
pub fn get_old_doc_id(&self, doc_id: DocId) -> DocId {
self.new_doc_id_to_old[doc_id as usize]
}
/// iterate over old doc_ids in order of the new doc_ids
pub fn iter_old_doc_ids(&self) -> impl Iterator<Item = DocId> + Clone + '_ {
self.new_doc_id_to_old.iter().cloned()
}
pub fn old_to_new_ids(&self) -> &[DocId] {
&self.old_doc_id_to_new[..]
}
/// Remaps a given array to the new doc ids.
pub fn remap<T: Copy>(&self, els: &[T]) -> Vec<T> {
self.new_doc_id_to_old
.iter()
.map(|old_doc| els[*old_doc as usize])
.collect()
}
pub fn num_new_doc_ids(&self) -> usize {
self.new_doc_id_to_old.len()
}
pub fn num_old_doc_ids(&self) -> usize {
self.old_doc_id_to_new.len()
}
}
pub(crate) fn expect_field_id_for_sort_field(
schema: &Schema,
sort_by_field: &IndexSortByField,
) -> crate::Result<Field> {
schema.get_field(&sort_by_field.field).map_err(|_| {
TantivyError::InvalidArgument(format!(
"field to sort index by not found: {:?}",
sort_by_field.field
))
})
}
// Generates a document mapping in the form of vec[new doc_id] -> old doc_id
// TODO detect if field is already sorted and discard mapping
pub(crate) fn get_doc_id_mapping_from_field(
sort_by_field: IndexSortByField,
segment_writer: &SegmentWriter,
) -> crate::Result<DocIdMapping> {
let schema = segment_writer.segment_serializer.segment().schema();
expect_field_id_for_sort_field(&schema, &sort_by_field)?; // for now expect
let new_doc_id_to_old = segment_writer.fast_field_writers.sort_order(
sort_by_field.field.as_str(),
segment_writer.max_doc(),
sort_by_field.order.is_desc(),
);
// create new doc_id to old doc_id index (used in fast_field_writers)
Ok(DocIdMapping::from_new_id_to_old_id(new_doc_id_to_old))
}
#[cfg(test)]
mod tests_indexsorting {
use common::DateTime;
use crate::collector::TopDocs;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::indexer::NoMergePolicy;
use crate::query::QueryParser;
use crate::schema::*;
use crate::{DocAddress, Index, IndexBuilder, IndexSettings, IndexSortByField, Order};
fn create_test_index(
index_settings: Option<IndexSettings>,
text_field_options: TextOptions,
) -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
let my_text_field = schema_builder.add_text_field("text_field", text_field_options);
let my_string_field = schema_builder.add_text_field("string_field", STRING | STORED);
let my_number =
schema_builder.add_u64_field("my_number", NumericOptions::default().set_fast());
let multi_numbers =
schema_builder.add_u64_field("multi_numbers", NumericOptions::default().set_fast());
let schema = schema_builder.build();
let mut index_builder = Index::builder().schema(schema);
if let Some(settings) = index_settings {
index_builder = index_builder.settings(settings);
}
let index = index_builder.create_in_ram()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(my_number=>40_u64))?;
index_writer.add_document(
doc!(my_number=>20_u64, multi_numbers => 5_u64, multi_numbers => 6_u64),
)?;
index_writer.add_document(doc!(my_number=>100_u64))?;
index_writer.add_document(
doc!(my_number=>10_u64, my_string_field=> "blublub", my_text_field => "some text"),
)?;
index_writer.add_document(doc!(my_number=>30_u64, multi_numbers => 3_u64 ))?;
index_writer.commit()?;
Ok(index)
}
fn get_text_options() -> TextOptions {
TextOptions::default().set_indexing_options(
TextFieldIndexing::default().set_index_option(IndexRecordOption::Basic),
)
}
#[test]
fn test_sort_index_test_text_field() -> crate::Result<()> {
// there are different serializers for different settings in postings/recorder.rs
// test remapping for all of them
let options = vec![
get_text_options(),
get_text_options().set_indexing_options(
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
),
get_text_options().set_indexing_options(
TextFieldIndexing::default()
.set_index_option(IndexRecordOption::WithFreqsAndPositions),
),
];
for option in options {
// let options = get_text_options();
// no index_sort
let index = create_test_index(None, option.clone())?;
let my_text_field = index.schema().get_field("text_field").unwrap();
let searcher = index.reader()?.searcher();
let query = QueryParser::for_index(&index, vec![my_text_field]).parse_query("text")?;
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(3))?;
assert_eq!(
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>(),
vec![3]
);
// sort by field asc
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "my_number".to_string(),
order: Order::Asc,
}),
..Default::default()
}),
option.clone(),
)?;
let my_text_field = index.schema().get_field("text_field").unwrap();
let reader = index.reader()?;
let searcher = reader.searcher();
let query = QueryParser::for_index(&index, vec![my_text_field]).parse_query("text")?;
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(3))?;
assert_eq!(
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>(),
vec![0]
);
// test new field norm mapping
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let fieldnorm_reader = searcher
.segment_reader(0)
.get_fieldnorms_reader(my_text_field)?;
assert_eq!(fieldnorm_reader.fieldnorm(0), 2); // some text
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
}
// sort by field desc
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "my_number".to_string(),
order: Order::Desc,
}),
..Default::default()
}),
option.clone(),
)?;
let my_string_field = index.schema().get_field("text_field").unwrap();
let searcher = index.reader()?.searcher();
let query =
QueryParser::for_index(&index, vec![my_string_field]).parse_query("text")?;
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(3))?;
assert_eq!(
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>(),
vec![4]
);
// test new field norm mapping
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let fieldnorm_reader = searcher
.segment_reader(0)
.get_fieldnorms_reader(my_text_field)?;
assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
assert_eq!(fieldnorm_reader.fieldnorm(2), 0);
assert_eq!(fieldnorm_reader.fieldnorm(3), 0);
assert_eq!(fieldnorm_reader.fieldnorm(4), 2); // some text
}
}
Ok(())
}
#[test]
fn test_sort_index_get_documents() -> crate::Result<()> {
// default baseline
let index = create_test_index(None, get_text_options())?;
let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher();
{
assert!(searcher
.doc::<TantivyDocument>(DocAddress::new(0, 0))?
.get_first(my_string_field)
.is_none());
assert_eq!(
searcher
.doc::<TantivyDocument>(DocAddress::new(0, 3))?
.get_first(my_string_field)
.unwrap()
.as_str(),
Some("blublub")
);
}
// sort by field asc
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "my_number".to_string(),
order: Order::Asc,
}),
..Default::default()
}),
get_text_options(),
)?;
let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher();
{
assert_eq!(
searcher
.doc::<TantivyDocument>(DocAddress::new(0, 0))?
.get_first(my_string_field)
.unwrap()
.as_str(),
Some("blublub")
);
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 4))?;
assert!(doc.get_first(my_string_field).is_none());
}
// sort by field desc
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "my_number".to_string(),
order: Order::Desc,
}),
..Default::default()
}),
get_text_options(),
)?;
let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher();
{
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 4))?;
assert_eq!(
doc.get_first(my_string_field).unwrap().as_str(),
Some("blublub")
);
}
Ok(())
}
#[test]
fn test_sort_index_test_string_field() -> crate::Result<()> {
let index = create_test_index(None, get_text_options())?;
let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher();
let query = QueryParser::for_index(&index, vec![my_string_field]).parse_query("blublub")?;
let top_docs: Vec<(f32, DocAddress)> = searcher.search(&query, &TopDocs::with_limit(3))?;
assert_eq!(
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>(),
vec![3]
);
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "my_number".to_string(),
order: Order::Asc,
}),
..Default::default()
}),
get_text_options(),
)?;
let my_string_field = index.schema().get_field("string_field").unwrap();
let reader = index.reader()?;
let searcher = reader.searcher();
let query = QueryParser::for_index(&index, vec![my_string_field]).parse_query("blublub")?;
let top_docs: Vec<(f32, DocAddress)> = searcher.search(&query, &TopDocs::with_limit(3))?;
assert_eq!(
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>(),
vec![0]
);
// test new field norm mapping
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let fieldnorm_reader = searcher
.segment_reader(0)
.get_fieldnorms_reader(my_text_field)?;
assert_eq!(fieldnorm_reader.fieldnorm(0), 2); // some text
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
}
// sort by field desc
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "my_number".to_string(),
order: Order::Desc,
}),
..Default::default()
}),
get_text_options(),
)?;
let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher();
let query = QueryParser::for_index(&index, vec![my_string_field]).parse_query("blublub")?;
let top_docs: Vec<(f32, DocAddress)> = searcher.search(&query, &TopDocs::with_limit(3))?;
assert_eq!(
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>(),
vec![4]
);
// test new field norm mapping
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let fieldnorm_reader = searcher
.segment_reader(0)
.get_fieldnorms_reader(my_text_field)?;
assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
assert_eq!(fieldnorm_reader.fieldnorm(2), 0);
assert_eq!(fieldnorm_reader.fieldnorm(3), 0);
assert_eq!(fieldnorm_reader.fieldnorm(4), 2); // some text
}
Ok(())
}
#[test]
fn test_sort_index_fast_field() -> crate::Result<()> {
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "my_number".to_string(),
order: Order::Asc,
}),
..Default::default()
}),
get_text_options(),
)?;
assert_eq!(
index.settings().sort_by_field.as_ref().unwrap().field,
"my_number".to_string()
);
let searcher = index.reader()?.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_reader(0);
let fast_fields = segment_reader.fast_fields();
let fast_field = fast_fields
.u64("my_number")
.unwrap()
.first_or_default_col(999);
assert_eq!(fast_field.get_val(0), 10u64);
assert_eq!(fast_field.get_val(1), 20u64);
assert_eq!(fast_field.get_val(2), 30u64);
let multifield = fast_fields.u64("multi_numbers").unwrap();
let vals: Vec<u64> = multifield.values_for_doc(0u32).collect();
assert_eq!(vals, &[] as &[u64]);
let vals: Vec<_> = multifield.values_for_doc(1u32).collect();
assert_eq!(vals, &[5, 6]);
let vals: Vec<_> = multifield.values_for_doc(2u32).collect();
assert_eq!(vals, &[3]);
Ok(())
}
#[test]
fn test_with_sort_by_date_field() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field("date", INDEXED | STORED | FAST);
let schema = schema_builder.build();
let settings = IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "date".to_string(),
order: Order::Desc,
}),
..Default::default()
};
let index = Index::builder()
.schema(schema)
.settings(settings)
.create_in_ram()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!(
date_field => DateTime::from_timestamp_secs(1000),
))?;
index_writer.add_document(doc!(
date_field => DateTime::from_timestamp_secs(999),
))?;
index_writer.add_document(doc!(
date_field => DateTime::from_timestamp_secs(1001),
))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_reader(0);
let fast_fields = segment_reader.fast_fields();
let fast_field = fast_fields
.date("date")
.unwrap()
.first_or_default_col(DateTime::from_timestamp_secs(0));
assert_eq!(fast_field.get_val(0), DateTime::from_timestamp_secs(1001));
assert_eq!(fast_field.get_val(1), DateTime::from_timestamp_secs(1000));
assert_eq!(fast_field.get_val(2), DateTime::from_timestamp_secs(999));
Ok(())
}
#[test]
fn test_doc_mapping() {
let doc_mapping = DocIdMapping::from_new_id_to_old_id(vec![3, 2, 5]);
assert_eq!(doc_mapping.get_old_doc_id(0), 3);
assert_eq!(doc_mapping.get_old_doc_id(1), 2);
assert_eq!(doc_mapping.get_old_doc_id(2), 5);
assert_eq!(doc_mapping.get_new_doc_id(0), 0);
assert_eq!(doc_mapping.get_new_doc_id(1), 0);
assert_eq!(doc_mapping.get_new_doc_id(2), 1);
assert_eq!(doc_mapping.get_new_doc_id(3), 0);
assert_eq!(doc_mapping.get_new_doc_id(4), 0);
assert_eq!(doc_mapping.get_new_doc_id(5), 2);
}
#[test]
fn test_doc_mapping_remap() {
let doc_mapping = DocIdMapping::from_new_id_to_old_id(vec![2, 8, 3]);
assert_eq!(
&doc_mapping.remap(&[0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000]),
&[2000, 8000, 3000]
);
}
#[test]
fn test_text_sort() -> crate::Result<()> {
let mut schema_builder = SchemaBuilder::new();
schema_builder.add_text_field("id", STRING | FAST | STORED);
schema_builder.add_text_field("name", TEXT | STORED);
let resp = IndexBuilder::new()
.schema(schema_builder.build())
.settings(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "id".to_string(),
order: Order::Asc,
}),
..Default::default()
})
.create_in_ram();
assert!(resp
.unwrap_err()
.to_string()
.contains("Unsupported field type"));
Ok(())
}
}

File diff suppressed because it is too large

View File

@@ -0,0 +1,147 @@
#[cfg(test)]
mod tests {
use crate::collector::TopDocs;
use crate::fastfield::AliveBitSet;
use crate::index::Index;
use crate::postings::Postings;
use crate::query::QueryParser;
use crate::schema::{
self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions,
TextFieldIndexing, TextOptions,
};
use crate::{DocAddress, DocSet, IndexSettings, IndexWriter, Term};
fn create_test_index(index_settings: Option<IndexSettings>) -> crate::Result<Index> {
let mut schema_builder = schema::Schema::builder();
let int_options = NumericOptions::default()
.set_fast()
.set_stored()
.set_indexed();
let int_field = schema_builder.add_u64_field("intval", int_options);
let bytes_options = BytesOptions::default().set_fast().set_indexed();
let bytes_field = schema_builder.add_bytes_field("bytes", bytes_options);
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
let multi_numbers =
schema_builder.add_u64_field("multi_numbers", NumericOptions::default().set_fast());
let text_field_options = TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
)
.set_stored();
let text_field = schema_builder.add_text_field("text_field", text_field_options);
let schema = schema_builder.build();
let mut index_builder = Index::builder().schema(schema);
if let Some(settings) = index_settings {
index_builder = index_builder.settings(settings);
}
let index = index_builder.create_in_ram()?;
{
let mut index_writer = index.writer_for_tests()?;
// segment 1 - range 1-3
index_writer.add_document(doc!(int_field=>1_u64))?;
index_writer.add_document(
doc!(int_field=>3_u64, multi_numbers => 3_u64, multi_numbers => 4_u64, bytes_field => vec![1, 2, 3], text_field => "some text", facet_field=> Facet::from("/book/crime")),
)?;
index_writer.add_document(
doc!(int_field=>1_u64, text_field=> "deleteme", text_field => "ok text more text"),
)?;
index_writer.add_document(
doc!(int_field=>2_u64, multi_numbers => 2_u64, multi_numbers => 3_u64, text_field => "ok text more text"),
)?;
index_writer.commit()?;
index_writer.add_document(doc!(int_field=>20_u64, multi_numbers => 20_u64))?;
let in_val = 1u64;
index_writer.add_document(doc!(int_field=>in_val, text_field=> "deleteme" , text_field => "ok text more text", facet_field=> Facet::from("/book/crime")))?;
index_writer.commit()?;
let int_vals = [10u64, 5];
index_writer.add_document( // position of this doc after delete in desc sorting = [2], in disjunct case [1]
doc!(int_field=>int_vals[0], multi_numbers => 10_u64, multi_numbers => 11_u64, text_field=> "blubber", facet_field=> Facet::from("/book/fantasy")),
)?;
index_writer.add_document(doc!(int_field=>int_vals[1], text_field=> "deleteme"))?;
index_writer.add_document(
doc!(int_field=>1_000u64, multi_numbers => 1001_u64, multi_numbers => 1002_u64, bytes_field => vec![5, 5],text_field => "the biggest num")
)?;
index_writer.delete_term(Term::from_field_text(text_field, "deleteme"));
index_writer.commit()?;
}
// Merging the segments
{
let segment_ids = index.searchable_segment_ids()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
Ok(index)
}
#[test]
fn test_merge_index() {
let index = create_test_index(Some(IndexSettings {
..Default::default()
}))
.unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_readers().last().unwrap();
let searcher = index.reader().unwrap().searcher();
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let do_search = |term: &str| {
let query = QueryParser::for_index(&index, vec![my_text_field])
.parse_query(term)
.unwrap();
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
};
assert_eq!(do_search("some"), vec![1]);
assert_eq!(do_search("blubber"), vec![3]);
assert_eq!(do_search("biggest"), vec![4]);
}
// postings file
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let term_a = Term::from_field_text(my_text_field, "text");
let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap()
.unwrap();
assert_eq!(postings.doc_freq(), 2);
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
assert_eq!(
postings.doc_freq_given_deletes(
segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
),
2
);
assert_eq!(postings.term_freq(), 1);
let mut output = vec![];
postings.positions(&mut output);
assert_eq!(output, vec![1]);
postings.advance();
assert_eq!(postings.term_freq(), 2);
postings.positions(&mut output);
assert_eq!(output, vec![1, 3]);
}
}
}

View File

@@ -1,8 +1,7 @@
use std::sync::Arc;
use columnar::{
ColumnType, ColumnValues, ColumnarReader, MergeRowOrder, RowAddr, ShuffleMergeOrder,
StackMergeOrder,
ColumnType, ColumnarReader, MergeRowOrder, RowAddr, ShuffleMergeOrder, StackMergeOrder,
};
use common::ReadOnlyBitSet;
use itertools::Itertools;
@@ -11,7 +10,7 @@ use measure_time::debug_time;
use crate::directory::WritePtr;
use crate::docset::{DocSet, TERMINATED};
use crate::error::DataCorruption;
use crate::fastfield::{AliveBitSet, FastFieldNotAvailableError};
use crate::fastfield::AliveBitSet;
use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter};
use crate::index::{Segment, SegmentComponent, SegmentReader};
use crate::indexer::doc_id_mapping::{MappingType, SegmentDocIdMapping};
@@ -20,9 +19,7 @@ use crate::postings::{InvertedIndexSerializer, Postings, SegmentPostings};
use crate::schema::{value_type_to_column_type, Field, FieldType, Schema};
use crate::store::StoreWriter;
use crate::termdict::{TermMerger, TermOrdinal};
use crate::{
DocAddress, DocId, IndexSettings, IndexSortByField, InvertedIndexReader, Order, SegmentOrdinal,
};
use crate::{DocAddress, DocId, InvertedIndexReader};
/// Segment's max doc must be `< MAX_DOC_LIMIT`.
///
@@ -80,7 +77,6 @@ fn estimate_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::
}
pub struct IndexMerger {
index_settings: IndexSettings,
schema: Schema,
pub(crate) readers: Vec<SegmentReader>,
max_doc: u32,
@@ -116,7 +112,7 @@ fn convert_to_merge_order(
) -> MergeRowOrder {
match doc_id_mapping.mapping_type() {
MappingType::Stacked => MergeRowOrder::Stack(StackMergeOrder::stack(columnars)),
MappingType::StackedWithDeletes | MappingType::Shuffled => {
MappingType::StackedWithDeletes => {
// RUST/LLVM is amazing. The following conversion is actually a no-op:
// no allocation, no copy.
let new_row_id_to_old_row_id: Vec<RowAddr> = doc_id_mapping
@@ -149,13 +145,9 @@ fn extract_fast_field_required_columns(schema: &Schema) -> Vec<(String, ColumnTy
}
impl IndexMerger {
pub fn open(
schema: Schema,
index_settings: IndexSettings,
segments: &[Segment],
) -> crate::Result<IndexMerger> {
pub fn open(schema: Schema, segments: &[Segment]) -> crate::Result<IndexMerger> {
let alive_bitset = segments.iter().map(|_| None).collect_vec();
Self::open_with_custom_alive_set(schema, index_settings, segments, alive_bitset)
Self::open_with_custom_alive_set(schema, segments, alive_bitset)
}
// Create merge with a custom delete set.
@@ -172,7 +164,6 @@ impl IndexMerger {
// segments and partitions them e.g. by a value in a field.
pub fn open_with_custom_alive_set(
schema: Schema,
index_settings: IndexSettings,
segments: &[Segment],
alive_bitset_opt: Vec<Option<AliveBitSet>>,
) -> crate::Result<IndexMerger> {
@@ -186,9 +177,6 @@ impl IndexMerger {
}
let max_doc = readers.iter().map(|reader| reader.num_docs()).sum();
if let Some(sort_by_field) = index_settings.sort_by_field.as_ref() {
readers = Self::sort_readers_by_min_sort_field(readers, sort_by_field)?;
}
// sort segments by their natural sort setting
if max_doc >= MAX_DOC_LIMIT {
let err_msg = format!(
@@ -198,37 +186,12 @@ impl IndexMerger {
return Err(crate::TantivyError::InvalidArgument(err_msg));
}
Ok(IndexMerger {
index_settings,
schema,
readers,
max_doc,
})
}
fn sort_readers_by_min_sort_field(
readers: Vec<SegmentReader>,
sort_by_field: &IndexSortByField,
) -> crate::Result<Vec<SegmentReader>> {
// presort the readers by their min_values, so that when they are disjunct, we can use
// the regular merge logic (implicitly sorted)
let mut readers_with_min_sort_values = readers
.into_iter()
.map(|reader| {
let accessor = Self::get_sort_field_accessor(&reader, sort_by_field)?;
Ok((reader, accessor.min_value()))
})
.collect::<crate::Result<Vec<_>>>()?;
if sort_by_field.order.is_asc() {
readers_with_min_sort_values.sort_by_key(|(_, min_val)| *min_val);
} else {
readers_with_min_sort_values.sort_by_key(|(_, min_val)| std::cmp::Reverse(*min_val));
}
Ok(readers_with_min_sort_values
.into_iter()
.map(|(reader, _)| reader)
.collect())
}
fn write_fieldnorms(
&self,
mut fieldnorms_serializer: FieldNormsSerializer,
@@ -276,128 +239,6 @@ impl IndexMerger {
Ok(())
}
/// Checks if the readers are disjunct for their sort property and in the correct order to be
/// able to just stack them.
pub(crate) fn is_disjunct_and_sorted_on_sort_property(
&self,
sort_by_field: &IndexSortByField,
) -> crate::Result<bool> {
let reader_ordinal_and_field_accessors =
self.get_reader_with_sort_field_accessor(sort_by_field)?;
let everything_is_in_order = reader_ordinal_and_field_accessors
.into_iter()
.map(|(_, col)| Arc::new(col))
.tuple_windows()
.all(|(field_accessor1, field_accessor2)| {
if sort_by_field.order.is_asc() {
field_accessor1.max_value() <= field_accessor2.min_value()
} else {
field_accessor1.min_value() >= field_accessor2.max_value()
}
});
Ok(everything_is_in_order)
}
pub(crate) fn get_sort_field_accessor(
reader: &SegmentReader,
sort_by_field: &IndexSortByField,
) -> crate::Result<Arc<dyn ColumnValues>> {
reader.schema().get_field(&sort_by_field.field)?;
let (value_accessor, _column_type) = reader
.fast_fields()
.u64_lenient(&sort_by_field.field)?
.ok_or_else(|| FastFieldNotAvailableError {
field_name: sort_by_field.field.to_string(),
})?;
Ok(value_accessor.first_or_default_col(0u64))
}
/// Collecting value_accessors into a vec to bind the lifetime.
pub(crate) fn get_reader_with_sort_field_accessor(
&self,
sort_by_field: &IndexSortByField,
) -> crate::Result<Vec<(SegmentOrdinal, Arc<dyn ColumnValues>)>> {
let reader_ordinal_and_field_accessors = self
.readers
.iter()
.enumerate()
.map(|(reader_ordinal, _)| reader_ordinal as SegmentOrdinal)
.map(|reader_ordinal: SegmentOrdinal| {
let value_accessor = Self::get_sort_field_accessor(
&self.readers[reader_ordinal as usize],
sort_by_field,
)?;
Ok((reader_ordinal, value_accessor))
})
.collect::<crate::Result<Vec<_>>>()?;
Ok(reader_ordinal_and_field_accessors)
}
/// Generates the doc_id mapping, where the position in the vec is the new
/// doc_id.
/// ReaderWithOrdinal will include the ordinal position of the
/// reader in self.readers.
pub(crate) fn generate_doc_id_mapping_with_sort_by_field(
&self,
sort_by_field: &IndexSortByField,
) -> crate::Result<SegmentDocIdMapping> {
let reader_ordinal_and_field_accessors =
self.get_reader_with_sort_field_accessor(sort_by_field)?;
// Loading the field accessor on demand causes a 15x regression
// create iterators over segment/sort_accessor/doc_id tuple
let doc_id_reader_pair =
reader_ordinal_and_field_accessors
.iter()
.map(|(reader_ord, ff_reader)| {
let reader = &self.readers[*reader_ord as usize];
reader
.doc_ids_alive()
.map(move |doc_id| (doc_id, reader_ord, ff_reader))
});
let total_num_new_docs = self
.readers
.iter()
.map(|reader| reader.num_docs() as usize)
.sum();
let mut sorted_doc_ids: Vec<DocAddress> = Vec::with_capacity(total_num_new_docs);
// create iterator tuple of (old doc_id, reader) in order of the new doc_ids
sorted_doc_ids.extend(
doc_id_reader_pair
.into_iter()
.kmerge_by(|a, b| {
let val1 = a.2.get_val(a.0);
let val2 = b.2.get_val(b.0);
if sort_by_field.order == Order::Asc {
val1 < val2
} else {
val1 > val2
}
})
.map(|(doc_id, &segment_ord, _)| DocAddress {
doc_id,
segment_ord,
}),
);
let alive_bitsets: Vec<Option<ReadOnlyBitSet>> = self
.readers
.iter()
.map(|segment_reader| {
let alive_bitset = segment_reader.alive_bitset()?;
Some(alive_bitset.bitset().clone())
})
.collect();
Ok(SegmentDocIdMapping::new(
sorted_doc_ids,
MappingType::Shuffled,
alive_bitsets,
))
}
/// Creates a mapping if the segments are stacked. This is helpful to share the code path
/// between index sorting and the other cases.
pub(crate) fn get_doc_id_from_concatenated_data(&self) -> crate::Result<SegmentDocIdMapping> {
@@ -515,7 +356,6 @@ impl IndexMerger {
);
let mut segment_postings_containing_the_term: Vec<(usize, SegmentPostings)> = vec![];
let mut doc_id_and_positions = vec![];
while merged_terms.advance() {
segment_postings_containing_the_term.clear();
@@ -611,37 +451,13 @@ impl IndexMerger {
0u32
};
// if doc_id_mapping exists, the doc_ids are reordered; they are
// not just stacked. The field serializer expects monotonically increasing
// doc_ids, so we collect and sort them first, before writing.
//
// I think this is not strictly necessary, it would be possible to
// avoid the loading into a vec via some form of kmerge, but then the merge
// logic would deviate much more from the stacking case (unsorted index)
if !doc_id_mapping.is_trivial() {
doc_id_and_positions.push((
remapped_doc_id,
term_freq,
positions_buffer.to_vec(),
));
} else {
let delta_positions = delta_computer.compute_delta(&positions_buffer);
field_serializer.write_doc(remapped_doc_id, term_freq, delta_positions);
}
let delta_positions = delta_computer.compute_delta(&positions_buffer);
field_serializer.write_doc(remapped_doc_id, term_freq, delta_positions);
}
doc = segment_postings.advance();
}
}
if !doc_id_mapping.is_trivial() {
doc_id_and_positions.sort_unstable_by_key(|&(doc_id, _, _)| doc_id);
for (doc_id, term_freq, positions) in &doc_id_and_positions {
let delta_positions = delta_computer.compute_delta(positions);
field_serializer.write_doc(*doc_id, *term_freq, delta_positions);
}
doc_id_and_positions.clear();
}
// closing the term.
field_serializer.close_term()?;
}
@@ -670,47 +486,13 @@ impl IndexMerger {
Ok(())
}
fn write_storable_fields(
&self,
store_writer: &mut StoreWriter,
doc_id_mapping: &SegmentDocIdMapping,
) -> crate::Result<()> {
fn write_storable_fields(&self, store_writer: &mut StoreWriter) -> crate::Result<()> {
debug_time!("write-storable-fields");
debug!("write-storable-field");
if !doc_id_mapping.is_trivial() {
debug!("non-trivial-doc-id-mapping");
let store_readers: Vec<_> = self
.readers
.iter()
.map(|reader| reader.get_store_reader(50))
.collect::<Result<_, _>>()?;
let mut document_iterators: Vec<_> = store_readers
.iter()
.enumerate()
.map(|(i, store)| store.iter_raw(self.readers[i].alive_bitset()))
.collect();
for old_doc_addr in doc_id_mapping.iter_old_doc_addrs() {
let doc_bytes_it = &mut document_iterators[old_doc_addr.segment_ord as usize];
if let Some(doc_bytes_res) = doc_bytes_it.next() {
let doc_bytes = doc_bytes_res?;
store_writer.store_bytes(&doc_bytes)?;
} else {
return Err(DataCorruption::comment_only(format!(
"unexpected missing document in docstore on merge, doc address \
{old_doc_addr:?}",
))
.into());
}
}
} else {
debug!("trivial-doc-id-mapping");
for reader in &self.readers {
let store_reader = reader.get_store_reader(1)?;
if reader.has_deletes()
for reader in &self.readers {
let store_reader = reader.get_store_reader(1)?;
if reader.has_deletes()
// If there is not enough data in the store, we avoid stacking in order to
// avoid creating many small blocks in the doc store. Once we have 5 full blocks,
// we start stacking. In the worst case 2/7 of the blocks would be very small.
@@ -726,14 +508,13 @@ impl IndexMerger {
// take 7 in order to not walk over all checkpoints.
|| store_reader.block_checkpoints().take(7).count() < 6
|| store_reader.decompressor() != store_writer.compressor().into()
{
for doc_bytes_res in store_reader.iter_raw(reader.alive_bitset()) {
let doc_bytes = doc_bytes_res?;
store_writer.store_bytes(&doc_bytes)?;
}
} else {
store_writer.stack(store_reader)?;
{
for doc_bytes_res in store_reader.iter_raw(reader.alive_bitset()) {
let doc_bytes = doc_bytes_res?;
store_writer.store_bytes(&doc_bytes)?;
}
} else {
store_writer.stack(store_reader)?;
}
}
Ok(())
@@ -745,18 +526,7 @@ impl IndexMerger {
/// # Returns
/// The number of documents in the resulting segment.
pub fn write(&self, mut serializer: SegmentSerializer) -> crate::Result<u32> {
let doc_id_mapping = if let Some(sort_by_field) = self.index_settings.sort_by_field.as_ref()
{
// If the documents are already sorted and stackable, we ignore the mapping and execute
// it as if there was no sorting
if self.is_disjunct_and_sorted_on_sort_property(sort_by_field)? {
self.get_doc_id_from_concatenated_data()?
} else {
self.generate_doc_id_mapping_with_sort_by_field(sort_by_field)?
}
} else {
self.get_doc_id_from_concatenated_data()?
};
let doc_id_mapping = self.get_doc_id_from_concatenated_data()?;
debug!("write-fieldnorms");
if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
self.write_fieldnorms(fieldnorms_serializer, &doc_id_mapping)?;
@@ -773,7 +543,7 @@ impl IndexMerger {
)?;
debug!("write-storagefields");
self.write_storable_fields(serializer.get_store_writer(), &doc_id_mapping)?;
self.write_storable_fields(serializer.get_store_writer())?;
debug!("write-fastfields");
self.write_fast_fields(serializer.get_fast_field_write(), doc_id_mapping)?;
@@ -787,6 +557,8 @@ impl IndexMerger {
mod tests {
use columnar::Column;
use proptest::prop_oneof;
use proptest::strategy::Strategy;
use schema::FAST;
use crate::collector::tests::{
@@ -794,6 +566,7 @@ mod tests {
};
use crate::collector::{Count, FacetCollector};
use crate::index::{Index, SegmentId};
use crate::indexer::NoMergePolicy;
use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery};
use crate::schema::{
Facet, FacetOptions, IndexRecordOption, NumericOptions, TantivyDocument, Term,
@@ -802,7 +575,7 @@ mod tests {
use crate::time::OffsetDateTime;
use crate::{
assert_nearly_equals, schema, DateTime, DocAddress, DocId, DocSet, IndexSettings,
IndexSortByField, IndexWriter, Order, Searcher,
IndexWriter, Searcher,
};
#[test]
@@ -1275,60 +1048,6 @@ mod tests {
test_merge_facets(None, true)
}
#[test]
fn test_merge_facets_sort_asc() {
// In the merge case this will go through the doc_id mapping code
test_merge_facets(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
}),
..Default::default()
}),
true,
);
// In the merge case this will not go through the doc_id mapping code, because the data
// is sorted and disjunct
test_merge_facets(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
}),
..Default::default()
}),
false,
);
}
#[test]
fn test_merge_facets_sort_desc() {
// In the merge case this will go through the doc_id mapping code
test_merge_facets(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
}),
..Default::default()
}),
true,
);
// In the merge case this will not go through the doc_id mapping code, because the data
// is sorted and disjunct
test_merge_facets(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
}),
..Default::default()
}),
false,
);
}
// force_segment_value_overlap forces the int value used for sorting to have overlapping min and
// max ranges between segments, so that the merge algorithm can't apply certain optimizations
fn test_merge_facets(index_settings: Option<IndexSettings>, force_segment_value_overlap: bool) {
@@ -1531,6 +1250,112 @@ mod tests {
Ok(())
}
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
enum IndexingOp {
ZeroVal,
OneVal { val: u64 },
TwoVal { val: u64 },
Commit,
}
fn balanced_operation_strategy() -> impl Strategy<Value = IndexingOp> {
prop_oneof![
(0u64..1u64).prop_map(|_| IndexingOp::ZeroVal),
(0u64..1u64).prop_map(|val| IndexingOp::OneVal { val }),
(0u64..1u64).prop_map(|val| IndexingOp::TwoVal { val }),
(0u64..1u64).prop_map(|_| IndexingOp::Commit),
]
}
use proptest::prelude::*;
proptest! {
#[test]
fn test_merge_columnar_int_proptest(ops in proptest::collection::vec(balanced_operation_strategy(), 1..20)) {
assert!(test_merge_int_fields(&ops[..]).is_ok());
}
}
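// What the property above checks, in short: `test_merge_int_fields` (below)
// applies an arbitrary sequence of indexing ops, merges all resulting
// segments, and asserts that the merged "intvals" fast-field column returns,
// per document and in doc-id order, exactly the values that were indexed.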
fn test_merge_int_fields(ops: &[IndexingOp]) -> crate::Result<()> {
if ops.iter().all(|op| *op == IndexingOp::Commit) {
return Ok(());
}
let expected_doc_and_vals: Vec<(u32, Vec<u64>)> = ops
.iter()
.filter(|op| *op != &IndexingOp::Commit)
.map(|op| match op {
IndexingOp::ZeroVal => vec![],
IndexingOp::OneVal { val } => vec![*val],
IndexingOp::TwoVal { val } => vec![*val, *val],
IndexingOp::Commit => unreachable!(),
})
.enumerate()
.map(|(id, val)| (id as u32, val))
.collect();
let mut schema_builder = schema::Schema::builder();
let int_options = NumericOptions::default().set_fast().set_indexed();
let int_field = schema_builder.add_u64_field("intvals", int_options);
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| {
let mut doc = TantivyDocument::default();
for &val in int_vals {
doc.add_u64(int_field, val);
}
index_writer.add_document(doc).unwrap();
};
for op in ops {
match op {
IndexingOp::ZeroVal => index_doc(&mut index_writer, &[]),
IndexingOp::OneVal { val } => index_doc(&mut index_writer, &[*val]),
IndexingOp::TwoVal { val } => index_doc(&mut index_writer, &[*val, *val]),
IndexingOp::Commit => {
index_writer.commit().expect("commit failed");
}
}
}
index_writer.commit().expect("commit failed");
}
{
let mut segment_ids = index.searchable_segment_ids()?;
segment_ids.sort();
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
let reader = index.reader()?;
reader.reload()?;
let mut vals: Vec<u64> = Vec::new();
let mut test_vals = move |col: &Column<u64>, doc: DocId, expected: &[u64]| {
vals.clear();
vals.extend(col.values_for_doc(doc));
assert_eq!(&vals[..], expected);
};
let mut test_col = move |col: &Column<u64>, column_expected: &[(u32, Vec<u64>)]| {
for (doc_id, vals) in column_expected.iter() {
test_vals(col, *doc_id, vals);
}
};
{
let searcher = reader.searcher();
let segment = searcher.segment_reader(0u32);
let col = segment
.fast_fields()
.column_opt::<u64>("intvals")
.unwrap()
.unwrap();
test_col(&col, &expected_doc_and_vals);
}
Ok(())
}
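// Usage note (assuming proptest's standard environment override): the number
// of generated operation sequences can be raised when running locally, e.g.
//     PROPTEST_CASES=1024 cargo test test_merge_columnar_int_proptest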
#[test]
fn test_merge_multivalued_int_fields_simple() -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder();

View File

@@ -1,579 +0,0 @@
#[cfg(test)]
mod tests {
use crate::collector::TopDocs;
use crate::fastfield::AliveBitSet;
use crate::index::Index;
use crate::postings::Postings;
use crate::query::QueryParser;
use crate::schema::{
self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions,
TextFieldIndexing, TextOptions, Value,
};
use crate::{
DocAddress, DocSet, IndexSettings, IndexSortByField, IndexWriter, Order, TantivyDocument,
Term,
};
fn create_test_index_posting_list_issue(index_settings: Option<IndexSettings>) -> Index {
let mut schema_builder = schema::Schema::builder();
let int_options = NumericOptions::default().set_fast().set_indexed();
let int_field = schema_builder.add_u64_field("intval", int_options);
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
let schema = schema_builder.build();
let mut index_builder = Index::builder().schema(schema);
if let Some(settings) = index_settings {
index_builder = index_builder.settings(settings);
}
let index = index_builder.create_in_ram().unwrap();
{
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(int_field=>3_u64, facet_field=> Facet::from("/crime")))
.unwrap();
index_writer
.add_document(doc!(int_field=>6_u64, facet_field=> Facet::from("/crime")))
.unwrap();
index_writer.commit().unwrap();
index_writer
.add_document(doc!(int_field=>5_u64, facet_field=> Facet::from("/fanta")))
.unwrap();
index_writer.commit().unwrap();
}
// Merging the segments
{
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
assert!(index_writer.merge(&segment_ids).wait().is_ok());
assert!(index_writer.wait_merging_threads().is_ok());
}
index
}
// force_disjunct_segment_sort_values forces the field by which the index is sorted to have
// disjunct ranges between segments, e.g. values in segments [1-3] [10-20] [50-500]
fn create_test_index(
index_settings: Option<IndexSettings>,
force_disjunct_segment_sort_values: bool,
) -> crate::Result<Index> {
let mut schema_builder = schema::Schema::builder();
let int_options = NumericOptions::default()
.set_fast()
.set_stored()
.set_indexed();
let int_field = schema_builder.add_u64_field("intval", int_options);
let bytes_options = BytesOptions::default().set_fast().set_indexed();
let bytes_field = schema_builder.add_bytes_field("bytes", bytes_options);
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
let multi_numbers =
schema_builder.add_u64_field("multi_numbers", NumericOptions::default().set_fast());
let text_field_options = TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
)
.set_stored();
let text_field = schema_builder.add_text_field("text_field", text_field_options);
let schema = schema_builder.build();
let mut index_builder = Index::builder().schema(schema);
if let Some(settings) = index_settings {
index_builder = index_builder.settings(settings);
}
let index = index_builder.create_in_ram()?;
{
let mut index_writer = index.writer_for_tests()?;
// segment 1 - range 1-3
index_writer.add_document(doc!(int_field=>1_u64))?;
index_writer.add_document(
doc!(int_field=>3_u64, multi_numbers => 3_u64, multi_numbers => 4_u64, bytes_field => vec![1, 2, 3], text_field => "some text", facet_field=> Facet::from("/book/crime")),
)?;
index_writer.add_document(
doc!(int_field=>1_u64, text_field=> "deleteme", text_field => "ok text more text"),
)?;
index_writer.add_document(
doc!(int_field=>2_u64, multi_numbers => 2_u64, multi_numbers => 3_u64, text_field => "ok text more text"),
)?;
index_writer.commit()?;
// segment 2 - range 1-20 , with force_disjunct_segment_sort_values 10-20
index_writer.add_document(doc!(int_field=>20_u64, multi_numbers => 20_u64))?;
let in_val = if force_disjunct_segment_sort_values {
10_u64
} else {
1
};
index_writer.add_document(doc!(int_field=>in_val, text_field=> "deleteme" , text_field => "ok text more text", facet_field=> Facet::from("/book/crime")))?;
index_writer.commit()?;
// segment 3 - range 5-1000, with force_disjunct_segment_sort_values 50-1000
let int_vals = if force_disjunct_segment_sort_values {
[100_u64, 50]
} else {
[10, 5]
};
index_writer.add_document( // position of this doc after delete in desc sorting = [2], in disjunct case [1]
doc!(int_field=>int_vals[0], multi_numbers => 10_u64, multi_numbers => 11_u64, text_field=> "blubber", facet_field=> Facet::from("/book/fantasy")),
)?;
index_writer.add_document(doc!(int_field=>int_vals[1], text_field=> "deleteme"))?;
index_writer.add_document(
doc!(int_field=>1_000u64, multi_numbers => 1001_u64, multi_numbers => 1002_u64, bytes_field => vec![5, 5],text_field => "the biggest num")
)?;
index_writer.delete_term(Term::from_field_text(text_field, "deleteme"));
index_writer.commit()?;
}
// Merging the segments
{
let segment_ids = index.searchable_segment_ids()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
Ok(index)
}
#[test]
fn test_merge_sorted_postinglist_sort_issue() {
create_test_index_posting_list_issue(Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
}),
..Default::default()
}));
}
#[test]
fn test_merge_sorted_index_desc_not_disjunct() {
test_merge_sorted_index_desc_(false);
}
#[test]
fn test_merge_sorted_index_desc_disjunct() {
test_merge_sorted_index_desc_(true);
}
fn test_merge_sorted_index_desc_(force_disjunct_segment_sort_values: bool) {
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
}),
..Default::default()
}),
force_disjunct_segment_sort_values,
)
.unwrap();
let int_field = index.schema().get_field("intval").unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_readers().last().unwrap();
let fast_fields = segment_reader.fast_fields();
let fast_field = fast_fields.u64("intval").unwrap();
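// The merged segment is sorted by `intval` in descending order, so doc 0 holds
// the largest value and higher doc ids hold smaller values.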
assert_eq!(fast_field.first(5), Some(1u64));
assert_eq!(fast_field.first(4), Some(2u64));
assert_eq!(fast_field.first(3), Some(3u64));
if force_disjunct_segment_sort_values {
assert_eq!(fast_field.first(2), Some(20u64));
assert_eq!(fast_field.first(1), Some(100u64));
} else {
assert_eq!(fast_field.first(2), Some(10u64));
assert_eq!(fast_field.first(1), Some(20u64));
}
assert_eq!(fast_field.first(0), Some(1_000u64));
// test new field norm mapping
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap();
assert_eq!(fieldnorm_reader.fieldnorm(0), 3); // the biggest num
if force_disjunct_segment_sort_values {
assert_eq!(fieldnorm_reader.fieldnorm(1), 1); // blubber
assert_eq!(fieldnorm_reader.fieldnorm(2), 0);
} else {
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
assert_eq!(fieldnorm_reader.fieldnorm(2), 1); // blubber
}
assert_eq!(fieldnorm_reader.fieldnorm(3), 2); // some text
assert_eq!(fieldnorm_reader.fieldnorm(5), 0);
}
let my_text_field = index.schema().get_field("text_field").unwrap();
let searcher = index.reader().unwrap().searcher();
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let do_search = |term: &str| {
let query = QueryParser::for_index(&index, vec![my_text_field])
.parse_query(term)
.unwrap();
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
};
assert_eq!(do_search("some"), vec![3]);
if force_disjunct_segment_sort_values {
assert_eq!(do_search("blubber"), vec![1]);
} else {
assert_eq!(do_search("blubber"), vec![2]);
}
assert_eq!(do_search("biggest"), vec![0]);
}
// postings file
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let term_a = Term::from_field_text(my_text_field, "text");
let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap()
.unwrap();
assert_eq!(postings.doc_freq(), 2);
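// The fallback bitset (doc 0 deleted out of 100 docs) is only used if the
// merged segment carries no alive bitset of its own.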
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
assert_eq!(
postings.doc_freq_given_deletes(
segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
),
2
);
assert_eq!(postings.term_freq(), 1);
let mut output = vec![];
postings.positions(&mut output);
assert_eq!(output, vec![1]);
postings.advance();
assert_eq!(postings.term_freq(), 2);
postings.positions(&mut output);
assert_eq!(output, vec![1, 3]);
}
// access doc store
{
let blubber_pos = if force_disjunct_segment_sort_values {
1
} else {
2
};
let doc = searcher
.doc::<TantivyDocument>(DocAddress::new(0, blubber_pos))
.unwrap();
assert_eq!(
doc.get_first(my_text_field).unwrap().as_value().as_str(),
Some("blubber")
);
let doc = searcher
.doc::<TantivyDocument>(DocAddress::new(0, 0))
.unwrap();
assert_eq!(
doc.get_first(int_field).unwrap().as_value().as_u64(),
Some(1000)
);
}
}
#[test]
fn test_merge_unsorted_index() {
let index = create_test_index(
Some(IndexSettings {
..Default::default()
}),
false,
)
.unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_readers().last().unwrap();
let searcher = index.reader().unwrap().searcher();
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let do_search = |term: &str| {
let query = QueryParser::for_index(&index, vec![my_text_field])
.parse_query(term)
.unwrap();
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
};
assert_eq!(do_search("some"), vec![1]);
assert_eq!(do_search("blubber"), vec![3]);
assert_eq!(do_search("biggest"), vec![4]);
}
// postings file
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let term_a = Term::from_field_text(my_text_field, "text");
let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap()
.unwrap();
assert_eq!(postings.doc_freq(), 2);
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
assert_eq!(
postings.doc_freq_given_deletes(
segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
),
2
);
assert_eq!(postings.term_freq(), 1);
let mut output = vec![];
postings.positions(&mut output);
assert_eq!(output, vec![1]);
postings.advance();
assert_eq!(postings.term_freq(), 2);
postings.positions(&mut output);
assert_eq!(output, vec![1, 3]);
}
}
// #[test]
// fn test_merge_sorted_index_asc() {
// let index = create_test_index(
// Some(IndexSettings {
// sort_by_field: Some(IndexSortByField {
// field: "intval".to_string(),
// order: Order::Asc,
// }),
// ..Default::default()
// }),
// false,
// )
// .unwrap();
// let int_field = index.schema().get_field("intval").unwrap();
// let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
// let bytes_field = index.schema().get_field("bytes").unwrap();
// let reader = index.reader().unwrap();
// let searcher = reader.searcher();
// assert_eq!(searcher.segment_readers().len(), 1);
// let segment_reader = searcher.segment_readers().last().unwrap();
// let fast_fields = segment_reader.fast_fields();
// let fast_field = fast_fields.u64(int_field).unwrap();
// assert_eq!(fast_field.get_val(0), 1u64);
// assert_eq!(fast_field.get_val(1), 2u64);
// assert_eq!(fast_field.get_val(2), 3u64);
// assert_eq!(fast_field.get_val(3), 10u64);
// assert_eq!(fast_field.get_val(4), 20u64);
// assert_eq!(fast_field.get_val(5), 1_000u64);
// let get_vals = |fast_field: &MultiValuedFastFieldReader<u64>, doc_id: u32| -> Vec<u64> {
// let mut vals = vec![];
// fast_field.get_vals(doc_id, &mut vals);
// vals
// };
// let fast_fields = segment_reader.fast_fields();
// let fast_field = fast_fields.u64s(multi_numbers).unwrap();
// assert_eq!(&get_vals(&fast_field, 0), &[] as &[u64]);
// assert_eq!(&get_vals(&fast_field, 1), &[2, 3]);
// assert_eq!(&get_vals(&fast_field, 2), &[3, 4]);
// assert_eq!(&get_vals(&fast_field, 3), &[10, 11]);
// assert_eq!(&get_vals(&fast_field, 4), &[20]);
// assert_eq!(&get_vals(&fast_field, 5), &[1001, 1002]);
// let fast_field = fast_fields.bytes(bytes_field).unwrap();
// assert_eq!(fast_field.get_bytes(0), &[] as &[u8]);
// assert_eq!(fast_field.get_bytes(2), &[1, 2, 3]);
// assert_eq!(fast_field.get_bytes(5), &[5, 5]);
// // test new field norm mapping
// {
// let my_text_field = index.schema().get_field("text_field").unwrap();
// let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap();
// assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
// assert_eq!(fieldnorm_reader.fieldnorm(1), 4);
// assert_eq!(fieldnorm_reader.fieldnorm(2), 2); // some text
// assert_eq!(fieldnorm_reader.fieldnorm(3), 1);
// assert_eq!(fieldnorm_reader.fieldnorm(5), 3); // the biggest num
// }
// let searcher = index.reader().unwrap().searcher();
// {
// let my_text_field = index.schema().get_field("text_field").unwrap();
// let do_search = |term: &str| {
// let query = QueryParser::for_index(&index, vec![my_text_field])
// .parse_query(term)
// .unwrap();
// let top_docs: Vec<(f32, DocAddress)> =
// searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
// top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
// };
// assert_eq!(do_search("some"), vec![2]);
// assert_eq!(do_search("blubber"), vec![3]);
// assert_eq!(do_search("biggest"), vec![5]);
// }
// // postings file
// {
// let my_text_field = index.schema().get_field("text_field").unwrap();
// let term_a = Term::from_field_text(my_text_field, "text");
// let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
// let mut postings = inverted_index
// .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
// .unwrap()
// .unwrap();
// assert_eq!(postings.doc_freq(), 2);
// let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
// assert_eq!(
// postings.doc_freq_given_deletes(
// segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
// ),
// 2
// );
// let mut output = vec![];
// postings.positions(&mut output);
// assert_eq!(output, vec![1, 3]);
// postings.advance();
// postings.positions(&mut output);
// assert_eq!(output, vec![1]);
// }
// // access doc store
// {
// let doc = searcher.doc(DocAddress::new(0, 0)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1));
// let doc = searcher.doc(DocAddress::new(0, 1)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(2));
// let doc = searcher.doc(DocAddress::new(0, 2)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(3));
// let doc = searcher.doc(DocAddress::new(0, 3)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(10));
// let doc = searcher.doc(DocAddress::new(0, 4)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(20));
// let doc = searcher.doc(DocAddress::new(0, 5)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1_000));
// }
// }
}
#[cfg(all(test, feature = "unstable"))]
mod bench_sorted_index_merge {
use test::{self, Bencher};
use crate::index::Index;
use crate::indexer::merger::IndexMerger;
use crate::schema::{NumericOptions, Schema};
use crate::{IndexSettings, IndexSortByField, IndexWriter, Order};
fn create_index(sort_by_field: Option<IndexSortByField>) -> Index {
let mut schema_builder = Schema::builder();
let int_options = NumericOptions::default().set_fast().set_indexed();
let int_field = schema_builder.add_u64_field("intval", int_options);
let schema = schema_builder.build();
let index_builder = Index::builder().schema(schema).settings(IndexSettings {
sort_by_field,
..Default::default()
});
let index = index_builder.create_in_ram().unwrap();
{
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let index_doc = |index_writer: &mut IndexWriter, val: u64| {
index_writer.add_document(doc!(int_field=>val)).unwrap();
};
// 3 segments, each with 10_000 values in the fast field
for _ in 0..3 {
index_doc(&mut index_writer, 5000); // out-of-order value so the segment is not pre-sorted
for i in 0..10_000 {
index_doc(&mut index_writer, i);
}
index_writer.commit().unwrap();
}
}
index
}
// #[bench]
// fn create_sorted_index_walk_over_kmerge_on_merge_fastfield(
// b: &mut Bencher,
// ) -> crate::Result<()> {
// let sort_by_field = IndexSortByField {
// field: "intval".to_string(),
// order: Order::Desc,
// };
// let index = create_index(Some(sort_by_field.clone()));
// let segments = index.searchable_segments().unwrap();
// let merger: IndexMerger =
// IndexMerger::open(index.schema(), index.settings().clone(), &segments[..])?;
// let doc_id_mapping = merger.generate_doc_id_mapping(&sort_by_field).unwrap();
// b.iter(|| {
// let sorted_doc_ids = doc_id_mapping.iter_old_doc_addrs().map(|doc_addr| {
// let reader = &merger.readers[doc_addr.segment_ord as usize];
// let u64_reader: Arc<dyn Column<u64>> = reader
// .fast_fields()
// .typed_fast_field_reader("intval")
// .expect(
// "Failed to find a reader for single fast field. This is a tantivy bug and \
// it should never happen.",
// );
// (doc_addr.doc_id, reader, u64_reader)
// });
// // add values in order of the new doc_ids
// let mut val = 0;
// for (doc_id, _reader, field_reader) in sorted_doc_ids {
// val = field_reader.get_val(doc_id);
// }
// val
// });
// Ok(())
// }
#[bench]
fn create_sorted_index_create_doc_id_mapping(b: &mut Bencher) -> crate::Result<()> {
let sort_by_field = IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
};
let index = create_index(Some(sort_by_field.clone()));
// let field = index.schema().get_field("intval").unwrap();
let segments = index.searchable_segments().unwrap();
let merger: IndexMerger =
IndexMerger::open(index.schema(), index.settings().clone(), &segments[..])?;
b.iter(|| {
merger
.generate_doc_id_mapping_with_sort_by_field(&sort_by_field)
.unwrap();
});
Ok(())
}
}

View File

@@ -13,10 +13,10 @@ mod flat_map_with_buffer;
pub(crate) mod index_writer;
pub(crate) mod index_writer_status;
mod log_merge_policy;
mod merge_index_test;
mod merge_operation;
pub(crate) mod merge_policy;
pub(crate) mod merger;
mod merger_sorted_index_test;
pub(crate) mod operation;
pub(crate) mod prepared_commit;
mod segment_entry;
@@ -145,15 +145,27 @@ mod tests_mmap {
}
}
#[test]
fn test_json_field_null_byte() {
// Test when field name contains a zero byte, which has special meaning in tantivy.
// As a workaround, we convert the zero byte to the ASCII character '0'.
// https://github.com/quickwit-oss/tantivy/issues/2340
// https://github.com/quickwit-oss/tantivy/issues/2193
let field_name_in = "\u{0000}";
let field_name_out = "0";
test_json_field_name(field_name_in, field_name_out);
fn test_json_field_null_byte_is_ignored() {
let mut schema_builder = Schema::builder();
let options = JsonObjectOptions::from(TEXT | FAST).set_expand_dots_enabled();
let field = schema_builder.add_json_field("json", options);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(field=>json!({"key": "test1", "invalidkey\u{0000}": "test2"})))
.unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0);
let inv_indexer = segment_reader.inverted_index(field).unwrap();
let term_dict = inv_indexer.terms();
assert_eq!(term_dict.num_terms(), 1);
let mut term_bytes = Vec::new();
term_dict.ord_to_term(0, &mut term_bytes).unwrap();
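// The single remaining term encodes the JSON path "key", the JSON_END_OF_PATH
// byte (\0), the type code b's' (Str) and the value "test1"; the null-byte key
// was dropped.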
assert_eq!(term_bytes, b"key\0stest1");
}
#[test]
fn test_json_field_1byte() {
// Test when field name contains a '1' byte, which has special meaning in tantivy.
@@ -291,7 +303,7 @@ mod tests_mmap {
Type::Str,
),
(format!("{field_name_out_internal}a"), Type::Str),
(format!("{field_name_out_internal}"), Type::Str),
(field_name_out_internal.to_string(), Type::Str),
(format!("num{field_name_out_internal}"), Type::I64),
];
expected_fields.sort();

View File

@@ -38,7 +38,8 @@ impl PathToUnorderedId {
#[cold]
fn insert_new_path(&mut self, path: &str) -> u32 {
let next_id = self.map.len() as u32;
self.map.insert(path.to_string(), next_id);
let new_path = path.to_string();
self.map.insert(new_path, next_id);
next_id
}

View File

@@ -18,27 +18,9 @@ pub struct SegmentSerializer {
impl SegmentSerializer {
/// Creates a new `SegmentSerializer`.
pub fn for_segment(
mut segment: Segment,
is_in_merge: bool,
) -> crate::Result<SegmentSerializer> {
// If the segment is going to be sorted, we stream the docs first to a temporary file.
// In the merge case this is not necessary because we can kmerge the already sorted
// segments
let remapping_required = segment.index().settings().sort_by_field.is_some() && !is_in_merge;
pub fn for_segment(mut segment: Segment) -> crate::Result<SegmentSerializer> {
let settings = segment.index().settings().clone();
let store_writer = if remapping_required {
let store_write = segment.open_write(SegmentComponent::TempStore)?;
StoreWriter::new(
store_write,
crate::store::Compressor::None,
// We want fast random access on the docs, so we choose a small block size.
// If this is zero, the skip index will contain too many checkpoints and
// therefore will be relatively slow.
16000,
settings.docstore_compress_dedicated_thread,
)?
} else {
let store_writer = {
let store_write = segment.open_write(SegmentComponent::Store)?;
StoreWriter::new(
store_write,
@@ -72,10 +54,6 @@ impl SegmentSerializer {
&self.segment
}
pub fn segment_mut(&mut self) -> &mut Segment {
&mut self.segment
}
/// Accessor to the `PostingsSerializer`.
pub fn get_postings_serializer(&mut self) -> &mut InvertedIndexSerializer {
&mut self.postings_serializer

View File

@@ -115,11 +115,10 @@ fn merge(
.collect();
// An IndexMerger is like a "view" of our merged segments.
let merger: IndexMerger =
IndexMerger::open(index.schema(), index.settings().clone(), &segments[..])?;
let merger: IndexMerger = IndexMerger::open(index.schema(), &segments[..])?;
// ... we just serialize this index merger in our new segment to merge the segments.
let segment_serializer = SegmentSerializer::for_segment(merged_segment.clone(), true)?;
let segment_serializer = SegmentSerializer::for_segment(merged_segment.clone())?;
let num_docs = merger.write(segment_serializer)?;
@@ -220,13 +219,9 @@ pub fn merge_filtered_segments<T: Into<Box<dyn Directory>>>(
)?;
let merged_segment = merged_index.new_segment();
let merged_segment_id = merged_segment.id();
let merger: IndexMerger = IndexMerger::open_with_custom_alive_set(
merged_index.schema(),
merged_index.settings().clone(),
segments,
filter_doc_ids,
)?;
let segment_serializer = SegmentSerializer::for_segment(merged_segment, true)?;
let merger: IndexMerger =
IndexMerger::open_with_custom_alive_set(merged_index.schema(), segments, filter_doc_ids)?;
let segment_serializer = SegmentSerializer::for_segment(merged_segment)?;
let num_docs = merger.write(segment_serializer)?;
let segment_meta = merged_index.new_segment_meta(merged_segment_id, num_docs);
@@ -1067,7 +1062,6 @@ mod tests {
)?;
let merger: IndexMerger = IndexMerger::open_with_custom_alive_set(
merged_index.schema(),
merged_index.settings().clone(),
&segments[..],
filter_segments,
)?;
@@ -1083,7 +1077,6 @@ mod tests {
Index::create(RamDirectory::default(), target_schema, target_settings)?;
let merger: IndexMerger = IndexMerger::open_with_custom_alive_set(
merged_index.schema(),
merged_index.settings().clone(),
&segments[..],
filter_segments,
)?;

View File

@@ -3,7 +3,6 @@ use common::JsonPathWriter;
use itertools::Itertools;
use tokenizer_api::BoxTokenStream;
use super::doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping};
use super::operation::AddOperation;
use crate::fastfield::FastFieldsWriter;
use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter};
@@ -16,7 +15,6 @@ use crate::postings::{
};
use crate::schema::document::{Document, Value};
use crate::schema::{FieldEntry, FieldType, Schema, Term, DATE_TIME_PRECISION_INDEXED};
use crate::store::{StoreReader, StoreWriter};
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer};
use crate::{DocId, Opstamp, TantivyError};
@@ -41,20 +39,6 @@ fn compute_initial_table_size(per_thread_memory_budget: usize) -> crate::Result<
})
}
fn remap_doc_opstamps(
opstamps: Vec<Opstamp>,
doc_id_mapping_opt: Option<&DocIdMapping>,
) -> Vec<Opstamp> {
if let Some(doc_id_mapping_opt) = doc_id_mapping_opt {
doc_id_mapping_opt
.iter_old_doc_ids()
.map(|doc| opstamps[doc as usize])
.collect()
} else {
opstamps
}
}
/// A `SegmentWriter` is in charge of creating segment index from a
/// set of documents.
///
@@ -90,7 +74,7 @@ impl SegmentWriter {
let tokenizer_manager = segment.index().tokenizers().clone();
let tokenizer_manager_fast_field = segment.index().fast_field_tokenizer().clone();
let table_size = compute_initial_table_size(memory_budget_in_bytes)?;
let segment_serializer = SegmentSerializer::for_segment(segment, false)?;
let segment_serializer = SegmentSerializer::for_segment(segment)?;
let per_field_postings_writers = PerFieldPostingsWriter::for_schema(&schema);
let per_field_text_analyzers = schema
.fields()
@@ -139,15 +123,6 @@ impl SegmentWriter {
/// be used afterwards.
pub fn finalize(mut self) -> crate::Result<Vec<u64>> {
self.fieldnorms_writer.fill_up_to_max_doc(self.max_doc);
let mapping: Option<DocIdMapping> = self
.segment_serializer
.segment()
.index()
.settings()
.sort_by_field
.clone()
.map(|sort_by_field| get_doc_id_mapping_from_field(sort_by_field, &self))
.transpose()?;
remap_and_write(
self.schema,
&self.per_field_postings_writers,
@@ -155,10 +130,8 @@ impl SegmentWriter {
self.fast_field_writers,
&self.fieldnorms_writer,
self.segment_serializer,
mapping.as_ref(),
)?;
let doc_opstamps = remap_doc_opstamps(self.doc_opstamps, mapping.as_ref());
Ok(doc_opstamps)
Ok(self.doc_opstamps)
}
/// Returns an estimation of the current memory usage of the segment writer.
@@ -202,9 +175,8 @@ impl SegmentWriter {
match field_entry.field_type() {
FieldType::Facet(_) => {
let mut facet_tokenizer = FacetTokenizer::default(); // this can be global
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
let facet_str = value.as_facet().ok_or_else(make_schema_error)?;
let mut facet_tokenizer = facet_tokenizer.token_stream(facet_str);
@@ -220,15 +192,14 @@ impl SegmentWriter {
}
FieldType::Str(_) => {
let mut indexing_position = IndexingPosition::default();
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
let mut token_stream = if let Some(text) = value.as_str() {
let text_analyzer =
&mut self.per_field_text_analyzers[field.field_id() as usize];
text_analyzer.token_stream(text)
} else if let Some(tok_str) = value.as_pre_tokenized_text() {
} else if let Some(tok_str) = value.into_pre_tokenized_text() {
BoxTokenStream::new(PreTokenizedStream::from(*tok_str.clone()))
} else {
continue;
@@ -250,9 +221,8 @@ impl SegmentWriter {
}
FieldType::U64(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
num_vals += 1;
let u64_val = value.as_u64().ok_or_else(make_schema_error)?;
@@ -265,10 +235,8 @@ impl SegmentWriter {
}
FieldType::Date(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value_access = value_access as D::Value<'_>;
let value = value_access.as_value();
for value in values {
let value = value.as_value();
num_vals += 1;
let date_val = value.as_datetime().ok_or_else(make_schema_error)?;
@@ -282,9 +250,8 @@ impl SegmentWriter {
}
FieldType::I64(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
num_vals += 1;
let i64_val = value.as_i64().ok_or_else(make_schema_error)?;
@@ -297,10 +264,8 @@ impl SegmentWriter {
}
FieldType::F64(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
num_vals += 1;
let f64_val = value.as_f64().ok_or_else(make_schema_error)?;
term_buffer.set_f64(f64_val);
@@ -312,10 +277,8 @@ impl SegmentWriter {
}
FieldType::Bool(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
num_vals += 1;
let bool_val = value.as_bool().ok_or_else(make_schema_error)?;
term_buffer.set_bool(bool_val);
@@ -327,10 +290,8 @@ impl SegmentWriter {
}
FieldType::Bytes(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
num_vals += 1;
let bytes = value.as_bytes().ok_or_else(make_schema_error)?;
term_buffer.set_bytes(bytes);
@@ -364,9 +325,8 @@ impl SegmentWriter {
}
FieldType::IpAddr(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
num_vals += 1;
let ip_addr = value.as_ip_addr().ok_or_else(make_schema_error)?;
@@ -432,11 +392,10 @@ fn remap_and_write(
fast_field_writers: FastFieldsWriter,
fieldnorms_writer: &FieldNormsWriter,
mut serializer: SegmentSerializer,
doc_id_map: Option<&DocIdMapping>,
) -> crate::Result<()> {
debug!("remap-and-write");
if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
fieldnorms_writer.serialize(fieldnorms_serializer, doc_id_map)?;
fieldnorms_writer.serialize(fieldnorms_serializer)?;
}
let fieldnorm_data = serializer
.segment()
@@ -447,39 +406,10 @@ fn remap_and_write(
schema,
per_field_postings_writers,
fieldnorm_readers,
doc_id_map,
serializer.get_postings_serializer(),
)?;
debug!("fastfield-serialize");
fast_field_writers.serialize(serializer.get_fast_field_write(), doc_id_map)?;
// finalize the temp docstore and create a version that reflects the doc_id_map
if let Some(doc_id_map) = doc_id_map {
debug!("resort-docstore");
let store_write = serializer
.segment_mut()
.open_write(SegmentComponent::Store)?;
let settings = serializer.segment().index().settings();
let store_writer = StoreWriter::new(
store_write,
settings.docstore_compression,
settings.docstore_blocksize,
settings.docstore_compress_dedicated_thread,
)?;
let old_store_writer = std::mem::replace(&mut serializer.store_writer, store_writer);
old_store_writer.close()?;
let store_read = StoreReader::open(
serializer
.segment()
.open_read(SegmentComponent::TempStore)?,
1, /* The docstore is configured to have one doc per block, and each doc is accessed
* only once: we don't need caching. */
)?;
for old_doc_id in doc_id_map.iter_old_doc_ids() {
let doc_bytes = store_read.get_document_bytes(old_doc_id)?;
serializer.get_store_writer().store_bytes(&doc_bytes)?;
}
}
fast_field_writers.serialize(serializer.get_fast_field_write())?;
debug!("serializer-close");
serializer.close()?;

View File

@@ -222,8 +222,8 @@ pub use crate::core::{Executor, Searcher, SearcherGeneration};
pub use crate::directory::Directory;
#[allow(deprecated)] // Remove with index sorting
pub use crate::index::{
Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, InvertedIndexReader, Order,
Segment, SegmentMeta, SegmentReader,
Index, IndexBuilder, IndexMeta, IndexSettings, InvertedIndexReader, Order, Segment,
SegmentMeta, SegmentReader,
};
pub use crate::indexer::{IndexWriter, SingleSegmentIndexWriter};
pub use crate::schema::{Document, TantivyDocument, Term};
@@ -397,16 +397,20 @@ pub mod tests {
#[macro_export]
macro_rules! assert_nearly_equals {
($left:expr, $right:expr) => {{
match (&$left, &$right) {
(left_val, right_val) => {
assert_nearly_equals!($left, $right, 0.0005);
}};
($left:expr, $right:expr, $epsilon:expr) => {{
match (&$left, &$right, &$epsilon) {
(left_val, right_val, epsilon_val) => {
let diff = (left_val - right_val).abs();
let add = left_val.abs() + right_val.abs();
if diff > 0.0005 * add {
if diff > *epsilon_val {
panic!(
r#"assertion failed: `(left ~= right)`
left: `{:?}`,
right: `{:?}`"#,
&*left_val, &*right_val
r#"assertion failed: `abs(left-right)>epsilon`
left: `{:?}`,
right: `{:?}`,
epsilon: `{:?}`"#,
&*left_val, &*right_val, &*epsilon_val
)
}
}
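// Usage sketch: `assert_nearly_equals!(1.0, 1.0004)` relies on the default
// epsilon of 0.0005, while `assert_nearly_equals!(1.0, 1.1, 0.2)` supplies one.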

View File

@@ -3,7 +3,7 @@
//! In "The beauty and the beast", the term "the" appears in position 0 and position 3.
//! This information is useful to run phrase queries.
//!
//! The [position](crate::SegmentComponent::Positions) file contains all of the
//! The [position](crate::index::SegmentComponent::Positions) file contains all of the
//! bitpacked positions delta, for all terms of a given field, one term after the other.
//!
//! Each term is encoded independently.
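//! As an illustration, the positions `[3, 7, 12]` of a term within a document
//! would be stored as the deltas `[3, 4, 5]` before bitpacking.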

View File

@@ -3,7 +3,6 @@ use std::io;
use common::json_path_writer::JSON_END_OF_PATH;
use stacker::Addr;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::indexer::path_to_unordered_id::OrderedPathId;
use crate::postings::postings_writer::SpecializedPostingsWriter;
use crate::postings::recorder::{BufferLender, DocIdRecorder, Recorder};
@@ -60,9 +59,8 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
/// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize(
&self,
term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
ordered_term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
ordered_id_to_path: &[&str],
doc_id_map: Option<&DocIdMapping>,
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
) -> io::Result<()> {
@@ -71,7 +69,7 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
term_buffer.clear_with_field_and_type(Type::Json, Field::from_field_id(0));
let mut prev_term_id = u32::MAX;
let mut term_path_len = 0; // this will be set in the first iteration
for (_field, path_id, term, addr) in term_addrs {
for (_field, path_id, term, addr) in ordered_term_addrs {
if prev_term_id != path_id.path_id() {
term_buffer.truncate_value_bytes(0);
term_buffer.append_path(ordered_id_to_path[path_id.path_id() as usize].as_bytes());
@@ -87,7 +85,6 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
SpecializedPostingsWriter::<Rec>::serialize_one_term(
term_buffer.serialized_value_bytes(),
*addr,
doc_id_map,
&mut buffer_lender,
ctx,
serializer,
@@ -96,7 +93,6 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
SpecializedPostingsWriter::<DocIdRecorder>::serialize_one_term(
term_buffer.serialized_value_bytes(),
*addr,
doc_id_map,
&mut buffer_lender,
ctx,
serializer,

View File

@@ -15,6 +15,7 @@ pub trait Postings: DocSet + 'static {
fn term_freq(&self) -> u32;
/// Returns the positions offsetted with a given value.
/// It is not necessary to clear the `output` before calling this method.
/// The output vector will be resized to the `term_freq`.
fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>);
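// A minimal usage sketch, assuming `postings` is positioned on a document:
// let mut positions = Vec::new();
// postings.positions_with_offset(0, &mut positions);
// assert_eq!(positions.len() as u32, postings.term_freq());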

View File

@@ -5,7 +5,6 @@ use std::ops::Range;
use stacker::Addr;
use crate::fieldnorm::FieldNormReaders;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::indexer::path_to_unordered_id::OrderedPathId;
use crate::postings::recorder::{BufferLender, Recorder};
use crate::postings::{
@@ -50,7 +49,6 @@ pub(crate) fn serialize_postings(
schema: Schema,
per_field_postings_writers: &PerFieldPostingsWriter,
fieldnorm_readers: FieldNormReaders,
doc_id_map: Option<&DocIdMapping>,
serializer: &mut InvertedIndexSerializer,
) -> crate::Result<()> {
// Replace unordered ids by ordered ids to be able to sort
@@ -86,7 +84,6 @@ pub(crate) fn serialize_postings(
postings_writer.serialize(
&term_offsets[byte_offsets],
&ordered_id_to_path,
doc_id_map,
&ctx,
&mut field_serializer,
)?;
@@ -122,7 +119,6 @@ pub(crate) trait PostingsWriter: Send + Sync {
&self,
term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
ordered_id_to_path: &[&str],
doc_id_map: Option<&DocIdMapping>,
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
) -> io::Result<()>;
@@ -187,7 +183,6 @@ impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
pub(crate) fn serialize_one_term(
term: &[u8],
addr: Addr,
doc_id_map: Option<&DocIdMapping>,
buffer_lender: &mut BufferLender,
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
@@ -195,7 +190,7 @@ impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
let recorder: Rec = ctx.term_index.read(addr);
let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
serializer.new_term(term, term_doc_freq, recorder.has_term_freq())?;
recorder.serialize(&ctx.arena, doc_id_map, serializer, buffer_lender);
recorder.serialize(&ctx.arena, serializer, buffer_lender);
serializer.close_term()?;
Ok(())
}
@@ -229,13 +224,12 @@ impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
&self,
term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
_ordered_id_to_path: &[&str],
doc_id_map: Option<&DocIdMapping>,
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
) -> io::Result<()> {
let mut buffer_lender = BufferLender::default();
for (_field, _path_id, term, addr) in term_addrs {
Self::serialize_one_term(term, *addr, doc_id_map, &mut buffer_lender, ctx, serializer)?;
Self::serialize_one_term(term, *addr, &mut buffer_lender, ctx, serializer)?;
}
Ok(())
}

View File

@@ -1,7 +1,6 @@
use common::read_u32_vint;
use stacker::{ExpUnrolledLinkedList, MemoryArena};
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::FieldSerializer;
use crate::DocId;
@@ -71,7 +70,6 @@ pub(crate) trait Recorder: Copy + Default + Send + Sync + 'static {
fn serialize(
&self,
arena: &MemoryArena,
doc_id_map: Option<&DocIdMapping>,
serializer: &mut FieldSerializer<'_>,
buffer_lender: &mut BufferLender,
);
@@ -115,26 +113,15 @@ impl Recorder for DocIdRecorder {
fn serialize(
&self,
arena: &MemoryArena,
doc_id_map: Option<&DocIdMapping>,
serializer: &mut FieldSerializer<'_>,
buffer_lender: &mut BufferLender,
) {
let (buffer, doc_ids) = buffer_lender.lend_all();
let buffer = buffer_lender.lend_u8();
// TODO avoid reading twice.
self.stack.read_to_end(arena, buffer);
if let Some(doc_id_map) = doc_id_map {
let iter = get_sum_reader(VInt32Reader::new(&buffer[..]));
doc_ids.extend(iter.map(|old_doc_id| doc_id_map.get_new_doc_id(old_doc_id)));
doc_ids.sort_unstable();
for doc in doc_ids {
serializer.write_doc(*doc, 0u32, &[][..]);
}
} else {
let iter = get_sum_reader(VInt32Reader::new(&buffer[..]));
for doc_id in iter {
serializer.write_doc(doc_id, 0u32, &[][..]);
}
let iter = get_sum_reader(VInt32Reader::new(&buffer[..]));
for doc_id in iter {
serializer.write_doc(doc_id, 0u32, &[][..]);
}
}
@@ -194,35 +181,18 @@ impl Recorder for TermFrequencyRecorder {
fn serialize(
&self,
arena: &MemoryArena,
doc_id_map: Option<&DocIdMapping>,
serializer: &mut FieldSerializer<'_>,
buffer_lender: &mut BufferLender,
) {
let buffer = buffer_lender.lend_u8();
self.stack.read_to_end(arena, buffer);
let mut u32_it = VInt32Reader::new(&buffer[..]);
if let Some(doc_id_map) = doc_id_map {
let mut doc_id_and_tf = vec![];
let mut prev_doc = 0;
while let Some(delta_doc_id) = u32_it.next() {
let doc_id = prev_doc + delta_doc_id;
prev_doc = doc_id;
let term_freq = u32_it.next().unwrap_or(self.current_tf);
doc_id_and_tf.push((doc_id_map.get_new_doc_id(doc_id), term_freq));
}
doc_id_and_tf.sort_unstable_by_key(|&(doc_id, _)| doc_id);
for (doc_id, tf) in doc_id_and_tf {
serializer.write_doc(doc_id, tf, &[][..]);
}
} else {
let mut prev_doc = 0;
while let Some(delta_doc_id) = u32_it.next() {
let doc_id = prev_doc + delta_doc_id;
prev_doc = doc_id;
let term_freq = u32_it.next().unwrap_or(self.current_tf);
serializer.write_doc(doc_id, term_freq, &[][..]);
}
let mut prev_doc = 0;
while let Some(delta_doc_id) = u32_it.next() {
let doc_id = prev_doc + delta_doc_id;
prev_doc = doc_id;
let term_freq = u32_it.next().unwrap_or(self.current_tf);
serializer.write_doc(doc_id, term_freq, &[][..]);
}
}
@@ -268,14 +238,12 @@ impl Recorder for TfAndPositionRecorder {
fn serialize(
&self,
arena: &MemoryArena,
doc_id_map: Option<&DocIdMapping>,
serializer: &mut FieldSerializer<'_>,
buffer_lender: &mut BufferLender,
) {
let (buffer_u8, buffer_positions) = buffer_lender.lend_all();
self.stack.read_to_end(arena, buffer_u8);
let mut u32_it = VInt32Reader::new(&buffer_u8[..]);
let mut doc_id_and_positions = vec![];
let mut prev_doc = 0;
while let Some(delta_doc_id) = u32_it.next() {
let doc_id = prev_doc + delta_doc_id;
@@ -294,19 +262,7 @@ impl Recorder for TfAndPositionRecorder {
}
}
}
if let Some(doc_id_map) = doc_id_map {
// this simple variant to remap may consume too much memory
doc_id_and_positions
.push((doc_id_map.get_new_doc_id(doc_id), buffer_positions.to_vec()));
} else {
serializer.write_doc(doc_id, buffer_positions.len() as u32, buffer_positions);
}
}
if doc_id_map.is_some() {
doc_id_and_positions.sort_unstable_by_key(|&(doc_id, _)| doc_id);
for (doc_id, positions) in doc_id_and_positions {
serializer.write_doc(doc_id, positions.len() as u32, &positions);
}
serializer.write_doc(doc_id, buffer_positions.len() as u32, buffer_positions);
}
}

View File

@@ -22,10 +22,7 @@ pub struct AllWeight;
impl Weight for AllWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let all_scorer = AllScorer {
doc: 0u32,
max_doc: reader.max_doc(),
};
let all_scorer = AllScorer::new(reader.max_doc());
Ok(Box::new(BoostScorer::new(all_scorer, boost)))
}
@@ -43,6 +40,13 @@ pub struct AllScorer {
max_doc: DocId,
}
impl AllScorer {
/// Creates a new AllScorer with `max_doc` docs.
pub fn new(max_doc: DocId) -> AllScorer {
AllScorer { doc: 0u32, max_doc }
}
}
impl DocSet for AllScorer {
#[inline(always)]
fn advance(&mut self) -> DocId {

View File

@@ -66,6 +66,10 @@ use crate::schema::{IndexRecordOption, Term};
/// Term::from_field_text(title, "diary"),
/// IndexRecordOption::Basic,
/// ));
/// let cow_term_query: Box<dyn Query> = Box::new(TermQuery::new(
/// Term::from_field_text(title, "cow"),
/// IndexRecordOption::Basic
/// ));
/// // A TermQuery with "found" in the body
/// let body_term_query: Box<dyn Query> = Box::new(TermQuery::new(
/// Term::from_field_text(body, "found"),
@@ -74,7 +78,7 @@ use crate::schema::{IndexRecordOption, Term};
/// // TermQuery "diary" must and "girl" must not be present
/// let queries_with_occurs1 = vec![
/// (Occur::Must, diary_term_query.box_clone()),
/// (Occur::MustNot, girl_term_query),
/// (Occur::MustNot, girl_term_query.box_clone()),
/// ];
/// // Make a BooleanQuery equivalent to
/// // title:+diary title:-girl
@@ -82,15 +86,10 @@ use crate::schema::{IndexRecordOption, Term};
/// let count1 = searcher.search(&diary_must_and_girl_mustnot, &Count)?;
/// assert_eq!(count1, 1);
///
/// // TermQuery for "cow" in the title
/// let cow_term_query: Box<dyn Query> = Box::new(TermQuery::new(
/// Term::from_field_text(title, "cow"),
/// IndexRecordOption::Basic,
/// ));
/// // "title:diary OR title:cow"
/// let title_diary_or_cow = BooleanQuery::new(vec![
/// (Occur::Should, diary_term_query.box_clone()),
/// (Occur::Should, cow_term_query),
/// (Occur::Should, cow_term_query.box_clone()),
/// ]);
/// let count2 = searcher.search(&title_diary_or_cow, &Count)?;
/// assert_eq!(count2, 4);
@@ -118,21 +117,38 @@ use crate::schema::{IndexRecordOption, Term};
/// ]);
/// let count4 = searcher.search(&nested_query, &Count)?;
/// assert_eq!(count4, 1);
///
/// // You may call `with_minimum_required_clauses` to
/// // specify the number of should clauses the returned documents must match.
/// let minimum_required_query = BooleanQuery::with_minimum_required_clauses(vec![
/// (Occur::Should, cow_term_query.box_clone()),
/// (Occur::Should, girl_term_query.box_clone()),
/// (Occur::Should, diary_term_query.box_clone()),
/// ], 2);
/// // Returns documents containing "Diary Cow", "Diary Girl" or "Cow Girl"
/// // Notice: "Diary" isn't "Dairy". ;-)
/// let count5 = searcher.search(&minimum_required_query, &Count)?;
/// assert_eq!(count5, 1);
/// Ok(())
/// }
/// ```
#[derive(Debug)]
pub struct BooleanQuery {
subqueries: Vec<(Occur, Box<dyn Query>)>,
minimum_number_should_match: usize,
}
impl Clone for BooleanQuery {
fn clone(&self) -> Self {
self.subqueries
let subqueries = self
.subqueries
.iter()
.map(|(occur, subquery)| (*occur, subquery.box_clone()))
.collect::<Vec<_>>()
.into()
.collect::<Vec<_>>();
Self {
subqueries,
minimum_number_should_match: self.minimum_number_should_match,
}
}
}
@@ -149,8 +165,9 @@ impl Query for BooleanQuery {
.iter()
.map(|(occur, subquery)| Ok((*occur, subquery.weight(enable_scoring)?)))
.collect::<crate::Result<_>>()?;
Ok(Box::new(BooleanWeight::new(
Ok(Box::new(BooleanWeight::with_minimum_number_should_match(
sub_weights,
self.minimum_number_should_match,
enable_scoring.is_scoring_enabled(),
Box::new(SumWithCoordsCombiner::default),
)))
@@ -166,7 +183,41 @@ impl Query for BooleanQuery {
impl BooleanQuery {
/// Creates a new boolean query.
pub fn new(subqueries: Vec<(Occur, Box<dyn Query>)>) -> BooleanQuery {
BooleanQuery { subqueries }
// If the bool query includes at least one Should clause and no Must or MustNot
// clauses, the default value is 1. Otherwise, the default value is 0.
// This matches Elasticsearch's behavior.
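// e.g. [Should, Should] defaults to 1, while [Must, Should] defaults to 0.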
let mut minimum_required = 0;
for (occur, _) in &subqueries {
match occur {
Occur::Should => minimum_required = 1,
Occur::Must | Occur::MustNot => {
minimum_required = 0;
break;
}
}
}
Self::with_minimum_required_clauses(subqueries, minimum_required)
}
/// Create a new boolean query with minimum number of required should clauses specified.
pub fn with_minimum_required_clauses(
subqueries: Vec<(Occur, Box<dyn Query>)>,
minimum_number_should_match: usize,
) -> BooleanQuery {
BooleanQuery {
subqueries,
minimum_number_should_match,
}
}
/// Getter for `minimum_number_should_match`
pub fn get_minimum_number_should_match(&self) -> usize {
self.minimum_number_should_match
}
/// Setter for `minimum_number_should_match`
pub fn set_minimum_number_should_match(&mut self, minimum_number_should_match: usize) {
self.minimum_number_should_match = minimum_number_should_match;
}
/// Returns the intersection of the queries.
@@ -181,6 +232,18 @@ impl BooleanQuery {
BooleanQuery::new(subqueries)
}
/// Returns the union of the queries with minimum required clause.
pub fn union_with_minimum_required_clauses(
queries: Vec<Box<dyn Query>>,
minimum_required_clauses: usize,
) -> BooleanQuery {
let subqueries = queries
.into_iter()
.map(|sub_query| (Occur::Should, sub_query))
.collect();
BooleanQuery::with_minimum_required_clauses(subqueries, minimum_required_clauses)
}
/// Helper method to create a boolean query matching a given list of terms.
/// The resulting query is a disjunction of the terms.
pub fn new_multiterms_query(terms: Vec<Term>) -> BooleanQuery {
@@ -203,11 +266,13 @@ impl BooleanQuery {
#[cfg(test)]
mod tests {
use std::collections::HashSet;
use super::BooleanQuery;
use crate::collector::{Count, DocSetCollector};
use crate::query::{QueryClone, QueryParser, TermQuery};
use crate::schema::{IndexRecordOption, Schema, TEXT};
use crate::{DocAddress, Index, Term};
use crate::query::{Query, QueryClone, QueryParser, TermQuery};
use crate::schema::{Field, IndexRecordOption, Schema, TEXT};
use crate::{DocAddress, DocId, Index, Term};
fn create_test_index() -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
@@ -223,6 +288,73 @@ mod tests {
Ok(index)
}
#[test]
fn test_minimum_required() -> crate::Result<()> {
fn create_test_index_with<T: IntoIterator<Item = &'static str>>(
docs: T,
) -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests()?;
for doc in docs {
writer.add_document(doc!(text => doc))?;
}
writer.commit()?;
Ok(index)
}
fn create_boolean_query_with_mr<T: IntoIterator<Item = &'static str>>(
queries: T,
field: Field,
mr: usize,
) -> BooleanQuery {
let terms = queries
.into_iter()
.map(|t| Term::from_field_text(field, t))
.map(|t| TermQuery::new(t, IndexRecordOption::Basic))
.map(|q| -> Box<dyn Query> { Box::new(q) })
.collect();
BooleanQuery::union_with_minimum_required_clauses(terms, mr)
}
fn check_doc_id<T: IntoIterator<Item = DocId>>(
expected: T,
actually: HashSet<DocAddress>,
seg: u32,
) {
assert_eq!(
actually,
expected
.into_iter()
.map(|id| DocAddress::new(seg, id))
.collect()
);
}
let index = create_test_index_with(["a b c", "a c e", "d f g", "z z z", "c i b"])?;
let searcher = index.reader()?.searcher();
let text = index.schema().get_field("text").unwrap();
// Documents containing 'a c', 'a z', 'a i', 'c z', 'c i' or 'z i' shall be returned.
let q1 = create_boolean_query_with_mr(["a", "c", "z", "i"], text, 2);
let docs = searcher.search(&q1, &DocSetCollector)?;
check_doc_id([0, 1, 4], docs, 0);
// Documents containing 'a b c', 'a b e', 'a c e' or 'b c e' shall be returned.
let q2 = create_boolean_query_with_mr(["a", "b", "c", "e"], text, 3);
let docs = searcher.search(&q2, &DocSetCollector)?;
check_doc_id([0, 1], docs, 0);
// Nothing is returned since minimum_required is too large.
let q3 = create_boolean_query_with_mr(["a", "b"], text, 3);
let docs = searcher.search(&q3, &DocSetCollector)?;
assert!(docs.is_empty());
// When mr is set to zero or one, there is no difference from a plain boolean union.
let q4 = create_boolean_query_with_mr(["a", "z"], text, 1);
let docs = searcher.search(&q4, &DocSetCollector)?;
check_doc_id([0, 1, 3], docs, 0);
let q5 = create_boolean_query_with_mr(["a", "b"], text, 0);
let docs = searcher.search(&q5, &DocSetCollector)?;
check_doc_id([0, 1, 4], docs, 0);
Ok(())
}
#[test]
fn test_union() -> crate::Result<()> {
let index = create_test_index()?;

View File

@@ -3,6 +3,7 @@ use std::collections::HashMap;
use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
use crate::index::SegmentReader;
use crate::postings::FreqReadingOption;
use crate::query::disjunction::Disjunction;
use crate::query::explanation::does_not_match;
use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
use crate::query::term_query::TermScorer;
@@ -18,6 +19,26 @@ enum SpecializedScorer {
Other(Box<dyn Scorer>),
}
fn scorer_disjunction<TScoreCombiner>(
scorers: Vec<Box<dyn Scorer>>,
score_combiner: TScoreCombiner,
minimum_match_required: usize,
) -> Box<dyn Scorer>
where
TScoreCombiner: ScoreCombiner,
{
debug_assert!(!scorers.is_empty());
debug_assert!(minimum_match_required > 1);
if scorers.len() == 1 {
return scorers.into_iter().next().unwrap(); // Safe unwrap.
}
Box::new(Disjunction::new(
scorers,
score_combiner,
minimum_match_required,
))
}
fn scorer_union<TScoreCombiner>(
scorers: Vec<Box<dyn Scorer>>,
score_combiner_fn: impl Fn() -> TScoreCombiner,
@@ -70,6 +91,7 @@ fn into_box_scorer<TScoreCombiner: ScoreCombiner>(
/// Weight associated to the `BoolQuery`.
pub struct BooleanWeight<TScoreCombiner: ScoreCombiner> {
weights: Vec<(Occur, Box<dyn Weight>)>,
minimum_number_should_match: usize,
scoring_enabled: bool,
score_combiner_fn: Box<dyn Fn() -> TScoreCombiner + Sync + Send>,
}
@@ -85,6 +107,22 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
weights,
scoring_enabled,
score_combiner_fn,
minimum_number_should_match: 1,
}
}
/// Create a new boolean weight with minimum number of required should clauses specified.
pub fn with_minimum_number_should_match(
weights: Vec<(Occur, Box<dyn Weight>)>,
minimum_number_should_match: usize,
scoring_enabled: bool,
score_combiner_fn: Box<dyn Fn() -> TScoreCombiner + Sync + Send + 'static>,
) -> BooleanWeight<TScoreCombiner> {
BooleanWeight {
weights,
minimum_number_should_match,
scoring_enabled,
score_combiner_fn,
}
}
@@ -111,43 +149,89 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
score_combiner_fn: impl Fn() -> TComplexScoreCombiner,
) -> crate::Result<SpecializedScorer> {
let mut per_occur_scorers = self.per_occur_scorers(reader, boost)?;
let should_scorer_opt: Option<SpecializedScorer> = per_occur_scorers
.remove(&Occur::Should)
.map(|scorers| scorer_union(scorers, &score_combiner_fn));
// Indicates how the Should clauses are combined with the other clauses.
enum CombinationMethod {
Ignored,
// Only contributes to final score.
Optional(SpecializedScorer),
// Must be satisfied.
Required(Box<dyn Scorer>),
}
let mut must_scorers = per_occur_scorers.remove(&Occur::Must);
let should_opt = if let Some(mut should_scorers) = per_occur_scorers.remove(&Occur::Should)
{
let num_of_should_scorers = should_scorers.len();
if self.minimum_number_should_match > num_of_should_scorers {
return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)));
}
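// 0 => contributes to scores only; 1 => a plain union is required; n equal to
// the number of Should clauses => treat them as Must clauses; otherwise => a
// disjunction with a match threshold.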
match self.minimum_number_should_match {
0 => CombinationMethod::Optional(scorer_union(should_scorers, &score_combiner_fn)),
1 => CombinationMethod::Required(into_box_scorer(
scorer_union(should_scorers, &score_combiner_fn),
&score_combiner_fn,
)),
n if num_of_should_scorers == n => {
// When minimum_number_should_match equals the number of Should clauses,
// they are no different from Must clauses.
must_scorers = match must_scorers.take() {
Some(mut must_scorers) => {
must_scorers.append(&mut should_scorers);
Some(must_scorers)
}
None => Some(should_scorers),
};
CombinationMethod::Ignored
}
_ => CombinationMethod::Required(scorer_disjunction(
should_scorers,
score_combiner_fn(),
self.minimum_number_should_match,
)),
}
} else {
// No Should clauses were provided.
if self.minimum_number_should_match > 0 {
return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)));
} else {
CombinationMethod::Ignored
}
};
let exclude_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers
.remove(&Occur::MustNot)
.map(|scorers| scorer_union(scorers, DoNothingCombiner::default))
.map(|specialized_scorer| {
.map(|specialized_scorer: SpecializedScorer| {
into_box_scorer(specialized_scorer, DoNothingCombiner::default)
});
let must_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers
.remove(&Occur::Must)
.map(intersect_scorers);
let positive_scorer: SpecializedScorer = match (should_scorer_opt, must_scorer_opt) {
(Some(should_scorer), Some(must_scorer)) => {
let positive_scorer = match (should_opt, must_scorers) {
(CombinationMethod::Ignored, Some(must_scorers)) => {
SpecializedScorer::Other(intersect_scorers(must_scorers))
}
(CombinationMethod::Optional(should_scorer), Some(must_scorers)) => {
let must_scorer = intersect_scorers(must_scorers);
if self.scoring_enabled {
SpecializedScorer::Other(Box::new(RequiredOptionalScorer::<
Box<dyn Scorer>,
Box<dyn Scorer>,
TComplexScoreCombiner,
>::new(
must_scorer,
into_box_scorer(should_scorer, &score_combiner_fn),
)))
SpecializedScorer::Other(Box::new(
RequiredOptionalScorer::<_, _, TScoreCombiner>::new(
must_scorer,
into_box_scorer(should_scorer, &score_combiner_fn),
),
))
} else {
SpecializedScorer::Other(must_scorer)
}
}
(None, Some(must_scorer)) => SpecializedScorer::Other(must_scorer),
(Some(should_scorer), None) => should_scorer,
(None, None) => {
return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)));
(CombinationMethod::Required(should_scorer), Some(mut must_scorers)) => {
must_scorers.push(should_scorer);
SpecializedScorer::Other(intersect_scorers(must_scorers))
}
(CombinationMethod::Ignored, None) => {
return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)))
}
(CombinationMethod::Required(should_scorer), None) => {
SpecializedScorer::Other(should_scorer)
}
// Optional scorers are promoted to required if no Must scorers exist.
(CombinationMethod::Optional(should_scorer), None) => should_scorer,
};
if let Some(exclude_scorer) = exclude_scorer_opt {
let positive_scorer_boxed = into_box_scorer(positive_scorer, &score_combiner_fn);
Ok(SpecializedScorer::Other(Box::new(Exclude::new(

src/query/disjunction.rs Normal file
View File

@@ -0,0 +1,327 @@
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use crate::query::score_combiner::DoNothingCombiner;
use crate::query::{ScoreCombiner, Scorer};
use crate::{DocId, DocSet, Score, TERMINATED};
/// `Disjunction` is responsible for merging `DocSet`s from multiple
/// sources. Specifically, it takes the union of two or more `DocSet`s,
/// then filters out elements that appear fewer times than a
/// specified threshold.
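/// For example, the doc sets `[1, 3, 5]` and `[1, 2, 3]` with a threshold of 2
/// yield `[1, 3]`.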
pub struct Disjunction<TScorer, TScoreCombiner = DoNothingCombiner> {
chains: BinaryHeap<ScorerWrapper<TScorer>>,
minimum_matches_required: usize,
score_combiner: TScoreCombiner,
current_doc: DocId,
current_score: Score,
}
/// A wrapper around a `Scorer` that caches the current `doc_id` and implements the `DocSet` trait.
/// The `Ord` trait and its relatives are implemented in reverse order, so that a
/// `std::BinaryHeap<ScorerWrapper<T>>` behaves as a min-heap keyed on the current doc id.
struct ScorerWrapper<T> {
scorer: T,
current_doc: DocId,
}
impl<T: Scorer> ScorerWrapper<T> {
fn new(scorer: T) -> Self {
let current_doc = scorer.doc();
Self {
scorer,
current_doc,
}
}
}
impl<T: Scorer> PartialEq for ScorerWrapper<T> {
fn eq(&self, other: &Self) -> bool {
self.doc() == other.doc()
}
}
impl<T: Scorer> Eq for ScorerWrapper<T> {}
impl<T: Scorer> PartialOrd for ScorerWrapper<T> {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl<T: Scorer> Ord for ScorerWrapper<T> {
fn cmp(&self, other: &Self) -> Ordering {
self.doc().cmp(&other.doc()).reverse()
}
}
impl<T: Scorer> DocSet for ScorerWrapper<T> {
fn advance(&mut self) -> DocId {
let doc_id = self.scorer.advance();
self.current_doc = doc_id;
doc_id
}
fn doc(&self) -> DocId {
self.current_doc
}
fn size_hint(&self) -> u32 {
self.scorer.size_hint()
}
}
impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Disjunction<TScorer, TScoreCombiner> {
pub fn new<T: IntoIterator<Item = TScorer>>(
docsets: T,
score_combiner: TScoreCombiner,
minimum_matches_required: usize,
) -> Self {
debug_assert!(
minimum_matches_required > 1,
"union scorer works better if just one matches required"
);
let chains = docsets
.into_iter()
.map(|doc| ScorerWrapper::new(doc))
.collect();
let mut disjunction = Self {
chains,
score_combiner,
current_doc: TERMINATED,
minimum_matches_required,
current_score: 0.0,
};
if minimum_matches_required > disjunction.chains.len() {
return disjunction;
}
disjunction.advance();
disjunction
}
}
impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> DocSet
for Disjunction<TScorer, TScoreCombiner>
{
fn advance(&mut self) -> DocId {
let mut current_num_matches = 0;
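// Scorers sit in a min-heap keyed on doc id. We count how many scorers are on
// `self.current_doc` and emit it once the heap moves past it with enough matches.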
while let Some(mut candidate) = self.chains.pop() {
let next = candidate.doc();
if next != TERMINATED {
// Peek next doc.
if self.current_doc != next {
if current_num_matches >= self.minimum_matches_required {
self.chains.push(candidate);
self.current_score = self.score_combiner.score();
return self.current_doc;
}
// Reset current_num_matches and scores.
current_num_matches = 0;
self.current_doc = next;
self.score_combiner.clear();
}
current_num_matches += 1;
self.score_combiner.update(&mut candidate.scorer);
candidate.advance();
self.chains.push(candidate);
}
}
if current_num_matches < self.minimum_matches_required {
self.current_doc = TERMINATED;
}
self.current_score = self.score_combiner.score();
self.current_doc
}
#[inline]
fn doc(&self) -> DocId {
self.current_doc
}
fn size_hint(&self) -> u32 {
self.chains
.iter()
.map(|docset| docset.size_hint())
.max()
.unwrap_or(0u32)
}
}
impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Scorer
for Disjunction<TScorer, TScoreCombiner>
{
fn score(&mut self) -> Score {
self.current_score
}
}
#[cfg(test)]
mod tests {
use std::collections::BTreeMap;
use super::Disjunction;
use crate::query::score_combiner::DoNothingCombiner;
use crate::query::{ConstScorer, Scorer, SumCombiner, VecDocSet};
use crate::{DocId, DocSet, Score, TERMINATED};
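// Reference implementation: keep the elements that occur in at least `pass_line`
// of the input arrays (assuming each array holds distinct values).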
fn conjunct<T: Ord + Copy>(arrays: &[Vec<T>], pass_line: usize) -> Vec<T> {
let mut counts = BTreeMap::new();
for array in arrays {
for &element in array {
*counts.entry(element).or_insert(0) += 1;
}
}
counts
.iter()
.filter_map(|(&element, &count)| {
if count >= pass_line {
Some(element)
} else {
None
}
})
.collect()
}
fn aux_test_conjunction(vals: Vec<Vec<u32>>, min_match: usize) {
let mut union_expected = VecDocSet::from(conjunct(&vals, min_match));
let make_scorer = || {
Disjunction::new(
vals.iter()
.cloned()
.map(VecDocSet::from)
.map(|d| ConstScorer::new(d, 1.0)),
DoNothingCombiner,
min_match,
)
};
let mut scorer: Disjunction<_, DoNothingCombiner> = make_scorer();
let mut count = 0;
while scorer.doc() != TERMINATED {
assert_eq!(union_expected.doc(), scorer.doc());
assert_eq!(union_expected.advance(), scorer.advance());
count += 1;
}
assert_eq!(union_expected.advance(), TERMINATED);
assert_eq!(count, make_scorer().count_including_deleted());
}
#[should_panic]
#[test]
fn test_arg_check1() {
aux_test_conjunction(vec![], 0);
}
#[should_panic]
#[test]
fn test_arg_check2() {
aux_test_conjunction(vec![], 1);
}
#[test]
fn test_corner_case() {
aux_test_conjunction(vec![], 2);
aux_test_conjunction(vec![vec![]; 1000], 2);
aux_test_conjunction(vec![vec![]; 100], usize::MAX);
aux_test_conjunction(vec![vec![0xC0FFEE]; 10000], usize::MAX);
aux_test_conjunction((1..10000u32).map(|i| vec![i]).collect::<Vec<_>>(), 2);
}
#[test]
fn test_conjunction() {
aux_test_conjunction(
vec![
vec![1, 3333, 100000000u32],
vec![1, 2, 100000000u32],
vec![1, 2, 100000000u32],
],
2,
);
aux_test_conjunction(
vec![vec![8], vec![3, 4, 0xC0FFEEu32], vec![1, 2, 100000000u32]],
2,
);
aux_test_conjunction(
vec![
vec![1, 3333, 100000000u32],
vec![1, 2, 100000000u32],
vec![1, 2, 100000000u32],
],
3,
)
}
// A dummy scorer that yields increasing doc ids, each with the score
// recorded for it (1.0 in the tests below).
#[derive(Clone)]
struct DummyScorer {
cursor: usize,
foo: Vec<(DocId, f32)>,
}
impl DummyScorer {
fn new(doc_score: Vec<(DocId, f32)>) -> Self {
Self {
cursor: 0,
foo: doc_score,
}
}
}
impl DocSet for DummyScorer {
fn advance(&mut self) -> DocId {
self.cursor += 1;
self.doc()
}
fn doc(&self) -> DocId {
self.foo.get(self.cursor).map(|x| x.0).unwrap_or(TERMINATED)
}
fn size_hint(&self) -> u32 {
self.foo.len() as u32
}
}
impl Scorer for DummyScorer {
fn score(&mut self) -> Score {
self.foo.get(self.cursor).map(|x| x.1).unwrap_or(0.0)
}
}
#[test]
fn test_score_calculate() {
let mut scorer = Disjunction::new(
vec![
DummyScorer::new(vec![(1, 1f32), (2, 1f32)]),
DummyScorer::new(vec![(1, 1f32), (3, 1f32)]),
DummyScorer::new(vec![(1, 1f32), (4, 1f32)]),
DummyScorer::new(vec![(1, 1f32), (2, 1f32)]),
DummyScorer::new(vec![(1, 1f32), (2, 1f32)]),
],
SumCombiner::default(),
3,
);
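// Doc 1 is matched by all five scorers (sum = 5.0); doc 2 by three of them
// (sum = 3.0); docs 3 and 4 are matched by a single scorer each and are
// skipped because they fall below `minimum_matches_required`.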
assert_eq!(scorer.score(), 5.0);
assert_eq!(scorer.advance(), 2);
assert_eq!(scorer.score(), 3.0);
}
#[test]
fn test_score_calculate_corner_case() {
let mut scorer = Disjunction::new(
vec![
DummyScorer::new(vec![(1, 1f32), (2, 1f32)]),
DummyScorer::new(vec![(1, 1f32), (3, 1f32)]),
DummyScorer::new(vec![(1, 1f32), (3, 1f32)]),
],
SumCombiner::default(),
2,
);
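// Doc 1 is matched by all three scorers (sum = 3.0), doc 3 by two of them
// (sum = 2.0); doc 2 has a single match and is skipped.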
assert_eq!(scorer.doc(), 1);
assert_eq!(scorer.score(), 3.0);
assert_eq!(scorer.advance(), 3);
assert_eq!(scorer.score(), 2.0);
}
}

View File

@@ -149,7 +149,7 @@ mod tests {
use crate::query::exist_query::ExistsQuery;
use crate::query::{BooleanQuery, RangeQuery};
use crate::schema::{Facet, FacetOptions, Schema, FAST, INDEXED, STRING, TEXT};
use crate::{Index, Searcher};
use crate::{Index, Searcher, Term};
#[test]
fn test_exists_query_simple() -> crate::Result<()> {
@@ -188,9 +188,8 @@ mod tests {
// exercise seek
let query = BooleanQuery::intersection(vec![
Box::new(RangeQuery::new_u64_bounds(
"all".to_string(),
Bound::Included(50),
Box::new(RangeQuery::new(
Bound::Included(Term::from_field_u64(all_field, 50)),
Bound::Unbounded,
)),
Box::new(ExistsQuery::new_exists_query("even".to_string())),
@@ -198,10 +197,9 @@ mod tests {
assert_eq!(searcher.search(&query, &Count)?, 25);
let query = BooleanQuery::intersection(vec![
Box::new(RangeQuery::new_u64_bounds(
"all".to_string(),
Bound::Included(0),
Bound::Excluded(50),
Box::new(RangeQuery::new(
Bound::Included(Term::from_field_u64(all_field, 0)),
Bound::Included(Term::from_field_u64(all_field, 50)),
)),
Box::new(ExistsQuery::new_exists_query("odd".to_string())),
]);

View File

@@ -5,6 +5,7 @@ mod bm25;
mod boolean_query;
mod boost_query;
mod const_score_query;
mod disjunction;
mod disjunction_max_query;
mod empty_query;
mod exclude;
@@ -53,7 +54,7 @@ pub use self::phrase_prefix_query::PhrasePrefixQuery;
pub use self::phrase_query::PhraseQuery;
pub use self::query::{EnableScoring, Query, QueryClone};
pub use self::query_parser::{QueryParser, QueryParserError};
pub use self::range_query::{FastFieldRangeWeight, IPFastFieldRangeWeight, RangeQuery};
pub use self::range_query::{FastFieldRangeWeight, RangeQuery};
pub use self::regex_query::RegexQuery;
pub use self::reqopt_scorer::RequiredOptionalScorer;
pub use self::score_combiner::{

View File

@@ -145,15 +145,7 @@ impl Query for PhrasePrefixQuery {
Bound::Unbounded
};
let mut range_query = RangeQuery::new_term_bounds(
enable_scoring
.schema()
.get_field_name(self.field)
.to_owned(),
self.prefix.1.typ(),
&Bound::Included(self.prefix.1.clone()),
&end_term,
);
let mut range_query = RangeQuery::new(Bound::Included(self.prefix.1.clone()), end_term);
range_query.limit(self.max_expansions as u64);
range_query.weight(enable_scoring)
}

View File

@@ -97,6 +97,7 @@ pub struct PhrasePrefixScorer<TPostings: Postings> {
suffixes: Vec<TPostings>,
suffix_offset: u32,
phrase_count: u32,
suffix_position_buffer: Vec<u32>,
}
impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
@@ -140,6 +141,7 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
suffixes,
suffix_offset: (max_offset - suffix_pos) as u32,
phrase_count: 0,
suffix_position_buffer: Vec::with_capacity(100),
};
if phrase_prefix_scorer.doc() != TERMINATED && !phrase_prefix_scorer.matches_prefix() {
phrase_prefix_scorer.advance();
@@ -153,7 +155,6 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
fn matches_prefix(&mut self) -> bool {
let mut count = 0;
let mut positions = Vec::new();
let current_doc = self.doc();
let pos_matching = self.phrase_scorer.get_intersection();
for suffix in &mut self.suffixes {
@@ -162,8 +163,8 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
}
let doc = suffix.seek(current_doc);
if doc == current_doc {
suffix.positions_with_offset(self.suffix_offset, &mut positions);
count += intersection_count(pos_matching, &positions);
suffix.positions_with_offset(self.suffix_offset, &mut self.suffix_position_buffer);
count += intersection_count(pos_matching, &self.suffix_position_buffer);
}
}
self.phrase_count = count as u32;

View File

@@ -2,7 +2,7 @@ use std::fmt;
use std::ops::Bound;
use crate::query::Occur;
use crate::schema::{Field, Term, Type};
use crate::schema::Term;
use crate::Score;
#[derive(Clone)]
@@ -14,14 +14,10 @@ pub enum LogicalLiteral {
prefix: bool,
},
Range {
field: String,
value_type: Type,
lower: Bound<Term>,
upper: Bound<Term>,
},
Set {
field: Field,
value_type: Type,
elements: Vec<Term>,
},
All,

View File

@@ -790,8 +790,6 @@ impl QueryParser {
let (field, json_path) = try_tuple!(self
.split_full_path(&full_path)
.ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone())));
let field_entry = self.schema.get_field_entry(field);
let value_type = field_entry.field_type().value_type();
let mut errors = Vec::new();
let lower = match self.resolve_bound(field, json_path, &lower) {
Ok(bound) => bound,
@@ -812,12 +810,8 @@ impl QueryParser {
// we failed to parse something. Either way, there is no point emitting it
return (None, errors);
}
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Range {
field: self.schema.get_field_name(field).to_string(),
value_type,
lower,
upper,
}));
let logical_ast =
LogicalAst::Leaf(Box::new(LogicalLiteral::Range { lower, upper }));
(Some(logical_ast), errors)
}
UserInputLeaf::Set {
@@ -832,17 +826,11 @@ impl QueryParser {
let (field, json_path) = try_tuple!(self
.split_full_path(&full_path)
.ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone())));
let field_entry = self.schema.get_field_entry(field);
let value_type = field_entry.field_type().value_type();
let (elements, errors) = elements
.into_iter()
.map(|element| self.compute_boundary_term(field, json_path, &element))
.partition_result();
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Set {
elements,
field,
value_type,
}));
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Set { elements }));
(Some(logical_ast), errors)
}
UserInputLeaf::Exists { .. } => (
@@ -890,14 +878,7 @@ fn convert_literal_to_query(
Box::new(PhraseQuery::new_with_offset_and_slop(terms, slop))
}
}
LogicalLiteral::Range {
field,
value_type,
lower,
upper,
} => Box::new(RangeQuery::new_term_bounds(
field, value_type, &lower, &upper,
)),
LogicalLiteral::Range { lower, upper } => Box::new(RangeQuery::new(lower, upper)),
LogicalLiteral::Set { elements, .. } => Box::new(TermSetQuery::new(elements)),
LogicalLiteral::All => Box::new(AllQuery),
}
@@ -1142,8 +1123,8 @@ mod test {
let query = make_query_parser().parse_query("title:[A TO B]").unwrap();
assert_eq!(
format!("{query:?}"),
"RangeQuery { field: \"title\", value_type: Str, lower_bound: Included([97]), \
upper_bound: Included([98]), limit: None }"
"RangeQuery { lower_bound: Included(Term(field=0, type=Str, \"a\")), upper_bound: \
Included(Term(field=0, type=Str, \"b\")), limit: None }"
);
}
@@ -1821,7 +1802,8 @@ mod test {
\"bad\"))], prefix: (2, Term(field=0, type=Str, \"wo\")), max_expansions: 50 }), \
(Should, PhrasePrefixQuery { field: Field(1), phrase_terms: [(0, Term(field=1, \
type=Str, \"big\")), (1, Term(field=1, type=Str, \"bad\"))], prefix: (2, \
Term(field=1, type=Str, \"wo\")), max_expansions: 50 })] }"
Term(field=1, type=Str, \"wo\")), max_expansions: 50 })], \
minimum_number_should_match: 1 }"
);
}
@@ -1886,7 +1868,8 @@ mod test {
format!("{query:?}"),
"BooleanQuery { subqueries: [(Should, FuzzyTermQuery { term: Term(field=0, \
type=Str, \"abc\"), distance: 1, transposition_cost_one: true, prefix: false }), \
(Should, TermQuery(Term(field=1, type=Str, \"abc\")))] }"
(Should, TermQuery(Term(field=1, type=Str, \"abc\")))], \
minimum_number_should_match: 1 }"
);
}
@@ -1903,7 +1886,8 @@ mod test {
format!("{query:?}"),
"BooleanQuery { subqueries: [(Should, TermQuery(Term(field=0, type=Str, \
\"abc\"))), (Should, FuzzyTermQuery { term: Term(field=1, type=Str, \"abc\"), \
distance: 2, transposition_cost_one: false, prefix: true })] }"
distance: 2, transposition_cost_one: false, prefix: true })], \
minimum_number_should_match: 1 }"
);
}
}

View File

@@ -180,10 +180,12 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
#[cfg(test)]
mod tests {
use std::ops::Bound;
use crate::collector::Count;
use crate::directory::RamDirectory;
use crate::query::RangeQuery;
use crate::{schema, IndexBuilder, TantivyDocument};
use crate::{schema, IndexBuilder, TantivyDocument, Term};
#[test]
fn range_query_fast_optional_field_minimum() {
@@ -218,10 +220,9 @@ mod tests {
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let query = RangeQuery::new_u64_bounds(
"score".to_string(),
std::ops::Bound::Included(70),
std::ops::Bound::Unbounded,
let query = RangeQuery::new(
Bound::Included(Term::from_field_u64(score_field, 70)),
Bound::Unbounded,
);
let count = searcher.search(&query, &Count).unwrap();

View File

@@ -2,21 +2,19 @@ use std::ops::Bound;
use crate::schema::Type;
mod fast_field_range_query;
mod fast_field_range_doc_set;
mod range_query;
mod range_query_ip_fastfield;
mod range_query_u64_fastfield;
pub use self::range_query::RangeQuery;
pub use self::range_query_ip_fastfield::IPFastFieldRangeWeight;
pub use self::range_query_u64_fastfield::FastFieldRangeWeight;
// TODO is this correct?
pub(crate) fn is_type_valid_for_fastfield_range_query(typ: Type) -> bool {
match typ {
Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true,
Type::Str | Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true,
Type::IpAddr => true,
Type::Str | Type::Facet | Type::Bytes | Type::Json => false,
Type::Facet | Type::Bytes | Type::Json => false,
}
}

View File

@@ -1,21 +1,17 @@
use std::io;
use std::net::Ipv6Addr;
use std::ops::{Bound, Range};
use std::ops::Bound;
use columnar::MonotonicallyMappableToU128;
use common::{BinarySerializable, BitSet};
use common::BitSet;
use super::map_bound;
use super::range_query_u64_fastfield::FastFieldRangeWeight;
use crate::error::TantivyError;
use crate::index::SegmentReader;
use crate::query::explanation::does_not_match;
use crate::query::range_query::range_query_ip_fastfield::IPFastFieldRangeWeight;
use crate::query::range_query::{is_type_valid_for_fastfield_range_query, map_bound_res};
use crate::query::range_query::is_type_valid_for_fastfield_range_query;
use crate::query::{BitSetDocSet, ConstScorer, EnableScoring, Explanation, Query, Scorer, Weight};
use crate::schema::{Field, IndexRecordOption, Term, Type};
use crate::termdict::{TermDictionary, TermStreamer};
use crate::{DateTime, DocId, Score};
use crate::{DocId, Score};
/// `RangeQuery` matches all documents that have at least one term within a defined range.
///
@@ -40,8 +36,10 @@ use crate::{DateTime, DocId, Score};
/// ```rust
/// use tantivy::collector::Count;
/// use tantivy::query::RangeQuery;
/// use tantivy::Term;
/// use tantivy::schema::{Schema, INDEXED};
/// use tantivy::{doc, Index, IndexWriter};
/// use std::ops::Bound;
/// # fn test() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let year_field = schema_builder.add_u64_field("year", INDEXED);
@@ -59,7 +57,10 @@ use crate::{DateTime, DocId, Score};
///
/// let reader = index.reader()?;
/// let searcher = reader.searcher();
/// let docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960..1970);
/// let docs_in_the_sixties = RangeQuery::new(
/// Bound::Included(Term::from_field_u64(year_field, 1960)),
/// Bound::Excluded(Term::from_field_u64(year_field, 1970)),
/// );
/// let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?;
/// assert_eq!(num_60s_books, 2285);
/// Ok(())
@@ -68,246 +69,46 @@ use crate::{DateTime, DocId, Score};
/// ```
#[derive(Clone, Debug)]
pub struct RangeQuery {
field: String,
value_type: Type,
lower_bound: Bound<Vec<u8>>,
upper_bound: Bound<Vec<u8>>,
lower_bound: Bound<Term>,
upper_bound: Bound<Term>,
limit: Option<u64>,
}
/// Returns the inner value of a `Bound`
pub(crate) fn inner_bound(val: &Bound<Term>) -> Option<&Term> {
match val {
Bound::Included(term) | Bound::Excluded(term) => Some(term),
Bound::Unbounded => None,
}
}
impl RangeQuery {
/// Creates a new `RangeQuery` from bounded start and end terms.
///
/// If the terms' value type does not match the field type, building the
/// `Weight` object will fail or behave incorrectly.
pub fn new_term_bounds(
field: String,
value_type: Type,
lower_bound: &Bound<Term>,
upper_bound: &Bound<Term>,
) -> RangeQuery {
let verify_and_unwrap_term = |val: &Term| val.serialized_value_bytes().to_owned();
pub fn new(lower_bound: Bound<Term>, upper_bound: Bound<Term>) -> RangeQuery {
RangeQuery {
field,
value_type,
lower_bound: map_bound(lower_bound, verify_and_unwrap_term),
upper_bound: map_bound(upper_bound, verify_and_unwrap_term),
lower_bound,
upper_bound,
limit: None,
}
}
/// Creates a new `RangeQuery` over a `i64` field.
///
/// If the field is not of the type `i64`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_i64(field: String, range: Range<i64>) -> RangeQuery {
RangeQuery::new_i64_bounds(
field,
Bound::Included(range.start),
Bound::Excluded(range.end),
)
}
/// Create a new `RangeQuery` over a `i64` field.
///
/// The two `Bound` arguments make it possible to create more complex
/// ranges than semi-inclusive range.
///
/// If the field is not of the type `i64`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_i64_bounds(
field: String,
lower_bound: Bound<i64>,
upper_bound: Bound<i64>,
) -> RangeQuery {
let make_term_val = |val: &i64| {
Term::from_field_i64(Field::from_field_id(0), *val)
.serialized_value_bytes()
.to_owned()
};
RangeQuery {
field,
value_type: Type::I64,
lower_bound: map_bound(&lower_bound, make_term_val),
upper_bound: map_bound(&upper_bound, make_term_val),
limit: None,
}
}
/// Creates a new `RangeQuery` over a `f64` field.
///
/// If the field is not of the type `f64`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_f64(field: String, range: Range<f64>) -> RangeQuery {
RangeQuery::new_f64_bounds(
field,
Bound::Included(range.start),
Bound::Excluded(range.end),
)
}
/// Create a new `RangeQuery` over a `f64` field.
///
/// The two `Bound` arguments make it possible to create more complex
/// ranges than semi-inclusive range.
///
/// If the field is not of the type `f64`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_f64_bounds(
field: String,
lower_bound: Bound<f64>,
upper_bound: Bound<f64>,
) -> RangeQuery {
let make_term_val = |val: &f64| {
Term::from_field_f64(Field::from_field_id(0), *val)
.serialized_value_bytes()
.to_owned()
};
RangeQuery {
field,
value_type: Type::F64,
lower_bound: map_bound(&lower_bound, make_term_val),
upper_bound: map_bound(&upper_bound, make_term_val),
limit: None,
}
}
/// Create a new `RangeQuery` over a `u64` field.
///
/// The two `Bound` arguments make it possible to create more complex
/// ranges than semi-inclusive range.
///
/// If the field is not of the type `u64`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_u64_bounds(
field: String,
lower_bound: Bound<u64>,
upper_bound: Bound<u64>,
) -> RangeQuery {
let make_term_val = |val: &u64| {
Term::from_field_u64(Field::from_field_id(0), *val)
.serialized_value_bytes()
.to_owned()
};
RangeQuery {
field,
value_type: Type::U64,
lower_bound: map_bound(&lower_bound, make_term_val),
upper_bound: map_bound(&upper_bound, make_term_val),
limit: None,
}
}
/// Create a new `RangeQuery` over a `ip` field.
///
/// If the field is not of the type `ip`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_ip_bounds(
field: String,
lower_bound: Bound<Ipv6Addr>,
upper_bound: Bound<Ipv6Addr>,
) -> RangeQuery {
let make_term_val = |val: &Ipv6Addr| {
Term::from_field_ip_addr(Field::from_field_id(0), *val)
.serialized_value_bytes()
.to_owned()
};
RangeQuery {
field,
value_type: Type::IpAddr,
lower_bound: map_bound(&lower_bound, make_term_val),
upper_bound: map_bound(&upper_bound, make_term_val),
limit: None,
}
}
/// Create a new `RangeQuery` over a `u64` field.
///
/// If the field is not of the type `u64`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_u64(field: String, range: Range<u64>) -> RangeQuery {
RangeQuery::new_u64_bounds(
field,
Bound::Included(range.start),
Bound::Excluded(range.end),
)
}
/// Create a new `RangeQuery` over a `date` field.
///
/// The two `Bound` arguments make it possible to create more complex
/// ranges than semi-inclusive range.
///
/// If the field is not of the type `date`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_date_bounds(
field: String,
lower_bound: Bound<DateTime>,
upper_bound: Bound<DateTime>,
) -> RangeQuery {
let make_term_val = |val: &DateTime| {
Term::from_field_date(Field::from_field_id(0), *val)
.serialized_value_bytes()
.to_owned()
};
RangeQuery {
field,
value_type: Type::Date,
lower_bound: map_bound(&lower_bound, make_term_val),
upper_bound: map_bound(&upper_bound, make_term_val),
limit: None,
}
}
/// Create a new `RangeQuery` over a `date` field.
///
/// If the field is not of the type `date`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_date(field: String, range: Range<DateTime>) -> RangeQuery {
RangeQuery::new_date_bounds(
field,
Bound::Included(range.start),
Bound::Excluded(range.end),
)
}
/// Create a new `RangeQuery` over a `Str` field.
///
/// The two `Bound` arguments make it possible to create more complex
/// ranges than semi-inclusive range.
///
/// If the field is not of the type `Str`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_str_bounds(
field: String,
lower_bound: Bound<&str>,
upper_bound: Bound<&str>,
) -> RangeQuery {
let make_term_val = |val: &&str| val.as_bytes().to_vec();
RangeQuery {
field,
value_type: Type::Str,
lower_bound: map_bound(&lower_bound, make_term_val),
upper_bound: map_bound(&upper_bound, make_term_val),
limit: None,
}
}
/// Create a new `RangeQuery` over a `Str` field.
///
/// If the field is not of the type `Str`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_str(field: String, range: Range<&str>) -> RangeQuery {
RangeQuery::new_str_bounds(
field,
Bound::Included(range.start),
Bound::Excluded(range.end),
)
}
/// Field to search over
pub fn field(&self) -> &str {
&self.field
pub fn field(&self) -> Field {
self.get_term().field()
}
/// The value type of the field
pub fn value_type(&self) -> Type {
self.get_term().typ()
}
pub(crate) fn get_term(&self) -> &Term {
inner_bound(&self.lower_bound)
.or(inner_bound(&self.upper_bound))
.expect("At least one bound must be set")
}
/// Limit the number of term the `RangeQuery` will go through.
@@ -319,70 +120,23 @@ impl RangeQuery {
}
}
/// Returns true if the type maps to a u64 fast field
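/// (`i64`, `f64`, `bool` and date values are stored in fast fields through
/// order-preserving mappings to `u64`, so their ranges can be evaluated in
/// `u64` value space.)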
pub(crate) fn maps_to_u64_fastfield(typ: Type) -> bool {
match typ {
Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true,
Type::IpAddr => false,
Type::Str | Type::Facet | Type::Bytes | Type::Json => false,
}
}
impl Query for RangeQuery {
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
let schema = enable_scoring.schema();
let field_type = schema
.get_field_entry(schema.get_field(&self.field)?)
.field_type();
let value_type = field_type.value_type();
if value_type != self.value_type {
let err_msg = format!(
"Create a range query of the type {:?}, when the field given was of type \
{value_type:?}",
self.value_type
);
return Err(TantivyError::SchemaError(err_msg));
}
let field_type = schema.get_field_entry(self.field()).field_type();
if field_type.is_fast() && is_type_valid_for_fastfield_range_query(self.value_type) {
if field_type.is_ip_addr() {
let parse_ip_from_bytes = |data: &Vec<u8>| {
let ip_u128_bytes: [u8; 16] = data.as_slice().try_into().map_err(|_| {
crate::TantivyError::InvalidArgument(
"Expected 8 bytes for ip address".to_string(),
)
})?;
let ip_u128 = u128::from_be_bytes(ip_u128_bytes);
crate::Result::<Ipv6Addr>::Ok(Ipv6Addr::from_u128(ip_u128))
};
let lower_bound = map_bound_res(&self.lower_bound, parse_ip_from_bytes)?;
let upper_bound = map_bound_res(&self.upper_bound, parse_ip_from_bytes)?;
Ok(Box::new(IPFastFieldRangeWeight::new(
self.field.to_string(),
lower_bound,
upper_bound,
)))
} else {
// We run the range query on u64 value space for performance reasons and simplicity.
// Assert that the type maps to u64.
assert!(maps_to_u64_fastfield(self.value_type));
let parse_from_bytes = |data: &Vec<u8>| {
u64::from_be(BinarySerializable::deserialize(&mut &data[..]).unwrap())
};
let lower_bound = map_bound(&self.lower_bound, parse_from_bytes);
let upper_bound = map_bound(&self.upper_bound, parse_from_bytes);
Ok(Box::new(FastFieldRangeWeight::new_u64_lenient(
self.field.to_string(),
lower_bound,
upper_bound,
)))
}
if field_type.is_fast() && is_type_valid_for_fastfield_range_query(self.value_type()) {
Ok(Box::new(FastFieldRangeWeight::new(
self.field(),
self.lower_bound.clone(),
self.upper_bound.clone(),
)))
} else {
let verify_and_unwrap_term = |val: &Term| val.serialized_value_bytes().to_owned();
Ok(Box::new(RangeWeight {
field: self.field.to_string(),
lower_bound: self.lower_bound.clone(),
upper_bound: self.upper_bound.clone(),
field: self.field(),
lower_bound: map_bound(&self.lower_bound, verify_and_unwrap_term),
upper_bound: map_bound(&self.upper_bound, verify_and_unwrap_term),
limit: self.limit,
}))
}
@@ -390,7 +144,7 @@ impl Query for RangeQuery {
}
pub struct RangeWeight {
field: String,
field: Field,
lower_bound: Bound<Vec<u8>>,
upper_bound: Bound<Vec<u8>>,
limit: Option<u64>,
@@ -423,7 +177,7 @@ impl Weight for RangeWeight {
let max_doc = reader.max_doc();
let mut doc_bitset = BitSet::with_max_value(max_doc);
let inverted_index = reader.inverted_index(reader.schema().get_field(&self.field)?)?;
let inverted_index = reader.inverted_index(self.field)?;
let term_dict = inverted_index.terms();
let mut term_range = self.term_range(term_dict)?;
let mut processed_count = 0;
@@ -477,7 +231,7 @@ mod tests {
use crate::schema::{
Field, IntoIpv6Addr, Schema, TantivyDocument, FAST, INDEXED, STORED, TEXT,
};
use crate::{Index, IndexWriter};
use crate::{Index, IndexWriter, Term};
#[test]
fn test_range_query_simple() -> crate::Result<()> {
@@ -499,7 +253,10 @@ mod tests {
let reader = index.reader()?;
let searcher = reader.searcher();
let docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960u64..1970u64);
let docs_in_the_sixties = RangeQuery::new(
Bound::Included(Term::from_field_u64(year_field, 1960)),
Bound::Excluded(Term::from_field_u64(year_field, 1970)),
);
// ... or `1960..=1969` if inclusive range is enabled.
let count = searcher.search(&docs_in_the_sixties, &Count)?;
@@ -530,7 +287,10 @@ mod tests {
let reader = index.reader()?;
let searcher = reader.searcher();
let mut docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960u64..1970u64);
let mut docs_in_the_sixties = RangeQuery::new(
Bound::Included(Term::from_field_u64(year_field, 1960)),
Bound::Excluded(Term::from_field_u64(year_field, 1970)),
);
docs_in_the_sixties.limit(5);
// due to the limit and no docs in 1963, it's really only 1960..=1965
@@ -575,29 +335,29 @@ mod tests {
|range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap();
assert_eq!(
count_multiples(RangeQuery::new_i64("intfield".to_string(), 10..11)),
count_multiples(RangeQuery::new(
Bound::Included(Term::from_field_i64(int_field, 10)),
Bound::Excluded(Term::from_field_i64(int_field, 11)),
)),
9
);
assert_eq!(
count_multiples(RangeQuery::new_i64_bounds(
"intfield".to_string(),
Bound::Included(10),
Bound::Included(11)
count_multiples(RangeQuery::new(
Bound::Included(Term::from_field_i64(int_field, 10)),
Bound::Included(Term::from_field_i64(int_field, 11)),
)),
18
);
assert_eq!(
count_multiples(RangeQuery::new_i64_bounds(
"intfield".to_string(),
Bound::Excluded(9),
Bound::Included(10)
count_multiples(RangeQuery::new(
Bound::Excluded(Term::from_field_i64(int_field, 9)),
Bound::Included(Term::from_field_i64(int_field, 10)),
)),
9
);
assert_eq!(
count_multiples(RangeQuery::new_i64_bounds(
"intfield".to_string(),
Bound::Included(9),
count_multiples(RangeQuery::new(
Bound::Included(Term::from_field_i64(int_field, 9)),
Bound::Unbounded
)),
91
@@ -646,29 +406,29 @@ mod tests {
|range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap();
assert_eq!(
count_multiples(RangeQuery::new_f64("floatfield".to_string(), 10.0..11.0)),
count_multiples(RangeQuery::new(
Bound::Included(Term::from_field_f64(float_field, 10.0)),
Bound::Excluded(Term::from_field_f64(float_field, 11.0)),
)),
9
);
assert_eq!(
count_multiples(RangeQuery::new_f64_bounds(
"floatfield".to_string(),
Bound::Included(10.0),
Bound::Included(11.0)
count_multiples(RangeQuery::new(
Bound::Included(Term::from_field_f64(float_field, 10.0)),
Bound::Included(Term::from_field_f64(float_field, 11.0)),
)),
18
);
assert_eq!(
count_multiples(RangeQuery::new_f64_bounds(
"floatfield".to_string(),
Bound::Excluded(9.0),
Bound::Included(10.0)
count_multiples(RangeQuery::new(
Bound::Excluded(Term::from_field_f64(float_field, 9.0)),
Bound::Included(Term::from_field_f64(float_field, 10.0)),
)),
9
);
assert_eq!(
count_multiples(RangeQuery::new_f64_bounds(
"floatfield".to_string(),
Bound::Included(9.0),
count_multiples(RangeQuery::new(
Bound::Included(Term::from_field_f64(float_field, 9.0)),
Bound::Unbounded
)),
91

View File

@@ -1,512 +0,0 @@
//! IP Fastfields support efficient scanning for range queries.
//! We use this variant only if the fastfield exists, otherwise the default in `range_query` is
//! used, which uses the term dictionary + postings.
use std::net::Ipv6Addr;
use std::ops::{Bound, RangeInclusive};
use columnar::{Column, MonotonicallyMappableToU128};
use crate::query::range_query::fast_field_range_query::RangeDocSet;
use crate::query::{ConstScorer, EmptyScorer, Explanation, Scorer, Weight};
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError};
/// `IPFastFieldRangeWeight` uses the ip address fast field to execute range queries.
pub struct IPFastFieldRangeWeight {
field: String,
lower_bound: Bound<Ipv6Addr>,
upper_bound: Bound<Ipv6Addr>,
}
impl IPFastFieldRangeWeight {
/// Creates a new IPFastFieldRangeWeight.
pub fn new(field: String, lower_bound: Bound<Ipv6Addr>, upper_bound: Bound<Ipv6Addr>) -> Self {
Self {
field,
lower_bound,
upper_bound,
}
}
}
impl Weight for IPFastFieldRangeWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let Some(ip_addr_column): Option<Column<Ipv6Addr>> =
reader.fast_fields().column_opt(&self.field)?
else {
return Ok(Box::new(EmptyScorer));
};
let value_range = bound_to_value_range(
&self.lower_bound,
&self.upper_bound,
ip_addr_column.min_value(),
ip_addr_column.max_value(),
);
let docset = RangeDocSet::new(value_range, ip_addr_column);
Ok(Box::new(ConstScorer::new(docset, boost)))
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) != doc {
return Err(TantivyError::InvalidArgument(format!(
"Document #({doc}) does not match"
)));
}
let explanation = Explanation::new("Const", scorer.score());
Ok(explanation)
}
}
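// Turns the pair of bounds into an inclusive value range by stepping the
// u128 representation of excluded endpoints by one. Note that stepping an
// excluded bound at the numeric extremes (`u128::MAX` / 0) would overflow.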
fn bound_to_value_range(
lower_bound: &Bound<Ipv6Addr>,
upper_bound: &Bound<Ipv6Addr>,
min_value: Ipv6Addr,
max_value: Ipv6Addr,
) -> RangeInclusive<Ipv6Addr> {
let start_value = match lower_bound {
Bound::Included(ip_addr) => *ip_addr,
Bound::Excluded(ip_addr) => Ipv6Addr::from(ip_addr.to_u128() + 1),
Bound::Unbounded => min_value,
};
let end_value = match upper_bound {
Bound::Included(ip_addr) => *ip_addr,
Bound::Excluded(ip_addr) => Ipv6Addr::from(ip_addr.to_u128() - 1),
Bound::Unbounded => max_value,
};
start_value..=end_value
}
#[cfg(test)]
pub mod tests {
use proptest::prelude::ProptestConfig;
use proptest::strategy::Strategy;
use proptest::{prop_oneof, proptest};
use super::*;
use crate::collector::Count;
use crate::query::QueryParser;
use crate::schema::{Schema, FAST, INDEXED, STORED, STRING};
use crate::{Index, IndexWriter};
#[derive(Clone, Debug)]
pub struct Doc {
pub id: String,
pub ip: Ipv6Addr,
}
fn operation_strategy() -> impl Strategy<Value = Doc> {
prop_oneof![
(0u64..10_000u64).prop_map(doc_from_id_1),
(1u64..10_000u64).prop_map(doc_from_id_2),
]
}
pub fn doc_from_id_1(id: u64) -> Doc {
let id = id * 1000;
Doc {
// ip != id
id: id.to_string(),
ip: Ipv6Addr::from_u128(id as u128),
}
}
fn doc_from_id_2(id: u64) -> Doc {
let id = id * 1000;
Doc {
// ip != id
id: (id - 1).to_string(),
ip: Ipv6Addr::from_u128(id as u128),
}
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(10))]
#[test]
fn test_ip_range_for_docs_prop(ops in proptest::collection::vec(operation_strategy(), 1..1000)) {
assert!(test_ip_range_for_docs(&ops).is_ok());
}
}
#[test]
fn test_ip_range_regression1() {
let ops = &[doc_from_id_1(0)];
assert!(test_ip_range_for_docs(ops).is_ok());
}
#[test]
fn test_ip_range_regression2() {
let ops = &[
doc_from_id_1(52),
doc_from_id_1(63),
doc_from_id_1(12),
doc_from_id_2(91),
doc_from_id_2(33),
];
assert!(test_ip_range_for_docs(ops).is_ok());
}
#[test]
fn test_ip_range_regression3() {
let ops = &[doc_from_id_1(1), doc_from_id_1(2), doc_from_id_1(3)];
assert!(test_ip_range_for_docs(ops).is_ok());
}
#[test]
fn test_ip_range_regression3_simple() {
let mut schema_builder = Schema::builder();
let ips_field = schema_builder.add_ip_addr_field("ips", FAST | INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer: IndexWriter = index.writer_for_tests().unwrap();
let ip_addrs: Vec<Ipv6Addr> = [1000, 2000, 3000]
.into_iter()
.map(Ipv6Addr::from_u128)
.collect();
for &ip_addr in &ip_addrs {
writer
.add_document(doc!(ips_field=>ip_addr, ips_field=>ip_addr))
.unwrap();
}
writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
let range_weight = IPFastFieldRangeWeight {
field: "ips".to_string(),
lower_bound: Bound::Included(ip_addrs[1]),
upper_bound: Bound::Included(ip_addrs[2]),
};
let count = range_weight.count(searcher.segment_reader(0)).unwrap();
assert_eq!(count, 2);
}
pub fn create_index_from_docs(docs: &[Doc]) -> Index {
let mut schema_builder = Schema::builder();
let ip_field = schema_builder.add_ip_addr_field("ip", STORED | FAST);
let ips_field = schema_builder.add_ip_addr_field("ips", FAST | INDEXED);
let text_field = schema_builder.add_text_field("id", STRING | STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(2, 60_000_000).unwrap();
for doc in docs.iter() {
index_writer
.add_document(doc!(
ips_field => doc.ip,
ips_field => doc.ip,
ip_field => doc.ip,
text_field => doc.id.to_string(),
))
.unwrap();
}
index_writer.commit().unwrap();
}
index
}
fn test_ip_range_for_docs(docs: &[Doc]) -> crate::Result<()> {
let index = create_index_from_docs(docs);
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let get_num_hits = |query| searcher.search(&query, &Count).unwrap();
let query_from_text = |text: &str| {
QueryParser::for_index(&index, vec![])
.parse_query(text)
.unwrap()
};
let gen_query_inclusive = |field: &str, ip_range: &RangeInclusive<Ipv6Addr>| {
format!("{field}:[{} TO {}]", ip_range.start(), ip_range.end())
};
let test_sample = |sample_docs: &[Doc]| {
let mut ips: Vec<Ipv6Addr> = sample_docs.iter().map(|doc| doc.ip).collect();
ips.sort();
let ip_range = ips[0]..=ips[1];
let expected_num_hits = docs
.iter()
.filter(|doc| (ips[0]..=ips[1]).contains(&doc.ip))
.count();
let query = gen_query_inclusive("ip", &ip_range);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
let query = gen_query_inclusive("ips", &ip_range);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
// Intersection search
let id_filter = sample_docs[0].id.to_string();
let expected_num_hits = docs
.iter()
.filter(|doc| ip_range.contains(&doc.ip) && doc.id == id_filter)
.count();
let query = format!(
"{} AND id:{}",
gen_query_inclusive("ip", &ip_range),
&id_filter
);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
// Intersection search on multivalue ip field
let id_filter = sample_docs[0].id.to_string();
let query = format!(
"{} AND id:{}",
gen_query_inclusive("ips", &ip_range),
&id_filter
);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
};
test_sample(&[docs[0].clone(), docs[0].clone()]);
if docs.len() > 1 {
test_sample(&[docs[0].clone(), docs[1].clone()]);
test_sample(&[docs[1].clone(), docs[1].clone()]);
}
if docs.len() > 2 {
test_sample(&[docs[1].clone(), docs[2].clone()]);
}
Ok(())
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use test::Bencher;
use super::tests::*;
use super::*;
use crate::collector::Count;
use crate::query::QueryParser;
use crate::Index;
fn get_index_0_to_100() -> Index {
let mut rng = StdRng::from_seed([1u8; 32]);
let num_vals = 100_000;
let docs: Vec<_> = (0..num_vals)
.map(|_i| {
let id = if rng.gen_bool(0.01) {
"veryfew".to_string() // 1%
} else if rng.gen_bool(0.1) {
"few".to_string() // 9%
} else {
"many".to_string() // 90%
};
Doc {
id,
// Multiply by 1000, so that we create many buckets in the compact space
// The benches depend on this range to select n-percent of elements with the
// methods below.
ip: Ipv6Addr::from_u128(rng.gen_range(0..100) * 1000),
}
})
.collect();
create_index_from_docs(&docs)
}
fn get_90_percent() -> RangeInclusive<Ipv6Addr> {
let start = Ipv6Addr::from_u128(0);
let end = Ipv6Addr::from_u128(90 * 1000);
start..=end
}
fn get_10_percent() -> RangeInclusive<Ipv6Addr> {
let start = Ipv6Addr::from_u128(0);
let end = Ipv6Addr::from_u128(10 * 1000);
start..=end
}
fn get_1_percent() -> RangeInclusive<Ipv6Addr> {
let start = Ipv6Addr::from_u128(10 * 1000);
let end = Ipv6Addr::from_u128(10 * 1000);
start..=end
}
fn execute_query(
field: &str,
ip_range: RangeInclusive<Ipv6Addr>,
suffix: &str,
index: &Index,
) -> usize {
let gen_query_inclusive = |from: &Ipv6Addr, to: &Ipv6Addr| {
format!(
"{}:[{} TO {}] {}",
field,
&from.to_string(),
&to.to_string(),
suffix
)
};
let query = gen_query_inclusive(ip_range.start(), ip_range.end());
let query_from_text = |text: &str| {
QueryParser::for_index(index, vec![])
.parse_query(text)
.unwrap()
};
let query = query_from_text(&query);
let reader = index.reader().unwrap();
let searcher = reader.searcher();
searcher.search(&query, &Count).unwrap()
}
#[bench]
fn bench_ip_range_hit_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_90_percent(), "", &index));
}
#[bench]
fn bench_ip_range_hit_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_10_percent(), "", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_1_percent(), "", &index));
}
#[bench]
fn bench_ip_range_hit_10_percent_intersect_with_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_10_percent(), "AND id:few", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_1_percent(), "AND id:few", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_1_percent(), "AND id:many", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_1_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_1_percent(), "AND id:veryfew", &index));
}
#[bench]
fn bench_ip_range_hit_10_percent_intersect_with_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_10_percent(), "AND id:many", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_90_percent(), "AND id:many", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_90_percent(), "AND id:few", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_1_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_90_percent(), "AND id:veryfew", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_90_percent(), "", &index));
}
#[bench]
fn bench_ip_range_hit_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_10_percent(), "", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_1_percent(), "", &index));
}
#[bench]
fn bench_ip_range_hit_10_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_10_percent(), "AND id:few", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_1_percent(), "AND id:few", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_1_percent(), "AND id:many", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_1_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_1_percent(), "AND id:veryfew", &index));
}
#[bench]
fn bench_ip_range_hit_10_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_10_percent(), "AND id:many", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_90_percent(), "AND id:many", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_90_percent(), "AND id:few", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_1_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_90_percent(), "AND id:veryfew", &index));
}
}

Some files were not shown because too many files have changed in this diff.