skip estimate phase for merge multivalue index

precompute stats for merge multivalue index + disable Line encoding for multivalue index. That combination allows to skip the first estimation pass. This gives up to 2x on merge performance on multivalue indices. This change may decrease compression as Line is very good compressible for documents, which have a fixed amount of values in each doc. The line codec should be replaced. ``` merge_multi_and_multi Avg: 22.7880ms (-47.15%) Median: 22.5469ms (-47.38%) [22.3691ms .. 25.8392ms] merge_dense_and_dense Avg: 14.4398ms (+2.18%) Median: 14.2465ms (+0.74%) [14.1620ms .. 16.1270ms] merge_sparse_and_sparse Avg: 10.6559ms (+1.10%) Median: 10.6318ms (+0.91%) [10.5527ms .. 11.2848ms] merge_sparse_and_dense Avg: 12.4886ms (+1.52%) Median: 12.4044ms (+0.84%) [12.3261ms .. 13.9439ms] merge_multi_and_dense Avg: 25.6686ms (-45.56%) Median: 25.4851ms (-45.84%) [25.1618ms .. 27.6226ms] merge_multi_and_sparse Avg: 24.3278ms (-47.00%) Median: 24.1917ms (-47.34%) [23.7159ms .. 27.0513ms] ```
fix compiler warning, cleanup (#2393 )
2026-06-15 15:00:42 +00:00 · 2024-06-11 20:22:00 +08:00 · 2024-06-11 16:03:50 +08:00 · 2024-06-11 16:02:57 +08:00 · 2024-06-10 11:19:01 +02:00 · 2024-06-10 16:26:16 +08:00
61 changed files with 2299 additions and 532 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,8 +15,7 @@ rust-version = "1.63"
 exclude = ["benches/*.json", "benches/*.txt"]

 [dependencies]
-# Switch back to the non-forked oneshot crate once https://github.com/faern/oneshot/pull/35 is merged
-oneshot = { git = "https://github.com/fulmicoton/oneshot.git", rev = "c10a3ba" }
+oneshot = "0.1.7"
 base64 = "0.22.0"
 byteorder = "1.4.3"
 crc32fast = "1.3.2"
@@ -64,7 +63,7 @@ query-grammar = { version = "0.22.0", path = "./query-grammar", package = "tanti
 tantivy-bitpacker = { version = "0.6", path = "./bitpacker" }
 common = { version = "0.7", path = "./common/", package = "tantivy-common" }
 tokenizer-api = { version = "0.3", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
-sketches-ddsketch = { version = "0.2.1", features = ["use_serde"] }
+sketches-ddsketch = { version = "0.3.0", features = ["use_serde"] }
 futures-util = { version = "0.3.28", optional = true }
 fnv = "1.0.7"

--- a/benches/agg_bench.rs
+++ b/benches/agg_bench.rs
@@ -47,6 +47,7 @@ fn bench_agg(mut group: InputGroup<Index>) {
    register!(group, average_f64);
    register!(group, average_f64_u64);
    register!(group, stats_f64);
+    register!(group, extendedstats_f64);
    register!(group, percentiles_f64);
    register!(group, terms_few);
    register!(group, terms_many);
@@ -105,7 +106,12 @@ fn stats_f64(index: &Index) {
    });
    exec_term_with_agg(index, agg_req)
 }
-
+fn extendedstats_f64(index: &Index) {
+    let agg_req = json!({
+        "extendedstats_f64": { "extended_stats": { "field": "score_f64", } }
+    });
+    exec_term_with_agg(index, agg_req)
+}
 fn percentiles_f64(index: &Index) {
    let agg_req = json!({
      "mypercentiles": {
@@ -349,7 +355,7 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
    let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap();

    let many_terms_data = (0..150_000)
-        .map(|num| format!("author{}", num))
+        .map(|num| format!("author{num}"))
        .collect::<Vec<_>>();
    {
        let mut rng = StdRng::from_seed([1u8; 32]);
--- a/benches/index-bench.rs
+++ b/benches/index-bench.rs
@@ -141,12 +141,12 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
                let parse_json = false;
                // for parse_json in [false, true] {
                let suffix = if parse_json {
-                    format!("{}-with-json-parsing", suffix)
+                    format!("{suffix}-with-json-parsing")
                } else {
                    suffix.to_string()
                };

-                let bench_name = format!("{}{}", prefix, suffix);
+                let bench_name = format!("{prefix}{suffix}");
                group.bench_function(bench_name, |b| {
                    benchmark(b, HDFS_LOGS, schema.clone(), commit, parse_json, is_dynamic)
                });
--- a/columnar/Cargo.toml
+++ b/columnar/Cargo.toml
@@ -23,6 +23,12 @@ downcast-rs = "1.2.0"
 proptest = "1"
 more-asserts = "0.3.1"
 rand = "0.8"
+binggan = "0.8.1"
+
+[[bench]]
+name = "bench_merge"
+harness = false
+

 [features]
 unstable = []
--- a/columnar/benches/bench_merge.rs
+++ b/columnar/benches/bench_merge.rs
@@ -0,0 +1,97 @@
+#![feature(test)]
+extern crate test;
+
+use core::fmt;
+use std::fmt::{Display, Formatter};
+
+use binggan::{black_box, BenchRunner};
+use tantivy_columnar::*;
+
+enum Card {
+    Multi,
+    Sparse,
+    Dense,
+}
+impl Display for Card {
+    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
+        match self {
+            Card::Multi => write!(f, "multi"),
+            Card::Sparse => write!(f, "sparse"),
+            Card::Dense => write!(f, "dense"),
+        }
+    }
+}
+
+const NUM_DOCS: u32 = 1_000_000;
+
+fn generate_columnar(card: Card, num_docs: u32) -> ColumnarReader {
+    use tantivy_columnar::ColumnarWriter;
+
+    let mut columnar_writer = ColumnarWriter::default();
+
+    match card {
+        Card::Multi => {
+            columnar_writer.record_numerical(0, "price", 10u64);
+            columnar_writer.record_numerical(0, "price", 10u64);
+        }
+        _ => {}
+    }
+
+    for i in 0..num_docs {
+        match card {
+            Card::Multi | Card::Sparse => {
+                if i % 8 == 0 {
+                    columnar_writer.record_numerical(i, "price", i as u64);
+                }
+            }
+            Card::Dense => {
+                if i % 6 == 0 {
+                    columnar_writer.record_numerical(i, "price", i as u64);
+                }
+            }
+        }
+    }
+
+    let mut wrt: Vec<u8> = Vec::new();
+    columnar_writer.serialize(num_docs, None, &mut wrt).unwrap();
+
+    ColumnarReader::open(wrt).unwrap()
+}
+fn main() {
+    let mut inputs = Vec::new();
+
+    let mut add_combo = |card1: Card, card2: Card| {
+        inputs.push((
+            format!("merge_{card1}_and_{card2}"),
+            vec![
+                generate_columnar(card1, NUM_DOCS),
+                generate_columnar(card2, NUM_DOCS),
+            ],
+        ));
+    };
+
+    add_combo(Card::Multi, Card::Multi);
+    add_combo(Card::Dense, Card::Dense);
+    add_combo(Card::Sparse, Card::Sparse);
+    add_combo(Card::Sparse, Card::Dense);
+    add_combo(Card::Multi, Card::Dense);
+    add_combo(Card::Multi, Card::Sparse);
+
+    let runner: BenchRunner = BenchRunner::new();
+    let mut group = runner.new_group();
+    for (input_name, columnar_readers) in inputs.iter() {
+        group.register_with_input(
+            input_name,
+            columnar_readers,
+            move |columnar_readers: &Vec<ColumnarReader>| {
+                let mut out = vec![];
+                let columnar_readers = columnar_readers.iter().collect::<Vec<_>>();
+                let merge_row_order = StackMergeOrder::stack(&columnar_readers[..]);
+
+                merge_columnar(&columnar_readers, &[], merge_row_order.into(), &mut out).unwrap();
+                black_box(out);
+            },
+        );
+    }
+    group.run();
+}
--- a/columnar/src/column_index/merge/mod.rs
+++ b/columnar/src/column_index/merge/mod.rs
@@ -73,14 +73,18 @@ fn detect_cardinality(
 pub fn merge_column_index<'a>(
    columns: &'a [ColumnIndex],
    merge_row_order: &'a MergeRowOrder,
+    num_values: u32,
 ) -> SerializableColumnIndex<'a> {
    // For simplification, we do not try to detect whether the cardinality could be
    // downgraded thanks to deletes.
    let cardinality_after_merge = detect_cardinality(columns, merge_row_order);
    match merge_row_order {
-        MergeRowOrder::Stack(stack_merge_order) => {
-            merge_column_index_stacked(columns, cardinality_after_merge, stack_merge_order)
-        }
+        MergeRowOrder::Stack(stack_merge_order) => merge_column_index_stacked(
+            columns,
+            cardinality_after_merge,
+            stack_merge_order,
+            num_values,
+        ),
        MergeRowOrder::Shuffled(complex_merge_order) => {
            merge_column_index_shuffled(columns, cardinality_after_merge, complex_merge_order)
        }
@@ -167,8 +171,12 @@ mod tests {
            ],
        )
        .into();
-        let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
-        let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
+        let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order, 3);
+        let SerializableColumnIndex::Multivalued {
+            indices: start_index_iterable,
+            ..
+        } = merged_column_index
+        else {
            panic!("Excpected a multivalued index")
        };
        let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
@@ -200,8 +208,12 @@ mod tests {
            ],
        )
        .into();
-        let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
-        let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
+        let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order, 6);
+        let SerializableColumnIndex::Multivalued {
+            indices: start_index_iterable,
+            ..
+        } = merged_column_index
+        else {
            panic!("Excpected a multivalued index")
        };
        let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
--- a/columnar/src/column_index/merge/shuffled.rs
+++ b/columnar/src/column_index/merge/shuffled.rs
@@ -22,7 +22,10 @@ pub fn merge_column_index_shuffled<'a>(
        Cardinality::Multivalued => {
            let multivalue_start_index =
                merge_column_index_shuffled_multivalued(column_indexes, shuffle_merge_order);
-            SerializableColumnIndex::Multivalued(multivalue_start_index)
+            SerializableColumnIndex::Multivalued {
+                indices: multivalue_start_index,
+                stats: None,
+            }
        }
    }
 }
--- a/columnar/src/column_index/merge/stacked.rs
+++ b/columnar/src/column_index/merge/stacked.rs
@@ -1,6 +1,8 @@
 use std::iter;
+use std::num::NonZeroU64;

 use crate::column_index::{SerializableColumnIndex, Set};
+use crate::column_values::ColumnStats;
 use crate::iterable::Iterable;
 use crate::{Cardinality, ColumnIndex, RowId, StackMergeOrder};

@@ -12,6 +14,7 @@ pub fn merge_column_index_stacked<'a>(
    columns: &'a [ColumnIndex],
    cardinality_after_merge: Cardinality,
    stack_merge_order: &'a StackMergeOrder,
+    num_values: u32,
 ) -> SerializableColumnIndex<'a> {
    match cardinality_after_merge {
        Cardinality::Full => SerializableColumnIndex::Full,
@@ -27,7 +30,17 @@ pub fn merge_column_index_stacked<'a>(
                columns,
                stack_merge_order,
            };
-            SerializableColumnIndex::Multivalued(Box::new(stacked_multivalued_index))
+            SerializableColumnIndex::Multivalued {
+                indices: Box::new(stacked_multivalued_index),
+                stats: Some(ColumnStats {
+                    gcd: NonZeroU64::new(1).unwrap(),
+                    // The values in the multivalue index are the positions of the values
+                    min_value: 0,
+                    max_value: num_values as u64,
+                    // This is num docs, but it starts at 0 so we need +1
+                    num_rows: stack_merge_order.num_rows() + 1,
+                }),
+            }
        }
    }
 }
--- a/columnar/src/column_index/multivalued_index.rs
+++ b/columnar/src/column_index/multivalued_index.rs
@@ -6,20 +6,29 @@ use std::sync::Arc;
 use common::OwnedBytes;

 use crate::column_values::{
-    load_u64_based_column_values, serialize_u64_based_column_values, CodecType, ColumnValues,
+    load_u64_based_column_values, serialize_u64_based_column_values,
+    serialize_u64_with_codec_and_stats, CodecType, ColumnStats, ColumnValues,
 };
 use crate::iterable::Iterable;
 use crate::{DocId, RowId};

 pub fn serialize_multivalued_index(
    multivalued_index: &dyn Iterable<RowId>,
+    stats: Option<ColumnStats>,
    output: &mut impl Write,
 ) -> io::Result<()> {
-    serialize_u64_based_column_values(
-        multivalued_index,
-        &[CodecType::Bitpacked, CodecType::Linear],
-        output,
-    )?;
+    if let Some(stats) = stats {
+        // TODO: Add something with higher compression that doesn't require a full scan upfront
+        let estimator = CodecType::Bitpacked.estimator();
+        assert!(!estimator.requires_full_scan());
+        serialize_u64_with_codec_and_stats(multivalued_index, estimator, stats, output)?;
+    } else {
+        serialize_u64_based_column_values(
+            multivalued_index,
+            &[CodecType::Bitpacked, CodecType::Linear],
+            output,
+        )?;
+    }
    Ok(())
 }

@@ -52,7 +61,7 @@ impl From<Arc<dyn ColumnValues<RowId>>> for MultiValueIndex {
 impl MultiValueIndex {
    pub fn for_test(start_offsets: &[RowId]) -> MultiValueIndex {
        let mut buffer = Vec::new();
-        serialize_multivalued_index(&start_offsets, &mut buffer).unwrap();
+        serialize_multivalued_index(&start_offsets, None, &mut buffer).unwrap();
        let bytes = OwnedBytes::new(buffer);
        open_multivalued_index(bytes).unwrap()
    }
--- a/columnar/src/column_index/optional_index/mod.rs
+++ b/columnar/src/column_index/optional_index/mod.rs
@@ -196,6 +196,7 @@ impl Set<RowId> for OptionalIndex {
        } = row_addr_from_row_id(doc_id);
        let block_meta = self.block_metas[block_id as usize];
        let block = self.block(block_meta);
+
        let block_offset_row_id = match block {
            Block::Dense(dense_block) => dense_block.rank(in_block_row_id),
            Block::Sparse(sparse_block) => sparse_block.rank(in_block_row_id),
--- a/columnar/src/column_index/serialize.rs
+++ b/columnar/src/column_index/serialize.rs
@@ -6,6 +6,7 @@ use common::{CountingWriter, OwnedBytes};
 use crate::column_index::multivalued_index::serialize_multivalued_index;
 use crate::column_index::optional_index::serialize_optional_index;
 use crate::column_index::ColumnIndex;
+use crate::column_values::ColumnStats;
 use crate::iterable::Iterable;
 use crate::{Cardinality, RowId};

@@ -15,9 +16,12 @@ pub enum SerializableColumnIndex<'a> {
        non_null_row_ids: Box<dyn Iterable<RowId> + 'a>,
        num_rows: RowId,
    },
-    // TODO remove the Arc<dyn> apart from serialization this is not
-    // dynamic at all.
-    Multivalued(Box<dyn Iterable<RowId> + 'a>),
+    Multivalued {
+        /// Iterator emitting the indices for the index
+        indices: Box<dyn Iterable<RowId> + 'a>,
+        /// In the merge case we can precompute the column stats
+        stats: Option<ColumnStats>,
+    },
 }

 impl<'a> SerializableColumnIndex<'a> {
@@ -25,7 +29,7 @@ impl<'a> SerializableColumnIndex<'a> {
        match self {
            SerializableColumnIndex::Full => Cardinality::Full,
            SerializableColumnIndex::Optional { .. } => Cardinality::Optional,
-            SerializableColumnIndex::Multivalued(_) => Cardinality::Multivalued,
+            SerializableColumnIndex::Multivalued { .. } => Cardinality::Multivalued,
        }
    }
 }
@@ -44,9 +48,10 @@ pub fn serialize_column_index(
            non_null_row_ids,
            num_rows,
        } => serialize_optional_index(non_null_row_ids.as_ref(), num_rows, &mut output)?,
-        SerializableColumnIndex::Multivalued(multivalued_index) => {
-            serialize_multivalued_index(&*multivalued_index, &mut output)?
-        }
+        SerializableColumnIndex::Multivalued {
+            indices: multivalued_index,
+            stats,
+        } => serialize_multivalued_index(&*multivalued_index, stats, &mut output)?,
    }
    let column_index_num_bytes = output.written_bytes() as u32;
    Ok(column_index_num_bytes)
--- a/columnar/src/column_values/mod.rs
+++ b/columnar/src/column_values/mod.rs
@@ -32,7 +32,8 @@ pub use u128_based::{
 };
 pub use u64_based::{
    load_u64_based_column_values, serialize_and_load_u64_based_column_values,
-    serialize_u64_based_column_values, CodecType, ALL_U64_CODEC_TYPES,
+    serialize_u64_based_column_values, serialize_u64_with_codec_and_stats, CodecType,
+    ALL_U64_CODEC_TYPES,
 };
 pub use vec_column::VecColumn;

--- a/columnar/src/column_values/u64_based/bitpacked.rs
+++ b/columnar/src/column_values/u64_based/bitpacked.rs
@@ -128,6 +128,9 @@ impl ColumnCodecEstimator for BitpackedCodecEstimator {
        bit_packer.close(wrt)?;
        Ok(())
    }
+    fn codec_type(&self) -> super::CodecType {
+        super::CodecType::Bitpacked
+    }
 }

 pub struct BitpackedCodec;
--- a/columnar/src/column_values/u64_based/blockwise_linear.rs
+++ b/columnar/src/column_values/u64_based/blockwise_linear.rs
@@ -163,6 +163,10 @@ impl ColumnCodecEstimator for BlockwiseLinearEstimator {

        Ok(())
    }
+
+    fn codec_type(&self) -> super::CodecType {
+        super::CodecType::BlockwiseLinear
+    }
 }

 pub struct BlockwiseLinearCodec;
--- a/columnar/src/column_values/u64_based/linear.rs
+++ b/columnar/src/column_values/u64_based/linear.rs
@@ -153,6 +153,12 @@ impl ColumnCodecEstimator for LinearCodecEstimator {
            self.collect_before_line_estimation(value);
        }
    }
+    fn requires_full_scan(&self) -> bool {
+        true
+    }
+    fn codec_type(&self) -> super::CodecType {
+        super::CodecType::Linear
+    }
 }

 impl LinearCodecEstimator {
--- a/columnar/src/column_values/u64_based/mod.rs
+++ b/columnar/src/column_values/u64_based/mod.rs
@@ -37,7 +37,11 @@ pub trait ColumnCodecEstimator<T = u64>: 'static {
    /// This method will be called for each element of the column during
    /// `estimation`.
    fn collect(&mut self, value: u64);
-    /// Finalizes the first pass phase.
+    /// Returns true if the estimator needs a full pass over the column before serialization
+    fn requires_full_scan(&self) -> bool {
+        false
+    }
+    fn codec_type(&self) -> CodecType;
    fn finalize(&mut self) {}
    /// Returns an accurate estimation of the number of bytes that will
    /// be used to represent this column.
@@ -150,34 +154,45 @@ pub fn serialize_u64_based_column_values<T: MonotonicallyMappableToU64>(
    wrt: &mut dyn Write,
 ) -> io::Result<()> {
    let mut stats_collector = StatsCollector::default();
-    let mut estimators: Vec<(CodecType, Box<dyn ColumnCodecEstimator>)> =
-        Vec::with_capacity(codec_types.len());
+    let mut estimators: Vec<Box<dyn ColumnCodecEstimator>> = Vec::with_capacity(codec_types.len());
    for &codec_type in codec_types {
-        estimators.push((codec_type, codec_type.estimator()));
+        estimators.push(codec_type.estimator());
    }
    for val in vals.boxed_iter() {
        let val_u64 = val.to_u64();
        stats_collector.collect(val_u64);
-        for (_, estimator) in &mut estimators {
+        for estimator in &mut estimators {
            estimator.collect(val_u64);
        }
    }
-    for (_, estimator) in &mut estimators {
+    for estimator in &mut estimators {
        estimator.finalize();
    }
    let stats = stats_collector.stats();
-    let (_, best_codec, best_codec_estimator) = estimators
+    let (_, best_codec) = estimators
        .into_iter()
-        .flat_map(|(codec_type, estimator)| {
+        .flat_map(|estimator| {
            let num_bytes = estimator.estimate(&stats)?;
-            Some((num_bytes, codec_type, estimator))
+            Some((num_bytes, estimator))
        })
-        .min_by_key(|(num_bytes, _, _)| *num_bytes)
+        .min_by_key(|(num_bytes, _)| *num_bytes)
        .ok_or_else(|| {
            io::Error::new(io::ErrorKind::InvalidData, "No available applicable codec.")
        })?;
-    best_codec.to_code().serialize(wrt)?;
-    best_codec_estimator.serialize(
+    serialize_u64_with_codec_and_stats(vals, best_codec, stats, wrt)?;
+    Ok(())
+}
+
+/// Serializes a given column of u64-mapped values.
+/// The codec estimator needs to be collected fully for the Line codec before calling this.
+pub fn serialize_u64_with_codec_and_stats<T: MonotonicallyMappableToU64>(
+    vals: &dyn Iterable<T>,
+    codec: Box<dyn ColumnCodecEstimator>,
+    stats: ColumnStats,
+    wrt: &mut dyn Write,
+) -> io::Result<()> {
+    codec.codec_type().to_code().serialize(wrt)?;
+    codec.serialize(
        &stats,
        &mut vals.boxed_iter().map(MonotonicallyMappableToU64::to_u64),
        wrt,
--- a/columnar/src/columnar/merge/mod.rs
+++ b/columnar/src/columnar/merge/mod.rs
@@ -3,7 +3,7 @@ mod merge_mapping;
 mod term_merger;

 use std::collections::{BTreeMap, HashSet};
-use std::io;
+use std::io::{self};
 use std::net::Ipv6Addr;
 use std::sync::Arc;

@@ -156,8 +156,15 @@ fn merge_column(
                    column_values.push(None);
                }
            }
-            let merged_column_index =
-                crate::column_index::merge_column_index(&column_indexes[..], merge_row_order);
+            let num_values: u32 = column_values
+                .iter()
+                .map(|vals| vals.as_ref().map(|idx| idx.num_vals()).unwrap_or(0))
+                .sum();
+            let merged_column_index = crate::column_index::merge_column_index(
+                &column_indexes[..],
+                merge_row_order,
+                num_values,
+            );
            let merge_column_values = MergedColumnValues {
                column_indexes: &column_indexes[..],
                column_values: &column_values[..],
@@ -183,8 +190,15 @@ fn merge_column(
                }
            }

-            let merged_column_index =
-                crate::column_index::merge_column_index(&column_indexes[..], merge_row_order);
+            let num_values: u32 = column_values
+                .iter()
+                .map(|vals| vals.as_ref().map(|idx| idx.num_vals()).unwrap_or(0))
+                .sum();
+            let merged_column_index = crate::column_index::merge_column_index(
+                &column_indexes[..],
+                merge_row_order,
+                num_values,
+            );
            let merge_column_values = MergedColumnValues {
                column_indexes: &column_indexes[..],
                column_values: &column_values,
@@ -214,8 +228,19 @@ fn merge_column(
                    }
                }
            }
-            let merged_column_index =
-                crate::column_index::merge_column_index(&column_indexes[..], merge_row_order);
+            let num_values: u32 = bytes_columns
+                .iter()
+                .map(|vals| {
+                    vals.as_ref()
+                        .map(|idx| idx.term_ord_column.values.num_vals())
+                        .unwrap_or(0)
+                })
+                .sum();
+            let merged_column_index = crate::column_index::merge_column_index(
+                &column_indexes[..],
+                merge_row_order,
+                num_values,
+            );
            merge_bytes_or_str_column(merged_column_index, &bytes_columns, merge_row_order, wrt)?;
        }
    }
--- a/columnar/src/columnar/writer/mod.rs
+++ b/columnar/src/columnar/writer/mod.rs
@@ -644,7 +644,10 @@ fn send_to_serialize_column_mappable_to_u128<
            let multivalued_index_builder = value_index_builders.borrow_multivalued_index_builder();
            consume_operation_iterator(op_iterator, multivalued_index_builder, values);
            let multivalued_index = multivalued_index_builder.finish(num_rows);
-            SerializableColumnIndex::Multivalued(Box::new(multivalued_index))
+            SerializableColumnIndex::Multivalued {
+                indices: Box::new(multivalued_index),
+                stats: Default::default(), // TODO: implement stats for u128
+            }
        }
    };
    crate::column::serialize_column_mappable_to_u128(
@@ -699,7 +702,10 @@ fn send_to_serialize_column_mappable_to_u64(
            if sort_values_within_row {
                sort_values_within_row_in_place(multivalued_index, values);
            }
-            SerializableColumnIndex::Multivalued(Box::new(multivalued_index))
+            SerializableColumnIndex::Multivalued {
+                indices: Box::new(multivalued_index),
+                stats: None,
+            }
        }
    };
    crate::column::serialize_column_mappable_to_u64(
--- a/columnar/src/tests.rs
+++ b/columnar/src/tests.rs
@@ -738,35 +738,22 @@ proptest! {
    #![proptest_config(ProptestConfig::with_cases(1000))]
    #[test]
    fn test_columnar_merge_proptest(columnar_docs in proptest::collection::vec(columnar_docs_strategy(), 2..=3)) {
-        let columnar_readers: Vec<ColumnarReader> = columnar_docs.iter()
-            .map(|docs| build_columnar(&docs[..]))
-            .collect::<Vec<_>>();
-        let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
-        let mut output: Vec<u8> = Vec::new();
-        let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]).into();
-        crate::merge_columnar(&columnar_readers_arr[..], &[], stack_merge_order, &mut output).unwrap();
-        let merged_columnar = ColumnarReader::open(output).unwrap();
-        let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> = columnar_docs.iter().flatten().cloned().collect();
-        let expected_merged_columnar = build_columnar(&concat_rows[..]);
-        assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
+        test_columnar_docs(columnar_docs);
    }
 }

-#[test]
-fn test_columnar_merging_empty_columnar() {
-    let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> =
-        vec![vec![], vec![vec![("c1", ColumnValue::Str("a"))]]];
+fn test_columnar_docs(columnar_docs: Vec<Vec<Vec<(&'static str, ColumnValue)>>>) {
    let columnar_readers: Vec<ColumnarReader> = columnar_docs
        .iter()
        .map(|docs| build_columnar(&docs[..]))
        .collect::<Vec<_>>();
    let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
    let mut output: Vec<u8> = Vec::new();
-    let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]);
+    let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]).into();
    crate::merge_columnar(
        &columnar_readers_arr[..],
        &[],
-        crate::MergeRowOrder::Stack(stack_merge_order),
+        stack_merge_order,
        &mut output,
    )
    .unwrap();
@@ -777,6 +764,24 @@ fn test_columnar_merging_empty_columnar() {
    assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
 }

+#[test]
+fn test_columnar_merging_empty_columnar() {
+    let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> =
+        vec![vec![], vec![vec![("c1", ColumnValue::Str("a"))]]];
+    test_columnar_docs(columnar_docs);
+}
+#[test]
+fn test_columnar_merging_simple() {
+    let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> = vec![
+        vec![],
+        vec![vec![
+            ("c1", ColumnValue::Numerical(0u64.into())),
+            ("c1", ColumnValue::Numerical(0u64.into())),
+        ]],
+    ];
+    test_columnar_docs(columnar_docs);
+}
+
 #[test]
 fn test_columnar_merging_number_columns() {
    let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> = vec![
@@ -793,25 +798,7 @@ fn test_columnar_merging_number_columns() {
            vec![("c2", ColumnValue::Numerical(u64::MAX.into()))],
        ],
    ];
-    let columnar_readers: Vec<ColumnarReader> = columnar_docs
-        .iter()
-        .map(|docs| build_columnar(&docs[..]))
-        .collect::<Vec<_>>();
-    let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
-    let mut output: Vec<u8> = Vec::new();
-    let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]);
-    crate::merge_columnar(
-        &columnar_readers_arr[..],
-        &[],
-        crate::MergeRowOrder::Stack(stack_merge_order),
-        &mut output,
-    )
-    .unwrap();
-    let merged_columnar = ColumnarReader::open(output).unwrap();
-    let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> =
-        columnar_docs.iter().flatten().cloned().collect();
-    let expected_merged_columnar = build_columnar(&concat_rows[..]);
-    assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
+    test_columnar_docs(columnar_docs);
 }

 // TODO add non trivial remap and merge
--- a/common/Cargo.toml
+++ b/common/Cargo.toml
@@ -22,3 +22,6 @@ serde = { version = "1.0.136", features = ["derive"] }
 [dev-dependencies]
 proptest = "1.0.0"
 rand = "0.8.4"
+
+[features]
+unstable = [] # useful for benches.
--- a/examples/date_time_field.rs
+++ b/examples/date_time_field.rs
@@ -4,7 +4,7 @@

 use tantivy::collector::TopDocs;
 use tantivy::query::QueryParser;
-use tantivy::schema::{DateOptions, Document, Schema, INDEXED, STORED, STRING};
+use tantivy::schema::{DateOptions, Document, Schema, Value, INDEXED, STORED, STRING};
 use tantivy::{Index, IndexWriter, TantivyDocument};

 fn main() -> tantivy::Result<()> {
@@ -64,6 +64,7 @@ fn main() -> tantivy::Result<()> {
            assert!(retrieved_doc
                .get_first(occurred_at)
                .unwrap()
+                .as_value()
                .as_datetime()
                .is_some(),);
            assert_eq!(
--- a/examples/index_from_multiple_threads.rs
+++ b/examples/index_from_multiple_threads.rs
@@ -61,7 +61,7 @@ fn main() -> tantivy::Result<()> {
                        debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
                        limbs and branches that arch over the pool"
                    ))?;
-            println!("add doc {} from thread 1 - opstamp {}", i, opstamp);
+            println!("add doc {i} from thread 1 - opstamp {opstamp}");
            thread::sleep(Duration::from_millis(20));
        }
        Result::<(), TantivyError>::Ok(())
@@ -82,7 +82,7 @@ fn main() -> tantivy::Result<()> {
                    body => "Some great book description..."
                ))?
            };
-            println!("add doc {} from thread 2 - opstamp {}", i, opstamp);
+            println!("add doc {i} from thread 2 - opstamp {opstamp}");
            thread::sleep(Duration::from_millis(10));
        }
        Result::<(), TantivyError>::Ok(())
--- a/query-grammar/src/query_grammar.rs
+++ b/query-grammar/src/query_grammar.rs
@@ -1,3 +1,4 @@
+use std::borrow::Cow;
 use std::iter::once;

 use nom::branch::alt;
@@ -19,7 +20,7 @@ use crate::Occur;
 // Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to
 // special characters.
 const SPECIAL_CHARS: &[char] = &[
-    '+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '!', '\\', '*', ' ',
+    '+', '^', '`', ':', '{', '}', '"', '\'', '[', ']', '(', ')', '!', '\\', '*', ' ',
 ];

 /// consume a field name followed by colon. Return the field name with escape sequence
@@ -41,36 +42,92 @@ fn field_name(inp: &str) -> IResult<&str, String> {
    )(inp)
 }

+const ESCAPE_IN_WORD: &[char] = &['^', '`', ':', '{', '}', '"', '\'', '[', ']', '(', ')', '\\'];
+
+fn interpret_escape(source: &str) -> String {
+    let mut res = String::with_capacity(source.len());
+    let mut in_escape = false;
+    let require_escape = |c: char| c.is_whitespace() || ESCAPE_IN_WORD.contains(&c) || c == '-';
+
+    for c in source.chars() {
+        if in_escape {
+            if !require_escape(c) {
+                // we re-add the escape sequence
+                res.push('\\');
+            }
+            res.push(c);
+            in_escape = false;
+        } else if c == '\\' {
+            in_escape = true;
+        } else {
+            res.push(c);
+        }
+    }
+    res
+}
+
 /// Consume a word outside of any context.
 // TODO should support escape sequences
-fn word(inp: &str) -> IResult<&str, &str> {
+fn word(inp: &str) -> IResult<&str, Cow<str>> {
    map_res(
        recognize(tuple((
-            satisfy(|c| {
-                !c.is_whitespace()
-                    && !['-', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')'].contains(&c)
-            }),
-            many0(satisfy(|c: char| {
-                !c.is_whitespace() && ![':', '^', '{', '}', '"', '[', ']', '(', ')'].contains(&c)
-            })),
+            alt((
+                preceded(char('\\'), anychar),
+                satisfy(|c| !c.is_whitespace() && !ESCAPE_IN_WORD.contains(&c) && c != '-'),
+            )),
+            many0(alt((
+                preceded(char('\\'), anychar),
+                satisfy(|c: char| !c.is_whitespace() && !ESCAPE_IN_WORD.contains(&c)),
+            ))),
        ))),
        |s| match s {
            "OR" | "AND" | "NOT" | "IN" => Err(Error::new(inp, ErrorKind::Tag)),
-            _ => Ok(s),
+            s if s.contains('\\') => Ok(Cow::Owned(interpret_escape(s))),
+            s => Ok(Cow::Borrowed(s)),
        },
    )(inp)
 }

-fn word_infallible(delimiter: &str) -> impl Fn(&str) -> JResult<&str, Option<&str>> + '_ {
-    |inp| {
-        opt_i_err(
-            preceded(
-                multispace0,
-                recognize(many1(satisfy(|c| {
-                    !c.is_whitespace() && !delimiter.contains(c)
-                }))),
+fn word_infallible(
+    delimiter: &str,
+    emit_error: bool,
+) -> impl Fn(&str) -> JResult<&str, Option<Cow<str>>> + '_ {
+    // emit error is set when receiving an unescaped `:` should emit an error
+
+    move |inp| {
+        map(
+            opt_i_err(
+                preceded(
+                    multispace0,
+                    recognize(many1(alt((
+                        preceded(char::<&str, _>('\\'), anychar),
+                        satisfy(|c| !c.is_whitespace() && !delimiter.contains(c)),
+                    )))),
+                ),
+                "expected word",
            ),
-            "expected word",
+            |(opt_s, mut errors)| match opt_s {
+                Some(s) => {
+                    if emit_error
+                        && (s
+                            .as_bytes()
+                            .windows(2)
+                            .any(|window| window[0] != b'\\' && window[1] == b':')
+                            || s.starts_with(':'))
+                    {
+                        errors.push(LenientErrorInternal {
+                            pos: inp.len(),
+                            message: "parsed possible invalid field as term".to_string(),
+                        });
+                    }
+                    if s.contains('\\') {
+                        (Some(Cow::Owned(interpret_escape(s))), errors)
+                    } else {
+                        (Some(Cow::Borrowed(s)), errors)
+                    }
+                }
+                None => (None, errors),
+            },
        )(inp)
    }
 }
@@ -159,7 +216,7 @@ fn simple_term_infallible(
                (value((), char('\'')), simple_quotes),
            ),
            // numbers are parsed with words in this case, as we allow string starting with a -
-            map(word_infallible(delimiter), |(text, errors)| {
+            map(word_infallible(delimiter, true), |(text, errors)| {
                (text.map(|text| (Delimiter::None, text.to_string())), errors)
            }),
        )(inp)
@@ -322,15 +379,6 @@ fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>>
        |((field_name, _, leaf), mut errors)| {
            (
                leaf.map(|leaf| {
-                    if matches!(&leaf, UserInputLeaf::Literal(literal)
-                            if literal.phrase.contains(':') && literal.delimiter == Delimiter::None)
-                        && field_name.is_none()
-                    {
-                        errors.push(LenientErrorInternal {
-                            pos: inp.len(),
-                            message: "parsed possible invalid field as term".to_string(),
-                        });
-                    }
                    if matches!(&leaf, UserInputLeaf::Literal(literal)
                            if literal.phrase == "NOT" && literal.delimiter == Delimiter::None)
                        && field_name.is_none()
@@ -449,20 +497,20 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
        tuple_infallible((
            opt_i(anychar),
            space0_infallible,
-            word_infallible("]}"),
+            word_infallible("]}", false),
            space1_infallible,
            opt_i_err(
                terminated(tag("TO"), alt((value((), multispace1), value((), eof)))),
                "missing keyword TO",
            ),
-            word_infallible("]}"),
+            word_infallible("]}", false),
            opt_i_err(one_of("]}"), "missing range delimiter"),
        )),
        |(
            (lower_bound_kind, _multispace0, lower, _multispace1, to, upper, upper_bound_kind),
            errs,
        )| {
-            let lower_bound = match (lower_bound_kind, lower) {
+            let lower_bound = match (lower_bound_kind, lower.as_deref()) {
                (_, Some("*")) => UserInputBound::Unbounded,
                (_, None) => UserInputBound::Unbounded,
                // if it is some, TO was actually the bound (i.e. [TO TO something])
@@ -471,7 +519,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
                (Some('{'), Some(bound)) => UserInputBound::Exclusive(bound.to_string()),
                _ => unreachable!("precondition failed, range did not start with [ or {{"),
            };
-            let upper_bound = match (upper_bound_kind, upper) {
+            let upper_bound = match (upper_bound_kind, upper.as_deref()) {
                (_, Some("*")) => UserInputBound::Unbounded,
                (_, None) => UserInputBound::Unbounded,
                (Some(']'), Some(bound)) => UserInputBound::Inclusive(bound.to_string()),
@@ -488,7 +536,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
            (
                (
                    value((), tag(">=")),
-                    map(word_infallible(""), |(bound, err)| {
+                    map(word_infallible("", false), |(bound, err)| {
                        (
                            (
                                bound
@@ -502,7 +550,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
                ),
                (
                    value((), tag("<=")),
-                    map(word_infallible(""), |(bound, err)| {
+                    map(word_infallible("", false), |(bound, err)| {
                        (
                            (
                                UserInputBound::Unbounded,
@@ -516,7 +564,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
                ),
                (
                    value((), tag(">")),
-                    map(word_infallible(""), |(bound, err)| {
+                    map(word_infallible("", false), |(bound, err)| {
                        (
                            (
                                bound
@@ -530,7 +578,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
                ),
                (
                    value((), tag("<")),
-                    map(word_infallible(""), |(bound, err)| {
+                    map(word_infallible("", false), |(bound, err)| {
                        (
                            (
                                UserInputBound::Unbounded,
@@ -1157,6 +1205,12 @@ mod test {
        test_parse_query_to_ast_helper("weight: <= 70", "\"weight\":{\"*\" TO \"70\"]");

        test_parse_query_to_ast_helper("weight: <= 70.5", "\"weight\":{\"*\" TO \"70.5\"]");
+
+        test_parse_query_to_ast_helper(">a", "{\"a\" TO \"*\"}");
+        test_parse_query_to_ast_helper(">=a", "[\"a\" TO \"*\"}");
+        test_parse_query_to_ast_helper("<a", "{\"*\" TO \"a\"}");
+        test_parse_query_to_ast_helper("<=a", "{\"*\" TO \"a\"]");
+        test_parse_query_to_ast_helper("<=bsd", "{\"*\" TO \"bsd\"]");
    }

    #[test]
@@ -1590,5 +1644,21 @@ mod test {
            r#"myfield:'hello\"happy\'tax'"#,
            r#""myfield":'hello"happy'tax'"#,
        );
+        // we don't process escape sequence for chars which don't require it
+        test_parse_query_to_ast_helper(r#"abc\*"#, r#"abc\*"#);
+    }
+
+    #[test]
+    fn test_queries_with_colons() {
+        test_parse_query_to_ast_helper(r#""abc:def""#, r#""abc:def""#);
+        test_parse_query_to_ast_helper(r#"'abc:def'"#, r#"'abc:def'"#);
+        test_parse_query_to_ast_helper(r#"abc\:def"#, r#"abc:def"#);
+        test_parse_query_to_ast_helper(r#""abc\:def""#, r#""abc:def""#);
+        test_parse_query_to_ast_helper(r#"'abc\:def'"#, r#"'abc:def'"#);
+    }
+
+    #[test]
+    fn test_invalid_field() {
+        test_is_parse_err(r#"!bc:def"#, "!bc:def");
    }
 }
--- a/src/aggregation/agg_req.rs
+++ b/src/aggregation/agg_req.rs
@@ -34,7 +34,7 @@ use super::bucket::{
    DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
 };
 use super::metric::{
-    AverageAggregation, CountAggregation, MaxAggregation, MinAggregation,
+    AverageAggregation, CountAggregation, ExtendedStatsAggregation, MaxAggregation, MinAggregation,
    PercentilesAggregationReq, StatsAggregation, SumAggregation, TopHitsAggregation,
 };

@@ -146,6 +146,11 @@ pub enum AggregationVariants {
    /// extracted values.
    #[serde(rename = "stats")]
    Stats(StatsAggregation),
+    /// Computes a collection of estended statistics (`min`, `max`, `sum`, `count`, `avg`,
+    /// `sum_of_squares`, `variance`, `variance_sampling`, `std_deviation`,
+    /// `std_deviation_sampling`) over the  extracted values.
+    #[serde(rename = "extended_stats")]
+    ExtendedStats(ExtendedStatsAggregation),
    /// Computes the sum of the extracted values.
    #[serde(rename = "sum")]
    Sum(SumAggregation),
@@ -170,6 +175,7 @@ impl AggregationVariants {
            AggregationVariants::Max(max) => vec![max.field_name()],
            AggregationVariants::Min(min) => vec![min.field_name()],
            AggregationVariants::Stats(stats) => vec![stats.field_name()],
+            AggregationVariants::ExtendedStats(extended_stats) => vec![extended_stats.field_name()],
            AggregationVariants::Sum(sum) => vec![sum.field_name()],
            AggregationVariants::Percentiles(per) => vec![per.field_name()],
            AggregationVariants::TopHits(top_hits) => top_hits.field_names(),
@@ -197,6 +203,12 @@ impl AggregationVariants {
            _ => None,
        }
    }
+    pub(crate) fn as_top_hits(&self) -> Option<&TopHitsAggregation> {
+        match &self {
+            AggregationVariants::TopHits(top_hits) => Some(top_hits),
+            _ => None,
+        }
+    }

    pub(crate) fn as_percentile(&self) -> Option<&PercentilesAggregationReq> {
        match &self {
--- a/src/aggregation/agg_req_with_accessor.rs
+++ b/src/aggregation/agg_req_with_accessor.rs
@@ -11,8 +11,8 @@ use super::bucket::{
    DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
 };
 use super::metric::{
-    AverageAggregation, CountAggregation, MaxAggregation, MinAggregation, StatsAggregation,
-    SumAggregation,
+    AverageAggregation, CountAggregation, ExtendedStatsAggregation, MaxAggregation, MinAggregation,
+    StatsAggregation, SumAggregation,
 };
 use super::segment_agg_result::AggregationLimits;
 use super::VecWithNames;
@@ -276,6 +276,10 @@ impl AggregationWithAccessor {
                field: ref field_name,
                ..
            })
+            | ExtendedStats(ExtendedStatsAggregation {
+                field: ref field_name,
+                ..
+            })
            | Sum(SumAggregation {
                field: ref field_name,
                ..
@@ -335,8 +339,8 @@ fn get_missing_val(
        }
        _ => {
            return Err(crate::TantivyError::InvalidArgument(format!(
-                "Missing value {:?} for field {} is not supported for column type {:?}",
-                missing, field_name, column_type
+                "Missing value {missing:?} for field {field_name} is not supported for column \
+                 type {column_type:?}"
            )));
        }
    };
@@ -403,7 +407,7 @@ fn get_dynamic_columns(
        .iter()
        .map(|h| h.open())
        .collect::<io::Result<_>>()?;
-    assert!(!ff_fields.is_empty(), "field {} not found", field_name);
+    assert!(!ff_fields.is_empty(), "field {field_name} not found");
    Ok(cols)
 }

--- a/src/aggregation/agg_result.rs
+++ b/src/aggregation/agg_result.rs
@@ -8,7 +8,9 @@ use rustc_hash::FxHashMap;
 use serde::{Deserialize, Serialize};

 use super::bucket::GetDocCount;
-use super::metric::{PercentilesMetricResult, SingleMetricResult, Stats, TopHitsMetricResult};
+use super::metric::{
+    ExtendedStats, PercentilesMetricResult, SingleMetricResult, Stats, TopHitsMetricResult,
+};
 use super::{AggregationError, Key};
 use crate::TantivyError;

@@ -88,6 +90,8 @@ pub enum MetricResult {
    Min(SingleMetricResult),
    /// Stats metric result.
    Stats(Stats),
+    /// ExtendedStats metric result.
+    ExtendedStats(Box<ExtendedStats>),
    /// Sum metric result.
    Sum(SingleMetricResult),
    /// Percentiles metric result.
@@ -104,6 +108,7 @@ impl MetricResult {
            MetricResult::Max(max) => Ok(max.value),
            MetricResult::Min(min) => Ok(min.value),
            MetricResult::Stats(stats) => stats.get_value(agg_property),
+            MetricResult::ExtendedStats(extended_stats) => extended_stats.get_value(agg_property),
            MetricResult::Sum(sum) => Ok(sum.value),
            MetricResult::Percentiles(_) => Err(TantivyError::AggregationError(
                AggregationError::InvalidRequest("percentiles can't be used to order".to_string()),
--- a/src/aggregation/bucket/term_agg.rs
+++ b/src/aggregation/bucket/term_agg.rs
@@ -357,8 +357,7 @@ impl SegmentTermCollector {
    ) -> crate::Result<Self> {
        if field_type == ColumnType::Bytes {
            return Err(TantivyError::InvalidArgument(format!(
-                "terms aggregation is not supported for column type {:?}",
-                field_type
+                "terms aggregation is not supported for column type {field_type:?}"
            )));
        }
        let term_buckets = TermBuckets::default();
--- a/src/aggregation/intermediate_agg_result.rs
+++ b/src/aggregation/intermediate_agg_result.rs
@@ -19,8 +19,8 @@ use super::bucket::{
    GetDocCount, Order, OrderTarget, RangeAggregation, TermsAggregation,
 };
 use super::metric::{
-    IntermediateAverage, IntermediateCount, IntermediateMax, IntermediateMin, IntermediateStats,
-    IntermediateSum, PercentilesCollector, TopHitsTopNComputer,
+    IntermediateAverage, IntermediateCount, IntermediateExtendedStats, IntermediateMax,
+    IntermediateMin, IntermediateStats, IntermediateSum, PercentilesCollector, TopHitsTopNComputer,
 };
 use super::segment_agg_result::AggregationLimits;
 use super::{format_date, AggregationError, Key, SerializedKey};
@@ -215,6 +215,9 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
        Stats(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Stats(
            IntermediateStats::default(),
        )),
+        ExtendedStats(_) => IntermediateAggregationResult::Metric(
+            IntermediateMetricResult::ExtendedStats(IntermediateExtendedStats::default()),
+        ),
        Sum(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Sum(
            IntermediateSum::default(),
        )),
@@ -222,7 +225,7 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
            IntermediateMetricResult::Percentiles(PercentilesCollector::default()),
        ),
        TopHits(ref req) => IntermediateAggregationResult::Metric(
-            IntermediateMetricResult::TopHits(TopHitsTopNComputer::new(req.clone())),
+            IntermediateMetricResult::TopHits(TopHitsTopNComputer::new(req)),
        ),
    }
 }
@@ -282,6 +285,8 @@ pub enum IntermediateMetricResult {
    Min(IntermediateMin),
    /// Intermediate stats result.
    Stats(IntermediateStats),
+    /// Intermediate stats result.
+    ExtendedStats(IntermediateExtendedStats),
    /// Intermediate sum result.
    Sum(IntermediateSum),
    /// Intermediate top_hits result
@@ -306,6 +311,9 @@ impl IntermediateMetricResult {
            IntermediateMetricResult::Stats(intermediate_stats) => {
                MetricResult::Stats(intermediate_stats.finalize())
            }
+            IntermediateMetricResult::ExtendedStats(intermediate_stats) => {
+                MetricResult::ExtendedStats(intermediate_stats.finalize())
+            }
            IntermediateMetricResult::Sum(intermediate_sum) => {
                MetricResult::Sum(intermediate_sum.finalize().into())
            }
@@ -346,6 +354,12 @@ impl IntermediateMetricResult {
            ) => {
                stats_left.merge_fruits(stats_right);
            }
+            (
+                IntermediateMetricResult::ExtendedStats(extended_stats_left),
+                IntermediateMetricResult::ExtendedStats(extended_stats_right),
+            ) => {
+                extended_stats_left.merge_fruits(extended_stats_right);
+            }
            (IntermediateMetricResult::Sum(sum_left), IntermediateMetricResult::Sum(sum_right)) => {
                sum_left.merge_fruits(sum_right);
            }
--- a/src/aggregation/metric/extended_stats.rs
+++ b/src/aggregation/metric/extended_stats.rs
--- a/src/aggregation/metric/mod.rs
+++ b/src/aggregation/metric/mod.rs
@@ -18,6 +18,7 @@

 mod average;
 mod count;
+mod extended_stats;
 mod max;
 mod min;
 mod percentiles;
@@ -29,6 +30,7 @@ use std::collections::HashMap;

 pub use average::*;
 pub use count::*;
+pub use extended_stats::*;
 pub use max::*;
 pub use min::*;
 pub use percentiles::*;
--- a/src/aggregation/metric/stats.rs
+++ b/src/aggregation/metric/stats.rs
@@ -1,3 +1,5 @@
+use std::fmt::Debug;
+
 use serde::{Deserialize, Serialize};

 use super::*;
@@ -85,13 +87,15 @@ impl Stats {
 #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
 pub struct IntermediateStats {
    /// The number of extracted values.
-    count: u64,
+    pub(crate) count: u64,
    /// The sum of the extracted values.
-    sum: f64,
+    pub(crate) sum: f64,
+    /// delta for sum needed for [Kahan algorithm for summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm)
+    pub(crate) delta: f64,
    /// The min value.
-    min: f64,
+    pub(crate) min: f64,
    /// The max value.
-    max: f64,
+    pub(crate) max: f64,
 }

 impl Default for IntermediateStats {
@@ -99,6 +103,7 @@ impl Default for IntermediateStats {
        Self {
            count: 0,
            sum: 0.0,
+            delta: 0.0,
            min: f64::MAX,
            max: f64::MIN,
        }
@@ -109,7 +114,13 @@ impl IntermediateStats {
    /// Merges the other stats intermediate result into self.
    pub fn merge_fruits(&mut self, other: IntermediateStats) {
        self.count += other.count;
-        self.sum += other.sum;
+
+        // kahan algorithm for sum
+        let y = other.sum - (self.delta + other.delta);
+        let t = self.sum + y;
+        self.delta = (t - self.sum) - y;
+        self.sum = t;
+
        self.min = self.min.min(other.min);
        self.max = self.max.max(other.max);
    }
@@ -141,9 +152,15 @@ impl IntermediateStats {
    }

    #[inline]
-    fn collect(&mut self, value: f64) {
+    pub(in crate::aggregation::metric) fn collect(&mut self, value: f64) {
        self.count += 1;
-        self.sum += value;
+
+        // kahan algorithm for sum
+        let y = value - self.delta;
+        let t = self.sum + y;
+        self.delta = (t - self.sum) - y;
+        self.sum = t;
+
        self.min = self.min.min(value);
        self.max = self.max.max(value);
    }
@@ -288,7 +305,6 @@ impl SegmentAggregationCollector for SegmentStatsCollector {

 #[cfg(test)]
 mod tests {
-
    use serde_json::Value;

    use crate::aggregation::agg_req::{Aggregation, Aggregations};
--- a/src/aggregation/metric/top_hits.rs
+++ b/src/aggregation/metric/top_hits.rs
@@ -1,7 +1,7 @@
 use std::collections::HashMap;
 use std::net::Ipv6Addr;

-use columnar::{ColumnarReader, DynamicColumn};
+use columnar::{Column, ColumnType, ColumnarReader, DynamicColumn};
 use common::json_path_writer::JSON_PATH_SEGMENT_SEP_STR;
 use common::DateTime;
 use regex::Regex;
@@ -131,8 +131,8 @@ impl<'de> Deserialize<'de> for KeyOrder {
        ))?;
        if key_order.next().is_some() {
            return Err(serde::de::Error::custom(format!(
-                "Expected exactly one key-value pair in sort parameter of top_hits, found {:?}",
-                key_order
+                "Expected exactly one key-value pair in sort parameter of top_hits, found \
+                 {key_order:?}"
            )));
        }
        Ok(Self { field, order })
@@ -144,27 +144,22 @@ fn globbed_string_to_regex(glob: &str) -> Result<Regex, crate::TantivyError> {
    // Replace `*` glob with `.*` regex
    let sanitized = format!("^{}$", regex::escape(glob).replace(r"\*", ".*"));
    Regex::new(&sanitized.replace('*', ".*")).map_err(|e| {
-        crate::TantivyError::SchemaError(format!(
-            "Invalid regex '{}' in docvalue_fields: {}",
-            glob, e
-        ))
+        crate::TantivyError::SchemaError(format!("Invalid regex '{glob}' in docvalue_fields: {e}"))
    })
 }

 fn use_doc_value_fields_err(parameter: &str) -> crate::Result<()> {
    Err(crate::TantivyError::AggregationError(
        AggregationError::InvalidRequest(format!(
-            "The `{}` parameter is not supported, only `docvalue_fields` is supported in \
-             `top_hits` aggregation",
-            parameter
+            "The `{parameter}` parameter is not supported, only `docvalue_fields` is supported in \
+             `top_hits` aggregation"
        )),
    ))
 }
 fn unsupported_err(parameter: &str) -> crate::Result<()> {
    Err(crate::TantivyError::AggregationError(
        AggregationError::InvalidRequest(format!(
-            "The `{}` parameter is not supported in the `top_hits` aggregation",
-            parameter
+            "The `{parameter}` parameter is not supported in the `top_hits` aggregation"
        )),
    ))
 }
@@ -217,8 +212,7 @@ impl TopHitsAggregation {
                    .collect::<Vec<_>>();
                assert!(
                    !fields.is_empty(),
-                    "No fields matched the glob '{}' in docvalue_fields",
-                    field
+                    "No fields matched the glob '{field}' in docvalue_fields"
                );
                Ok(fields)
            })
@@ -254,7 +248,7 @@ impl TopHitsAggregation {
            .map(|field| {
                let accessors = accessors
                    .get(field)
-                    .unwrap_or_else(|| panic!("field '{}' not found in accessors", field));
+                    .unwrap_or_else(|| panic!("field '{field}' not found in accessors"));

                let values: Vec<FastFieldValue> = accessors
                    .iter()
@@ -449,10 +443,10 @@ impl std::cmp::PartialEq for TopHitsTopNComputer {

 impl TopHitsTopNComputer {
    /// Create a new TopHitsCollector
-    pub fn new(req: TopHitsAggregation) -> Self {
+    pub fn new(req: &TopHitsAggregation) -> Self {
        Self {
            top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
-            req,
+            req: req.clone(),
        }
    }

@@ -497,7 +491,6 @@ impl TopHitsTopNComputer {
 pub(crate) struct TopHitsSegmentCollector {
    segment_ordinal: SegmentOrdinal,
    accessor_idx: usize,
-    req: TopHitsAggregation,
    top_n: TopNComputer<Vec<DocValueAndOrder>, DocAddress, false>,
 }

@@ -508,7 +501,6 @@ impl TopHitsSegmentCollector {
        segment_ordinal: SegmentOrdinal,
    ) -> Self {
        Self {
-            req: req.clone(),
            top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
            segment_ordinal,
            accessor_idx,
@@ -517,14 +509,13 @@ impl TopHitsSegmentCollector {
    fn into_top_hits_collector(
        self,
        value_accessors: &HashMap<String, Vec<DynamicColumn>>,
+        req: &TopHitsAggregation,
    ) -> TopHitsTopNComputer {
-        let mut top_hits_computer = TopHitsTopNComputer::new(self.req.clone());
+        let mut top_hits_computer = TopHitsTopNComputer::new(req);
        let top_results = self.top_n.into_vec();

        for res in top_results {
-            let doc_value_fields = self
-                .req
-                .get_document_field_data(value_accessors, res.doc.doc_id);
+            let doc_value_fields = req.get_document_field_data(value_accessors, res.doc.doc_id);
            top_hits_computer.collect(
                DocSortValuesAndFields {
                    sorts: res.feature,
@@ -536,34 +527,15 @@ impl TopHitsSegmentCollector {

        top_hits_computer
    }
-}

-impl SegmentAggregationCollector for TopHitsSegmentCollector {
-    fn add_intermediate_aggregation_result(
-        self: Box<Self>,
-        agg_with_accessor: &crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
-        results: &mut crate::aggregation::intermediate_agg_result::IntermediateAggregationResults,
-    ) -> crate::Result<()> {
-        let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
-
-        let value_accessors = &agg_with_accessor.aggs.values[self.accessor_idx].value_accessors;
-
-        let intermediate_result =
-            IntermediateMetricResult::TopHits(self.into_top_hits_collector(value_accessors));
-        results.push(
-            name,
-            IntermediateAggregationResult::Metric(intermediate_result),
-        )
-    }
-
-    fn collect(
+    /// TODO add a specialized variant for a single sort field
+    fn collect_with(
        &mut self,
        doc_id: crate::DocId,
-        agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
+        req: &TopHitsAggregation,
+        accessors: &[(Column<u64>, ColumnType)],
    ) -> crate::Result<()> {
-        let accessors = &agg_with_accessor.aggs.values[self.accessor_idx].accessors;
-        let sorts: Vec<DocValueAndOrder> = self
-            .req
+        let sorts: Vec<DocValueAndOrder> = req
            .sort
            .iter()
            .enumerate()
@@ -588,15 +560,62 @@ impl SegmentAggregationCollector for TopHitsSegmentCollector {
        );
        Ok(())
    }
+}
+
+impl SegmentAggregationCollector for TopHitsSegmentCollector {
+    fn add_intermediate_aggregation_result(
+        self: Box<Self>,
+        agg_with_accessor: &crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
+        results: &mut crate::aggregation::intermediate_agg_result::IntermediateAggregationResults,
+    ) -> crate::Result<()> {
+        let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
+
+        let value_accessors = &agg_with_accessor.aggs.values[self.accessor_idx].value_accessors;
+        let tophits_req = &agg_with_accessor.aggs.values[self.accessor_idx]
+            .agg
+            .agg
+            .as_top_hits()
+            .expect("aggregation request must be of type top hits");
+
+        let intermediate_result = IntermediateMetricResult::TopHits(
+            self.into_top_hits_collector(value_accessors, tophits_req),
+        );
+        results.push(
+            name,
+            IntermediateAggregationResult::Metric(intermediate_result),
+        )
+    }
+
+    /// TODO: Consider a caching layer to reduce the call overhead
+    fn collect(
+        &mut self,
+        doc_id: crate::DocId,
+        agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
+    ) -> crate::Result<()> {
+        let tophits_req = &agg_with_accessor.aggs.values[self.accessor_idx]
+            .agg
+            .agg
+            .as_top_hits()
+            .expect("aggregation request must be of type top hits");
+        let accessors = &agg_with_accessor.aggs.values[self.accessor_idx].accessors;
+        self.collect_with(doc_id, tophits_req, accessors)?;
+        Ok(())
+    }

    fn collect_block(
        &mut self,
        docs: &[crate::DocId],
        agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
    ) -> crate::Result<()> {
+        let tophits_req = &agg_with_accessor.aggs.values[self.accessor_idx]
+            .agg
+            .agg
+            .as_top_hits()
+            .expect("aggregation request must be of type top hits");
+        let accessors = &agg_with_accessor.aggs.values[self.accessor_idx].accessors;
        // TODO: Consider getting fields with the column block accessor.
        for doc in docs {
-            self.collect(*doc, agg_with_accessor)?;
+            self.collect_with(*doc, tophits_req, accessors)?;
        }
        Ok(())
    }
--- a/src/aggregation/mod.rs
+++ b/src/aggregation/mod.rs
@@ -158,15 +158,14 @@ use serde::de::{self, Visitor};
 use serde::{Deserialize, Deserializer, Serialize};

 fn parse_str_into_f64<E: de::Error>(value: &str) -> Result<f64, E> {
-    let parsed = value.parse::<f64>().map_err(|_err| {
-        de::Error::custom(format!("Failed to parse f64 from string: {:?}", value))
-    })?;
+    let parsed = value
+        .parse::<f64>()
+        .map_err(|_err| de::Error::custom(format!("Failed to parse f64 from string: {value:?}")))?;

    // Check if the parsed value is NaN or infinity
    if parsed.is_nan() || parsed.is_infinite() {
        Err(de::Error::custom(format!(
-            "Value is not a valid f64 (NaN or Infinity): {:?}",
-            value
+            "Value is not a valid f64 (NaN or Infinity): {value:?}"
        )))
    } else {
        Ok(parsed)
--- a/src/aggregation/segment_agg_result.rs
+++ b/src/aggregation/segment_agg_result.rs
@@ -11,12 +11,12 @@ use super::agg_req_with_accessor::{AggregationWithAccessor, AggregationsWithAcce
 use super::bucket::{SegmentHistogramCollector, SegmentRangeCollector, SegmentTermCollector};
 use super::intermediate_agg_result::IntermediateAggregationResults;
 use super::metric::{
-    AverageAggregation, CountAggregation, MaxAggregation, MinAggregation,
+    AverageAggregation, CountAggregation, ExtendedStatsAggregation, MaxAggregation, MinAggregation,
    SegmentPercentilesCollector, SegmentStatsCollector, SegmentStatsType, StatsAggregation,
    SumAggregation,
 };
 use crate::aggregation::bucket::TermMissingAgg;
-use crate::aggregation::metric::TopHitsSegmentCollector;
+use crate::aggregation::metric::{SegmentExtendedStatsCollector, TopHitsSegmentCollector};

 pub(crate) trait SegmentAggregationCollector: CollectorClone + Debug {
    fn add_intermediate_aggregation_result(
@@ -148,6 +148,9 @@ pub(crate) fn build_single_agg_segment_collector(
            accessor_idx,
            *missing,
        ))),
+        ExtendedStats(ExtendedStatsAggregation { missing, sigma, .. }) => Ok(Box::new(
+            SegmentExtendedStatsCollector::from_req(req.field_type, *sigma, accessor_idx, *missing),
+        )),
        Sum(SumAggregation { missing, .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
            req.field_type,
            SegmentStatsType::Sum,
--- a/src/collector/facet_collector.rs
+++ b/src/collector/facet_collector.rs
@@ -598,7 +598,7 @@ mod tests {
                let mid = n % 4;
                n /= 4;
                let leaf = n % 5;
-                Facet::from(&format!("/top{}/mid{}/leaf{}", top, mid, leaf))
+                Facet::from(&format!("/top{top}/mid{mid}/leaf{leaf}"))
            })
            .collect();
        for i in 0..num_facets * 10 {
@@ -737,7 +737,7 @@ mod tests {
            vec![("a", 10), ("b", 100), ("c", 7), ("d", 12), ("e", 21)]
                .into_iter()
                .flat_map(|(c, count)| {
-                    let facet = Facet::from(&format!("/facet/{}", c));
+                    let facet = Facet::from(&format!("/facet/{c}"));
                    let doc = doc!(facet_field => facet);
                    iter::repeat(doc).take(count)
                })
@@ -785,7 +785,7 @@ mod tests {
        let docs: Vec<TantivyDocument> = vec![("b", 2), ("a", 2), ("c", 4)]
            .into_iter()
            .flat_map(|(c, count)| {
-                let facet = Facet::from(&format!("/facet/{}", c));
+                let facet = Facet::from(&format!("/facet/{c}"));
                let doc = doc!(facet_field => facet);
                iter::repeat(doc).take(count)
            })
--- a/src/collector/top_score_collector.rs
+++ b/src/collector/top_score_collector.rs
@@ -871,7 +871,10 @@ mod tests {
    use crate::schema::{Field, Schema, FAST, STORED, TEXT};
    use crate::time::format_description::well_known::Rfc3339;
    use crate::time::OffsetDateTime;
-    use crate::{DateTime, DocAddress, DocId, Index, IndexWriter, Order, Score, SegmentReader};
+    use crate::{
+        assert_nearly_equals, DateTime, DocAddress, DocId, Index, IndexWriter, Order, Score,
+        SegmentReader,
+    };

    fn make_index() -> crate::Result<Index> {
        let mut schema_builder = Schema::builder();
--- a/src/core/executor.rs
+++ b/src/core/executor.rs
@@ -195,7 +195,7 @@ mod tests {
        let (tx, rx) = crossbeam_channel::bounded::<()>(0);
        let rx = Arc::new(rx);
        let executor = Executor::multi_thread(3, "search-test").unwrap();
-        for i in 0..1000 {
+        for _ in 0..1000 {
            let counter_clone: Arc<AtomicU64> = counter.clone();
            let other_counter_clone: Arc<AtomicU64> = other_counter.clone();

@@ -203,18 +203,18 @@ mod tests {
            let rx_clone2 = rx.clone();
            let fut = executor.spawn_blocking(move || {
                counter_clone.fetch_add(1, Ordering::SeqCst);
-                let () = rx_clone.recv().unwrap();
+                let _ = rx_clone.recv();
            });
            futures.push(fut);
            let other_fut = executor.spawn_blocking(move || {
                other_counter_clone.fetch_add(1, Ordering::SeqCst);
-                let () = rx_clone2.recv().unwrap();
+                let _ = rx_clone2.recv();
            });
            other_futures.push(other_fut);
        }

        // We execute 100 futures.
-        for i in 0..100 {
+        for _ in 0..100 {
            tx.send(()).unwrap();
        }

@@ -226,7 +226,7 @@ mod tests {
        drop(other_futures);

        // We execute 100 futures.
-        for i in 0..100 {
+        for _ in 0..100 {
            tx.send(()).unwrap();
        }

--- a/src/core/json_utils.rs
+++ b/src/core/json_utils.rs
@@ -338,14 +338,14 @@ mod tests {
        let mut term = Term::from_field_json_path(field, "attributes.color", false);
        term.append_type_and_str("red");
        assert_eq!(
-            format!("{:?}", term),
+            format!("{term:?}"),
            "Term(field=1, type=Json, path=attributes.color, type=Str, \"red\")"
        );

        let mut term = Term::from_field_json_path(field, "attributes.dimensions.width", false);
        term.append_type_and_fast_value(400i64);
        assert_eq!(
-            format!("{:?}", term),
+            format!("{term:?}"),
            "Term(field=1, type=Json, path=attributes.dimensions.width, type=I64, 400)"
        );
    }
--- a/src/directory/mmap_directory.rs
+++ b/src/directory/mmap_directory.rs
@@ -566,7 +566,7 @@ mod tests {
        let mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
        let num_paths = 10;
        let paths: Vec<PathBuf> = (0..num_paths)
-            .map(|i| PathBuf::from(&*format!("file_{}", i)))
+            .map(|i| PathBuf::from(&*format!("file_{i}")))
            .collect();
        {
            for path in &paths {
--- a/src/fastfield/facet_reader.rs
+++ b/src/fastfield/facet_reader.rs
@@ -62,7 +62,7 @@ impl FacetReader {

 #[cfg(test)]
 mod tests {
-    use crate::schema::{Facet, FacetOptions, SchemaBuilder, STORED};
+    use crate::schema::{Facet, FacetOptions, SchemaBuilder, Value, STORED};
    use crate::{DocAddress, Index, IndexWriter, TantivyDocument};

    #[test]
@@ -88,7 +88,9 @@ mod tests {
        let doc = searcher
            .doc::<TantivyDocument>(DocAddress::new(0u32, 0u32))
            .unwrap();
-        let value = doc.get_first(facet_field).and_then(|v| v.as_facet());
+        let value = doc
+            .get_first(facet_field)
+            .and_then(|v| v.as_value().as_facet());
        assert_eq!(value, None);
    }

--- a/src/index/index.rs
+++ b/src/index/index.rs
@@ -252,9 +252,8 @@ impl IndexBuilder {
                let field_type = entry.field_type().value_type();
                if !supported_field_types.contains(&field_type) {
                    return Err(TantivyError::InvalidArgument(format!(
-                        "Unsupported field type in sort_by_field: {:?}. Supported field types: \
-                         {:?} ",
-                        field_type, supported_field_types,
+                        "Unsupported field type in sort_by_field: {field_type:?}. Supported field \
+                         types: {supported_field_types:?} ",
                    )));
                }
            }
--- a/src/index/segment_reader.rs
+++ b/src/index/segment_reader.rs
@@ -318,14 +318,14 @@ impl SegmentReader {
                        if create_canonical {
                            // Without expand dots enabled dots need to be escaped.
                            let escaped_json_path = json_path.replace('.', "\\.");
-                            let full_path = format!("{}.{}", field_name, escaped_json_path);
+                            let full_path = format!("{field_name}.{escaped_json_path}");
                            let full_path_unescaped = format!("{}.{}", field_name, &json_path);
                            map_to_canonical.insert(full_path_unescaped, full_path.to_string());
                            full_path
                        } else {
                            // With expand dots enabled, we can use '.' instead of '\u{1}'.
                            json_path_sep_to_dot(&mut json_path);
-                            format!("{}.{}", field_name, json_path)
+                            format!("{field_name}.{json_path}")
                        }
                    };
                    indexed_fields.extend(
--- a/src/indexer/index_writer.rs
+++ b/src/indexer/index_writer.rs
@@ -808,7 +808,7 @@ mod tests {
    use proptest::prop_oneof;

    use super::super::operation::UserOperation;
-    use crate::collector::TopDocs;
+    use crate::collector::{Count, TopDocs};
    use crate::directory::error::LockError;
    use crate::error::*;
    use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
@@ -816,7 +816,7 @@ mod tests {
    use crate::query::{BooleanQuery, Occur, Query, QueryParser, TermQuery};
    use crate::schema::{
        self, Facet, FacetOptions, IndexRecordOption, IpAddrOptions, NumericOptions, Schema,
-        TextFieldIndexing, TextOptions, FAST, INDEXED, STORED, STRING, TEXT,
+        TextFieldIndexing, TextOptions, Value, FAST, INDEXED, STORED, STRING, TEXT,
    };
    use crate::store::DOCSTORE_CACHE_CAPACITY;
    use crate::{
@@ -1572,20 +1572,74 @@ mod tests {
        Ok(())
    }

-    #[derive(Debug, Clone, Copy)]
+    #[derive(Debug, Clone)]
    enum IndexingOp {
-        AddDoc { id: u64 },
-        DeleteDoc { id: u64 },
-        DeleteDocQuery { id: u64 },
+        AddMultipleDoc {
+            id: u64,
+            num_docs: u64,
+            value: IndexValue,
+        },
+        AddDoc {
+            id: u64,
+            value: IndexValue,
+        },
+        DeleteDoc {
+            id: u64,
+        },
+        DeleteDocQuery {
+            id: u64,
+        },
        Commit,
        Merge,
    }
+    impl IndexingOp {
+        fn add(id: u64) -> Self {
+            IndexingOp::AddDoc {
+                id,
+                value: IndexValue::F64(id as f64),
+            }
+        }
+    }
+
+    use serde::Serialize;
+    #[derive(Debug, Clone, Serialize)]
+    #[serde(untagged)]
+    enum IndexValue {
+        Str(String),
+        F64(f64),
+        U64(u64),
+        I64(i64),
+    }
+    impl Default for IndexValue {
+        fn default() -> Self {
+            IndexValue::F64(0.0)
+        }
+    }
+
+    fn value_strategy() -> impl Strategy<Value = IndexValue> {
+        prop_oneof![
+            any::<f64>().prop_map(IndexValue::F64),
+            any::<u64>().prop_map(IndexValue::U64),
+            any::<i64>().prop_map(IndexValue::I64),
+            any::<String>().prop_map(IndexValue::Str),
+        ]
+    }

    fn balanced_operation_strategy() -> impl Strategy<Value = IndexingOp> {
        prop_oneof![
            (0u64..20u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
            (0u64..20u64).prop_map(|id| IndexingOp::DeleteDocQuery { id }),
-            (0u64..20u64).prop_map(|id| IndexingOp::AddDoc { id }),
+            (0u64..20u64, value_strategy())
+                .prop_map(move |(id, value)| IndexingOp::AddDoc { id, value }),
+            ((0u64..20u64), (1u64..100), value_strategy()).prop_map(
+                move |(id, num_docs, value)| {
+                    IndexingOp::AddMultipleDoc {
+                        id,
+                        num_docs,
+                        value,
+                    }
+                }
+            ),
            (0u64..1u64).prop_map(|_| IndexingOp::Commit),
            (0u64..1u64).prop_map(|_| IndexingOp::Merge),
        ]
@@ -1595,7 +1649,17 @@ mod tests {
        prop_oneof![
            5 => (0u64..100u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
            5 => (0u64..100u64).prop_map(|id| IndexingOp::DeleteDocQuery { id }),
-            50 => (0u64..100u64).prop_map(|id| IndexingOp::AddDoc { id }),
+            50 => (0u64..100u64, value_strategy())
+                .prop_map(move |(id, value)| IndexingOp::AddDoc { id, value }),
+            50 => (0u64..100u64, (1u64..100), value_strategy()).prop_map(
+                move |(id, num_docs, value)| {
+                    IndexingOp::AddMultipleDoc {
+                        id,
+                        num_docs,
+                        value,
+                    }
+                }
+            ),
            2 => (0u64..1u64).prop_map(|_| IndexingOp::Commit),
            1 => (0u64..1u64).prop_map(|_| IndexingOp::Merge),
        ]
@@ -1604,19 +1668,27 @@ mod tests {
    fn expected_ids(ops: &[IndexingOp]) -> (HashMap<u64, u64>, HashSet<u64>) {
        let mut existing_ids = HashMap::new();
        let mut deleted_ids = HashSet::new();
-        for &op in ops {
+        for op in ops {
            match op {
-                IndexingOp::AddDoc { id } => {
-                    *existing_ids.entry(id).or_insert(0) += 1;
-                    deleted_ids.remove(&id);
+                IndexingOp::AddDoc { id, value: _ } => {
+                    *existing_ids.entry(*id).or_insert(0) += 1;
+                    deleted_ids.remove(id);
+                }
+                IndexingOp::AddMultipleDoc {
+                    id,
+                    num_docs,
+                    value: _,
+                } => {
+                    *existing_ids.entry(*id).or_insert(0) += num_docs;
+                    deleted_ids.remove(id);
                }
                IndexingOp::DeleteDoc { id } => {
                    existing_ids.remove(&id);
-                    deleted_ids.insert(id);
+                    deleted_ids.insert(*id);
                }
                IndexingOp::DeleteDocQuery { id } => {
                    existing_ids.remove(&id);
-                    deleted_ids.insert(id);
+                    deleted_ids.insert(*id);
                }
                _ => {}
            }
@@ -1626,16 +1698,19 @@ mod tests {

    fn get_id_list(ops: &[IndexingOp]) -> Vec<u64> {
        let mut id_list = Vec::new();
-        for &op in ops {
+        for op in ops {
            match op {
-                IndexingOp::AddDoc { id } => {
-                    id_list.push(id);
+                IndexingOp::AddDoc { id, value: _ } => {
+                    id_list.push(*id);
+                }
+                IndexingOp::AddMultipleDoc { id, .. } => {
+                    id_list.push(*id);
                }
                IndexingOp::DeleteDoc { id } => {
-                    id_list.retain(|el| *el != id);
+                    id_list.retain(|el| el != id);
                }
                IndexingOp::DeleteDocQuery { id } => {
-                    id_list.retain(|el| *el != id);
+                    id_list.retain(|el| el != id);
                }
                _ => {}
            }
@@ -1716,42 +1791,59 @@ mod tests {

        let ip_from_id = |id| Ipv6Addr::from_u128(id as u128);

-        for &op in ops {
-            match op {
-                IndexingOp::AddDoc { id } => {
-                    let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
-                    let ip = ip_from_id(id);
-
-                    if !id_is_full_doc(id) {
-                        // every 3rd doc has no ip field
-                        index_writer.add_document(doc!(
-                            id_field=>id,
-                        ))?;
-                    } else {
-                        let json = json!({"date1": format!("2022-{id}-01T00:00:01Z"), "date2": format!("{id}-05-01T00:00:01Z"), "id": id, "ip": ip.to_string()});
-                        index_writer.add_document(doc!(id_field=>id,
-                                json_field=>json,
-                                bytes_field => id.to_le_bytes().as_slice(),
-                                id_opt_field => id,
-                                ip_field => ip,
-                                ips_field => ip,
-                                ips_field => ip,
-                                multi_numbers=> id,
-                                multi_numbers => id,
-                                bool_field => (id % 2u64) != 0,
-                                i64_field => id as i64,
-                                f64_field => id as f64,
-                                date_field => DateTime::from_timestamp_secs(id as i64),
-                                multi_bools => (id % 2u64) != 0,
-                                multi_bools => (id % 2u64) == 0,
-                                text_field => id.to_string(),
-                                facet_field => facet,
-                                large_text_field => LOREM,
-                                multi_text_fields => multi_text_field_text1,
-                                multi_text_fields => multi_text_field_text2,
-                                multi_text_fields => multi_text_field_text3,
-                        ))?;
-                    }
+        let add_docs = |index_writer: &mut IndexWriter,
+                        id: u64,
+                        value: IndexValue,
+                        num: u64|
+         -> crate::Result<()> {
+            let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
+            let ip = ip_from_id(id);
+            let doc = if !id_is_full_doc(id) {
+                // every 3rd doc has no ip field
+                doc!(
+                    id_field=>id,
+                )
+            } else {
+                let json = json!({"date1": format!("2022-{id}-01T00:00:01Z"), "date2": format!("{id}-05-01T00:00:01Z"), "id": id, "ip": ip.to_string(), "val": value});
+                doc!(id_field=>id,
+                        json_field=>json,
+                        bytes_field => id.to_le_bytes().as_slice(),
+                        id_opt_field => id,
+                        ip_field => ip,
+                        ips_field => ip,
+                        ips_field => ip,
+                        multi_numbers=> id,
+                        multi_numbers => id,
+                        bool_field => (id % 2u64) != 0,
+                        i64_field => id as i64,
+                        f64_field => id as f64,
+                        date_field => DateTime::from_timestamp_secs(id as i64),
+                        multi_bools => (id % 2u64) != 0,
+                        multi_bools => (id % 2u64) == 0,
+                        text_field => id.to_string(),
+                        facet_field => facet,
+                        large_text_field => LOREM,
+                        multi_text_fields => multi_text_field_text1,
+                        multi_text_fields => multi_text_field_text2,
+                        multi_text_fields => multi_text_field_text3,
+                )
+            };
+            for _ in 0..num {
+                index_writer.add_document(doc.clone())?;
+            }
+            Ok(())
+        };
+        for op in ops {
+            match op.clone() {
+                IndexingOp::AddMultipleDoc {
+                    id,
+                    num_docs,
+                    value,
+                } => {
+                    add_docs(&mut index_writer, id, value, num_docs)?;
+                }
+                IndexingOp::AddDoc { id, value } => {
+                    add_docs(&mut index_writer, id, value, 1)?;
                }
                IndexingOp::DeleteDoc { id } => {
                    index_writer.delete_term(Term::from_field_u64(id_field, id));
@@ -1979,7 +2071,13 @@ mod tests {
                .unwrap();
            // test store iterator
            for doc in store_reader.iter::<TantivyDocument>(segment_reader.alive_bitset()) {
-                let id = doc.unwrap().get_first(id_field).unwrap().as_u64().unwrap();
+                let id = doc
+                    .unwrap()
+                    .get_first(id_field)
+                    .unwrap()
+                    .as_value()
+                    .as_u64()
+                    .unwrap();
                assert!(expected_ids_and_num_occurrences.contains_key(&id));
            }
            // test store random access
@@ -2026,18 +2124,22 @@ mod tests {

            top_docs.iter().map(|el| el.1).collect::<Vec<_>>()
        };
+        let count_search = |term: &str, field| {
+            let query = QueryParser::for_index(&index, vec![field])
+                .parse_query(term)
+                .unwrap();
+            searcher.search(&query, &Count).unwrap()
+        };

-        let do_search2 = |term: Term| {
+        let count_search2 = |term: Term| {
            let query = TermQuery::new(term, IndexRecordOption::Basic);
-            let top_docs: Vec<(f32, DocAddress)> =
-                searcher.search(&query, &TopDocs::with_limit(1000)).unwrap();
-
-            top_docs.iter().map(|el| el.1).collect::<Vec<_>>()
+            searcher.search(&query, &Count).unwrap()
        };

        for (id, count) in &expected_ids_and_num_occurrences {
+            // skip expensive queries
            let (existing_id, count) = (*id, *count);
-            let get_num_hits = |field| do_search(&existing_id.to_string(), field).len() as u64;
+            let get_num_hits = |field| count_search(&existing_id.to_string(), field) as u64;
            assert_eq!(get_num_hits(id_field), count);
            if !id_is_full_doc(existing_id) {
                continue;
@@ -2047,29 +2149,31 @@ mod tests {
            assert_eq!(get_num_hits(f64_field), count);

            // Test multi text
-            assert_eq!(
-                do_search("\"test1 test2\"", multi_text_fields).len(),
-                num_docs_with_values
-            );
-            assert_eq!(
-                do_search("\"test2 test3\"", multi_text_fields).len(),
-                num_docs_with_values
-            );
+            if num_docs_with_values < 1000 {
+                assert_eq!(
+                    do_search("\"test1 test2\"", multi_text_fields).len(),
+                    num_docs_with_values
+                );
+                assert_eq!(
+                    do_search("\"test2 test3\"", multi_text_fields).len(),
+                    num_docs_with_values
+                );
+            }

            // Test bytes
            let term = Term::from_field_bytes(bytes_field, existing_id.to_le_bytes().as_slice());
-            assert_eq!(do_search2(term).len() as u64, count);
+            assert_eq!(count_search2(term) as u64, count);

            // Test date
            let term = Term::from_field_date(
                date_field,
                DateTime::from_timestamp_secs(existing_id as i64),
            );
-            assert_eq!(do_search2(term).len() as u64, count);
+            assert_eq!(count_search2(term) as u64, count);
        }
        for deleted_id in deleted_ids {
            let assert_field = |field| {
-                assert_eq!(do_search(&deleted_id.to_string(), field).len() as u64, 0);
+                assert_eq!(count_search(&deleted_id.to_string(), field) as u64, 0);
            };
            assert_field(text_field);
            assert_field(f64_field);
@@ -2078,12 +2182,12 @@ mod tests {

            // Test bytes
            let term = Term::from_field_bytes(bytes_field, deleted_id.to_le_bytes().as_slice());
-            assert_eq!(do_search2(term).len() as u64, 0);
+            assert_eq!(count_search2(term), 0);

            // Test date
            let term =
                Term::from_field_date(date_field, DateTime::from_timestamp_secs(deleted_id as i64));
-            assert_eq!(do_search2(term).len() as u64, 0);
+            assert_eq!(count_search2(term), 0);
        }
        // search ip address
        //
@@ -2092,13 +2196,13 @@ mod tests {
            if !id_is_full_doc(existing_id) {
                continue;
            }
-            let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
+            let do_search_ip_field = |term: &str| count_search(term, ip_field) as u64;
            let ip_addr = Ipv6Addr::from_u128(existing_id as u128);
            // Test incoming ip as ipv6
            assert_eq!(do_search_ip_field(&format!("\"{ip_addr}\"")), count);

            let term = Term::from_field_ip_addr(ip_field, ip_addr);
-            assert_eq!(do_search2(term).len() as u64, count);
+            assert_eq!(count_search2(term) as u64, count);

            // Test incoming ip as ipv4
            if let Some(ip_addr) = ip_addr.to_ipv4_mapped() {
@@ -2115,7 +2219,7 @@ mod tests {
        if !sample.is_empty() {
            let (left_sample, right_sample) = sample.split_at(sample.len() / 2);

-            let expected_count = |sample: &[(&u64, &u64)]| {
+            let calc_expected_count = |sample: &[(&u64, &u64)]| {
                sample
                    .iter()
                    .filter(|(id, _)| id_is_full_doc(**id))
@@ -2131,18 +2235,17 @@ mod tests {
            }

            // Query first half
-            if !left_sample.is_empty() {
-                let expected_count = expected_count(left_sample);
-
+            let expected_count = calc_expected_count(left_sample);
+            if !left_sample.is_empty() && expected_count < 1000 {
                let start_range = *left_sample[0].0;
                let end_range = *left_sample.last().unwrap().0;
                let query = gen_query_inclusive("id_opt", start_range, end_range);
-                assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count);
+                assert_eq!(count_search(&query, id_opt_field) as u64, expected_count);

                // Range query on ip field
                let ip1 = ip_from_id(start_range);
                let ip2 = ip_from_id(end_range);
-                let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
+                let do_search_ip_field = |term: &str| count_search(term, ip_field) as u64;
                let query = gen_query_inclusive("ip", ip1, ip2);
                assert_eq!(do_search_ip_field(&query), expected_count);
                let query = gen_query_inclusive("ip", "*", ip2);
@@ -2154,19 +2257,19 @@ mod tests {
                assert_eq!(do_search_ip_field(&query), expected_count);
            }
            // Query second half
-            if !right_sample.is_empty() {
-                let expected_count = expected_count(right_sample);
+            let expected_count = calc_expected_count(right_sample);
+            if !right_sample.is_empty() && expected_count < 1000 {
                let start_range = *right_sample[0].0;
                let end_range = *right_sample.last().unwrap().0;
                // Range query on id opt field
                let query =
                    gen_query_inclusive("id_opt", start_range.to_string(), end_range.to_string());
-                assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count);
+                assert_eq!(count_search(&query, id_opt_field) as u64, expected_count);

                // Range query on ip field
                let ip1 = ip_from_id(start_range);
                let ip2 = ip_from_id(end_range);
-                let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
+                let do_search_ip_field = |term: &str| count_search(term, ip_field) as u64;
                let query = gen_query_inclusive("ip", ip1, ip2);
                assert_eq!(do_search_ip_field(&query), expected_count);
                let query = gen_query_inclusive("ip", ip1, "*");
@@ -2191,7 +2294,7 @@ mod tests {
            };
            let ip = ip_from_id(existing_id);

-            let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
+            let do_search_ip_field = |term: &str| count_search(term, ip_field) as u64;
            // Range query on single value field
            let query = gen_query_inclusive("ip", ip, ip);
            assert_eq!(do_search_ip_field(&query), count);
@@ -2251,7 +2354,7 @@ mod tests {

    #[test]
    fn test_fast_field_range() {
-        let ops: Vec<_> = (0..1000).map(|id| IndexingOp::AddDoc { id }).collect();
+        let ops: Vec<_> = (0..1000).map(|id| IndexingOp::add(id)).collect();
        assert!(test_operation_strategy(&ops, false, true).is_ok());
    }

@@ -2259,8 +2362,8 @@ mod tests {
    fn test_sort_index_on_opt_field_regression() {
        assert!(test_operation_strategy(
            &[
-                IndexingOp::AddDoc { id: 81 },
-                IndexingOp::AddDoc { id: 70 },
+                IndexingOp::add(81),
+                IndexingOp::add(70),
                IndexingOp::DeleteDoc { id: 70 }
            ],
            true,
@@ -2269,14 +2372,45 @@ mod tests {
        .is_ok());
    }

+    #[test]
+    fn test_simple_multiple_doc() {
+        assert!(test_operation_strategy(
+            &[
+                IndexingOp::AddMultipleDoc {
+                    id: 7,
+                    num_docs: 800,
+                    value: IndexValue::U64(0),
+                },
+                IndexingOp::AddMultipleDoc {
+                    id: 92,
+                    num_docs: 800,
+                    value: IndexValue::U64(0),
+                },
+                IndexingOp::AddMultipleDoc {
+                    id: 30,
+                    num_docs: 800,
+                    value: IndexValue::U64(0),
+                },
+                IndexingOp::AddMultipleDoc {
+                    id: 33,
+                    num_docs: 800,
+                    value: IndexValue::U64(0),
+                },
+            ],
+            true,
+            false
+        )
+        .is_ok());
+    }
+
    #[test]
    fn test_ip_range_query_multivalue_bug() {
        assert!(test_operation_strategy(
            &[
-                IndexingOp::AddDoc { id: 2 },
+                IndexingOp::add(2),
                IndexingOp::Commit,
-                IndexingOp::AddDoc { id: 1 },
-                IndexingOp::AddDoc { id: 1 },
+                IndexingOp::add(1),
+                IndexingOp::add(1),
                IndexingOp::Commit,
                IndexingOp::Merge
            ],
@@ -2290,11 +2424,11 @@ mod tests {
    fn test_ff_num_ips_regression() {
        assert!(test_operation_strategy(
            &[
-                IndexingOp::AddDoc { id: 13 },
-                IndexingOp::AddDoc { id: 1 },
+                IndexingOp::add(13),
+                IndexingOp::add(1),
                IndexingOp::Commit,
                IndexingOp::DeleteDocQuery { id: 13 },
-                IndexingOp::AddDoc { id: 1 },
+                IndexingOp::add(1),
                IndexingOp::Commit,
            ],
            false,
@@ -2306,7 +2440,7 @@ mod tests {
    #[test]
    fn test_minimal_sort_force_end_merge() {
        assert!(test_operation_strategy(
-            &[IndexingOp::AddDoc { id: 23 }, IndexingOp::AddDoc { id: 13 },],
+            &[IndexingOp::add(23), IndexingOp::add(13),],
            false,
            false
        )
@@ -2367,8 +2501,8 @@ mod tests {
    fn test_minimal_sort_force_end_merge_with_delete() {
        assert!(test_operation_strategy(
            &[
-                IndexingOp::AddDoc { id: 23 },
-                IndexingOp::AddDoc { id: 13 },
+                IndexingOp::add(23),
+                IndexingOp::add(13),
                IndexingOp::DeleteDoc { id: 13 }
            ],
            true,
@@ -2381,8 +2515,8 @@ mod tests {
    fn test_minimal_no_sort_no_force_end_merge() {
        assert!(test_operation_strategy(
            &[
-                IndexingOp::AddDoc { id: 23 },
-                IndexingOp::AddDoc { id: 13 },
+                IndexingOp::add(23),
+                IndexingOp::add(13),
                IndexingOp::DeleteDoc { id: 13 }
            ],
            false,
@@ -2393,7 +2527,7 @@ mod tests {

    #[test]
    fn test_minimal_sort_merge() {
-        assert!(test_operation_strategy(&[IndexingOp::AddDoc { id: 3 },], true, true).is_ok());
+        assert!(test_operation_strategy(&[IndexingOp::add(3),], true, true).is_ok());
    }

    use proptest::prelude::*;
@@ -2489,14 +2623,14 @@ mod tests {
    fn test_delete_bug_reproduction_ip_addr() {
        use IndexingOp::*;
        let ops = &[
-            AddDoc { id: 1 },
-            AddDoc { id: 2 },
+            IndexingOp::add(1),
+            IndexingOp::add(2),
            Commit,
-            AddDoc { id: 3 },
+            IndexingOp::add(3),
            DeleteDoc { id: 1 },
            Commit,
            Merge,
-            AddDoc { id: 4 },
+            IndexingOp::add(4),
            Commit,
        ];
        test_operation_strategy(&ops[..], false, true).unwrap();
@@ -2505,7 +2639,13 @@ mod tests {
    #[test]
    fn test_merge_regression_1() {
        use IndexingOp::*;
-        let ops = &[AddDoc { id: 15 }, Commit, AddDoc { id: 9 }, Commit, Merge];
+        let ops = &[
+            IndexingOp::add(15),
+            Commit,
+            IndexingOp::add(9),
+            Commit,
+            Merge,
+        ];
        test_operation_strategy(&ops[..], false, true).unwrap();
    }

@@ -2513,9 +2653,9 @@ mod tests {
    fn test_range_query_bug_1() {
        use IndexingOp::*;
        let ops = &[
-            AddDoc { id: 9 },
-            AddDoc { id: 0 },
-            AddDoc { id: 13 },
+            IndexingOp::add(9),
+            IndexingOp::add(0),
+            IndexingOp::add(13),
            Commit,
        ];
        test_operation_strategy(&ops[..], false, true).unwrap();
@@ -2523,12 +2663,11 @@ mod tests {

    #[test]
    fn test_range_query_bug_2() {
-        use IndexingOp::*;
        let ops = &[
-            AddDoc { id: 3 },
-            AddDoc { id: 6 },
-            AddDoc { id: 9 },
-            AddDoc { id: 10 },
+            IndexingOp::add(3),
+            IndexingOp::add(6),
+            IndexingOp::add(9),
+            IndexingOp::add(10),
        ];
        test_operation_strategy(&ops[..], false, false).unwrap();
    }
@@ -2550,7 +2689,7 @@ mod tests {
        assert!(test_operation_strategy(
            &[
                IndexingOp::DeleteDoc { id: 0 },
-                IndexingOp::AddDoc { id: 6 },
+                IndexingOp::add(6),
                IndexingOp::DeleteDocQuery { id: 11 },
                IndexingOp::Commit,
                IndexingOp::Merge,
@@ -2567,10 +2706,13 @@ mod tests {
    fn test_bug_1617_2() {
        assert!(test_operation_strategy(
            &[
-                IndexingOp::AddDoc { id: 13 },
+                IndexingOp::AddDoc {
+                    id: 13,
+                    value: Default::default()
+                },
                IndexingOp::DeleteDoc { id: 13 },
                IndexingOp::Commit,
-                IndexingOp::AddDoc { id: 30 },
+                IndexingOp::add(30),
                IndexingOp::Commit,
                IndexingOp::Merge,
            ],
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -696,7 +696,7 @@ impl IndexMerger {
            for old_doc_addr in doc_id_mapping.iter_old_doc_addrs() {
                let doc_bytes_it = &mut document_iterators[old_doc_addr.segment_ord as usize];
                if let Some(doc_bytes_res) = doc_bytes_it.next() {
-                    let (_, doc_bytes) = doc_bytes_res?;
+                    let doc_bytes = doc_bytes_res?;
                    store_writer.store_bytes(&doc_bytes)?;
                } else {
                    return Err(DataCorruption::comment_only(format!(
@@ -728,7 +728,7 @@ impl IndexMerger {
                    || store_reader.decompressor() != store_writer.compressor().into()
                {
                    for doc_bytes_res in store_reader.iter_raw(reader.alive_bitset()) {
-                        let (_, doc_bytes) = doc_bytes_res?;
+                        let doc_bytes = doc_bytes_res?;
                        store_writer.store_bytes(&doc_bytes)?;
                    }
                } else {
@@ -787,6 +787,8 @@ impl IndexMerger {
 mod tests {

    use columnar::Column;
+    use proptest::prop_oneof;
+    use proptest::strategy::Strategy;
    use schema::FAST;

    use crate::collector::tests::{
@@ -794,10 +796,11 @@ mod tests {
    };
    use crate::collector::{Count, FacetCollector};
    use crate::index::{Index, SegmentId};
+    use crate::indexer::NoMergePolicy;
    use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery};
    use crate::schema::{
        Facet, FacetOptions, IndexRecordOption, NumericOptions, TantivyDocument, Term,
-        TextFieldIndexing, INDEXED, TEXT,
+        TextFieldIndexing, Value, INDEXED, TEXT,
    };
    use crate::time::OffsetDateTime;
    use crate::{
@@ -909,15 +912,24 @@ mod tests {
            }
            {
                let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 0))?;
-                assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("af b"));
+                assert_eq!(
+                    doc.get_first(text_field).unwrap().as_value().as_str(),
+                    Some("af b")
+                );
            }
            {
                let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 1))?;
-                assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c"));
+                assert_eq!(
+                    doc.get_first(text_field).unwrap().as_value().as_str(),
+                    Some("a b c")
+                );
            }
            {
                let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 2))?;
-                assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c d"));
+                assert_eq!(
+                    doc.get_first(text_field).unwrap().as_value().as_str(),
+                    Some("a b c d")
+                );
            }
            {
                let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 3))?;
@@ -1522,6 +1534,112 @@ mod tests {
        Ok(())
    }

+    #[derive(Debug, Clone, Copy, Eq, PartialEq)]
+    enum IndexingOp {
+        ZeroVal,
+        OneVal { val: u64 },
+        TwoVal { val: u64 },
+        Commit,
+    }
+
+    fn balanced_operation_strategy() -> impl Strategy<Value = IndexingOp> {
+        prop_oneof![
+            (0u64..1u64).prop_map(|_| IndexingOp::ZeroVal),
+            (0u64..1u64).prop_map(|val| IndexingOp::OneVal { val }),
+            (0u64..1u64).prop_map(|val| IndexingOp::TwoVal { val }),
+            (0u64..1u64).prop_map(|_| IndexingOp::Commit),
+        ]
+    }
+
+    use proptest::prelude::*;
+    proptest! {
+        #[test]
+        fn test_merge_columnar_int_proptest(ops in proptest::collection::vec(balanced_operation_strategy(), 1..20)) {
+            assert!(test_merge_int_fields(&ops[..]).is_ok());
+        }
+    }
+    fn test_merge_int_fields(ops: &[IndexingOp]) -> crate::Result<()> {
+        if ops.iter().all(|op| *op == IndexingOp::Commit) {
+            return Ok(());
+        }
+        let expected_doc_and_vals: Vec<(u32, Vec<u64>)> = ops
+            .iter()
+            .filter(|op| *op != &IndexingOp::Commit)
+            .map(|op| match op {
+                IndexingOp::ZeroVal => vec![],
+                IndexingOp::OneVal { val } => vec![*val],
+                IndexingOp::TwoVal { val } => vec![*val, *val],
+                IndexingOp::Commit => unreachable!(),
+            })
+            .enumerate()
+            .map(|(id, val)| (id as u32, val))
+            .collect();
+
+        let mut schema_builder = schema::Schema::builder();
+        let int_options = NumericOptions::default().set_fast().set_indexed();
+        let int_field = schema_builder.add_u64_field("intvals", int_options);
+        let index = Index::create_in_ram(schema_builder.build());
+        {
+            let mut index_writer = index.writer_for_tests()?;
+            index_writer.set_merge_policy(Box::new(NoMergePolicy));
+            let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| {
+                let mut doc = TantivyDocument::default();
+                for &val in int_vals {
+                    doc.add_u64(int_field, val);
+                }
+                index_writer.add_document(doc).unwrap();
+            };
+
+            for op in ops {
+                match op {
+                    IndexingOp::ZeroVal => index_doc(&mut index_writer, &[]),
+                    IndexingOp::OneVal { val } => index_doc(&mut index_writer, &[*val]),
+                    IndexingOp::TwoVal { val } => index_doc(&mut index_writer, &[*val, *val]),
+                    IndexingOp::Commit => {
+                        index_writer.commit().expect("commit failed");
+                    }
+                }
+            }
+            index_writer.commit().expect("commit failed");
+        }
+        {
+            let mut segment_ids = index.searchable_segment_ids()?;
+            segment_ids.sort();
+            let mut index_writer: IndexWriter = index.writer_for_tests()?;
+            index_writer.merge(&segment_ids).wait()?;
+            index_writer.wait_merging_threads()?;
+        }
+        let reader = index.reader()?;
+        reader.reload()?;
+
+        let mut vals: Vec<u64> = Vec::new();
+        let mut test_vals = move |col: &Column<u64>, doc: DocId, expected: &[u64]| {
+            vals.clear();
+            vals.extend(col.values_for_doc(doc));
+            assert_eq!(&vals[..], expected);
+        };
+
+        let mut test_col = move |col: &Column<u64>, column_expected: &[(u32, Vec<u64>)]| {
+            for (doc_id, vals) in column_expected.iter() {
+                test_vals(col, *doc_id, vals);
+            }
+        };
+
+        {
+            let searcher = reader.searcher();
+            let segment = searcher.segment_reader(0u32);
+            let col = segment
+                .fast_fields()
+                .column_opt::<u64>("intvals")
+                .unwrap()
+                .unwrap();
+
+            test_col(&col, &expected_doc_and_vals);
+        }
+
+        Ok(())
+    }
+
    #[test]
    fn test_merge_multivalued_int_fields_simple() -> crate::Result<()> {
        let mut schema_builder = schema::Schema::builder();
--- a/src/indexer/merger_sorted_index_test.rs
+++ b/src/indexer/merger_sorted_index_test.rs
@@ -7,7 +7,7 @@ mod tests {
    use crate::query::QueryParser;
    use crate::schema::{
        self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions,
-        TextFieldIndexing, TextOptions,
+        TextFieldIndexing, TextOptions, Value,
    };
    use crate::{
        DocAddress, DocSet, IndexSettings, IndexSortByField, IndexWriter, Order, TantivyDocument,
@@ -280,13 +280,16 @@ mod tests {
                .doc::<TantivyDocument>(DocAddress::new(0, blubber_pos))
                .unwrap();
            assert_eq!(
-                doc.get_first(my_text_field).unwrap().as_str(),
+                doc.get_first(my_text_field).unwrap().as_value().as_str(),
                Some("blubber")
            );
            let doc = searcher
                .doc::<TantivyDocument>(DocAddress::new(0, 0))
                .unwrap();
-            assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1000));
+            assert_eq!(
+                doc.get_first(int_field).unwrap().as_value().as_u64(),
+                Some(1000)
+            );
        }
    }

--- a/src/indexer/mod.rs
+++ b/src/indexer/mod.rs
@@ -216,7 +216,7 @@ mod tests_mmap {
        let test_query = |query_str: &str| {
            let query = parse_query.parse_query(query_str).unwrap();
            let num_docs = searcher.search(&query, &Count).unwrap();
-            assert_eq!(num_docs, 1, "{}", query_str);
+            assert_eq!(num_docs, 1, "{query_str}");
        };
        test_query(format!("json.{field_name_out}:test1").as_str());
        test_query(format!("json.a{field_name_out}:test2").as_str());
@@ -590,10 +590,10 @@ mod tests_mmap {
        let query_parser = QueryParser::for_index(&index, vec![]);
        // Test if field name can be queried
        for (indexed_field, val) in fields_and_vals.iter() {
-            let query_str = &format!("{}:{}", indexed_field, val);
+            let query_str = &format!("{indexed_field}:{val}");
            let query = query_parser.parse_query(query_str).unwrap();
            let count_docs = searcher.search(&*query, &TopDocs::with_limit(2)).unwrap();
-            assert!(!count_docs.is_empty(), "{}:{}", indexed_field, val);
+            assert!(!count_docs.is_empty(), "{indexed_field}:{val}");
        }
        // Test if field name can be used for aggregation
        for (field_name, val) in fields_and_vals.iter() {
--- a/src/indexer/segment_writer.rs
+++ b/src/indexer/segment_writer.rs
@@ -202,9 +202,8 @@ impl SegmentWriter {
            match field_entry.field_type() {
                FieldType::Facet(_) => {
                    let mut facet_tokenizer = FacetTokenizer::default(); // this can be global
-                    for value_access in values {
-                        // Used to help with linting and type checking.
-                        let value = value_access as D::Value<'_>;
+                    for value in values {
+                        let value = value.as_value();

                        let facet_str = value.as_facet().ok_or_else(make_schema_error)?;
                        let mut facet_tokenizer = facet_tokenizer.token_stream(facet_str);
@@ -220,15 +219,14 @@ impl SegmentWriter {
                }
                FieldType::Str(_) => {
                    let mut indexing_position = IndexingPosition::default();
-                    for value_access in values {
-                        // Used to help with linting and type checking.
-                        let value = value_access as D::Value<'_>;
+                    for value in values {
+                        let value = value.as_value();

                        let mut token_stream = if let Some(text) = value.as_str() {
                            let text_analyzer =
                                &mut self.per_field_text_analyzers[field.field_id() as usize];
                            text_analyzer.token_stream(text)
-                        } else if let Some(tok_str) = value.as_pre_tokenized_text() {
+                        } else if let Some(tok_str) = value.into_pre_tokenized_text() {
                            BoxTokenStream::new(PreTokenizedStream::from(*tok_str.clone()))
                        } else {
                            continue;
@@ -250,9 +248,8 @@ impl SegmentWriter {
                }
                FieldType::U64(_) => {
                    let mut num_vals = 0;
-                    for value_access in values {
-                        // Used to help with linting and type checking.
-                        let value = value_access as D::Value<'_>;
+                    for value in values {
+                        let value = value.as_value();

                        num_vals += 1;
                        let u64_val = value.as_u64().ok_or_else(make_schema_error)?;
@@ -265,10 +262,8 @@ impl SegmentWriter {
                }
                FieldType::Date(_) => {
                    let mut num_vals = 0;
-                    for value_access in values {
-                        // Used to help with linting and type checking.
-                        let value_access = value_access as D::Value<'_>;
-                        let value = value_access.as_value();
+                    for value in values {
+                        let value = value.as_value();

                        num_vals += 1;
                        let date_val = value.as_datetime().ok_or_else(make_schema_error)?;
@@ -282,9 +277,8 @@ impl SegmentWriter {
                }
                FieldType::I64(_) => {
                    let mut num_vals = 0;
-                    for value_access in values {
-                        // Used to help with linting and type checking.
-                        let value = value_access as D::Value<'_>;
+                    for value in values {
+                        let value = value.as_value();

                        num_vals += 1;
                        let i64_val = value.as_i64().ok_or_else(make_schema_error)?;
@@ -297,10 +291,8 @@ impl SegmentWriter {
                }
                FieldType::F64(_) => {
                    let mut num_vals = 0;
-                    for value_access in values {
-                        // Used to help with linting and type checking.
-                        let value = value_access as D::Value<'_>;
-
+                    for value in values {
+                        let value = value.as_value();
                        num_vals += 1;
                        let f64_val = value.as_f64().ok_or_else(make_schema_error)?;
                        term_buffer.set_f64(f64_val);
@@ -312,10 +304,8 @@ impl SegmentWriter {
                }
                FieldType::Bool(_) => {
                    let mut num_vals = 0;
-                    for value_access in values {
-                        // Used to help with linting and type checking.
-                        let value = value_access as D::Value<'_>;
-
+                    for value in values {
+                        let value = value.as_value();
                        num_vals += 1;
                        let bool_val = value.as_bool().ok_or_else(make_schema_error)?;
                        term_buffer.set_bool(bool_val);
@@ -327,10 +317,8 @@ impl SegmentWriter {
                }
                FieldType::Bytes(_) => {
                    let mut num_vals = 0;
-                    for value_access in values {
-                        // Used to help with linting and type checking.
-                        let value = value_access as D::Value<'_>;
-
+                    for value in values {
+                        let value = value.as_value();
                        num_vals += 1;
                        let bytes = value.as_bytes().ok_or_else(make_schema_error)?;
                        term_buffer.set_bytes(bytes);
@@ -364,9 +352,8 @@ impl SegmentWriter {
                }
                FieldType::IpAddr(_) => {
                    let mut num_vals = 0;
-                    for value_access in values {
-                        // Used to help with linting and type checking.
-                        let value = value_access as D::Value<'_>;
+                    for value in values {
+                        let value = value.as_value();

                        num_vals += 1;
                        let ip_addr = value.as_ip_addr().ok_or_else(make_schema_error)?;
@@ -500,8 +487,8 @@ mod tests {
    use crate::postings::{Postings, TermInfo};
    use crate::query::{PhraseQuery, QueryParser};
    use crate::schema::{
-        Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, STORED,
-        STRING, TEXT,
+        Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, Value,
+        STORED, STRING, TEXT,
    };
    use crate::store::{Compressor, StoreReader, StoreWriter};
    use crate::time::format_description::well_known::Rfc3339;
@@ -555,9 +542,12 @@ mod tests {
        let doc = reader.get::<TantivyDocument>(0).unwrap();

        assert_eq!(doc.field_values().count(), 2);
-        assert_eq!(doc.get_all(text_field).next().unwrap().as_str(), Some("A"));
        assert_eq!(
-            doc.get_all(text_field).nth(1).unwrap().as_str(),
+            doc.get_all(text_field).next().unwrap().as_value().as_str(),
+            Some("A")
+        );
+        assert_eq!(
+            doc.get_all(text_field).nth(1).unwrap().as_value().as_str(),
            Some("title")
        );
    }
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -397,16 +397,20 @@ pub mod tests {
    #[macro_export]
    macro_rules! assert_nearly_equals {
        ($left:expr, $right:expr) => {{
-            match (&$left, &$right) {
-                (left_val, right_val) => {
+            assert_nearly_equals!($left, $right, 0.0005);
+        }};
+        ($left:expr, $right:expr, $epsilon:expr) => {{
+            match (&$left, &$right, &$epsilon) {
+                (left_val, right_val, epsilon_val) => {
                    let diff = (left_val - right_val).abs();
-                    let add = left_val.abs() + right_val.abs();
-                    if diff > 0.0005 * add {
+
+                    if diff > *epsilon_val {
                        panic!(
-                            r#"assertion failed: `(left ~= right)`
-  left: `{:?}`,
- right: `{:?}`"#,
-                            &*left_val, &*right_val
+                            r#"assertion failed: `abs(left-right)>epsilon`
+    left: `{:?}`,
+    right: `{:?}`,
+    epsilon: `{:?}`"#,
+                            &*left_val, &*right_val, &*epsilon_val
                        )
                    }
                }
--- a/src/query/fuzzy_query.rs
+++ b/src/query/fuzzy_query.rs
@@ -138,8 +138,7 @@ impl FuzzyTermQuery {
                if json_path_type != Type::Str {
                    return Err(InvalidArgument(format!(
                        "The fuzzy term query requires a string path type for a json term. Found \
-                         {:?}",
-                        json_path_type
+                         {json_path_type:?}"
                    )));
                }
            }
--- a/src/query/query_parser/logical_ast.rs
+++ b/src/query/query_parser/logical_ast.rs
@@ -2,7 +2,7 @@ use std::fmt;
 use std::ops::Bound;

 use crate::query::Occur;
-use crate::schema::{Field, Term, Type};
+use crate::schema::{Term, Type};
 use crate::Score;

 #[derive(Clone)]
@@ -20,8 +20,6 @@ pub enum LogicalLiteral {
        upper: Bound<Term>,
    },
    Set {
-        field: Field,
-        value_type: Type,
        elements: Vec<Term>,
    },
    All,
--- a/src/query/query_parser/query_parser.rs
+++ b/src/query/query_parser/query_parser.rs
@@ -832,17 +832,11 @@ impl QueryParser {
                let (field, json_path) = try_tuple!(self
                    .split_full_path(&full_path)
                    .ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone())));
-                let field_entry = self.schema.get_field_entry(field);
-                let value_type = field_entry.field_type().value_type();
                let (elements, errors) = elements
                    .into_iter()
                    .map(|element| self.compute_boundary_term(field, json_path, &element))
                    .partition_result();
-                let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Set {
-                    elements,
-                    field,
-                    value_type,
-                }));
+                let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Set { elements }));
                (Some(logical_ast), errors)
            }
            UserInputLeaf::Exists { .. } => (
--- a/src/query/regex_query.rs
+++ b/src/query/regex_query.rs
@@ -185,7 +185,7 @@ mod test {
            Err(crate::TantivyError::InvalidArgument(msg)) => {
                assert!(msg.contains("error: unclosed group"))
            }
-            res => panic!("unexpected result: {:?}", res),
+            res => panic!("unexpected result: {res:?}"),
        }
    }
 }
--- a/src/schema/document/default_document.rs
+++ b/src/schema/document/default_document.rs
@@ -157,29 +157,24 @@ impl CompactDoc {
    }

    /// field_values accessor
-    pub fn field_values(
-        &self,
-    ) -> impl Iterator<Item = (Field, ReferenceValue<'_, CompactDocValue<'_>>)> {
+    pub fn field_values(&self) -> impl Iterator<Item = (Field, CompactDocValue<'_>)> {
        self.field_values.iter().map(|field_val| {
            let field = Field::from_field_id(field_val.field as u32);
-            let val = self.extract_value(field_val.value_addr).unwrap();
+            let val = self.get_compact_doc_value(field_val.value_addr);
            (field, val)
        })
    }

    /// Returns all of the `ReferenceValue`s associated the given field
-    pub fn get_all(
-        &self,
-        field: Field,
-    ) -> impl Iterator<Item = ReferenceValue<'_, CompactDocValue<'_>>> + '_ {
+    pub fn get_all(&self, field: Field) -> impl Iterator<Item = CompactDocValue<'_>> + '_ {
        self.field_values
            .iter()
            .filter(move |field_value| Field::from_field_id(field_value.field as u32) == field)
-            .map(|val| self.extract_value(val.value_addr).unwrap())
+            .map(|val| self.get_compact_doc_value(val.value_addr))
    }

    /// Returns the first `ReferenceValue` associated the given field
-    pub fn get_first(&self, field: Field) -> Option<ReferenceValue<'_, CompactDocValue<'_>>> {
+    pub fn get_first(&self, field: Field) -> Option<CompactDocValue<'_>> {
        self.get_all(field).next()
    }

@@ -299,58 +294,11 @@ impl CompactDoc {
        }
    }

-    fn extract_value(
-        &self,
-        ref_value: ValueAddr,
-    ) -> io::Result<ReferenceValue<'_, CompactDocValue<'_>>> {
-        match ref_value.type_id {
-            ValueType::Null => Ok(ReferenceValueLeaf::Null.into()),
-            ValueType::Str => {
-                let str_ref = self.extract_str(ref_value.val_addr);
-                Ok(ReferenceValueLeaf::Str(str_ref).into())
-            }
-            ValueType::Facet => {
-                let str_ref = self.extract_str(ref_value.val_addr);
-                Ok(ReferenceValueLeaf::Facet(str_ref).into())
-            }
-            ValueType::Bytes => {
-                let data = self.extract_bytes(ref_value.val_addr);
-                Ok(ReferenceValueLeaf::Bytes(data).into())
-            }
-            ValueType::U64 => self
-                .read_from::<u64>(ref_value.val_addr)
-                .map(ReferenceValueLeaf::U64)
-                .map(Into::into),
-            ValueType::I64 => self
-                .read_from::<i64>(ref_value.val_addr)
-                .map(ReferenceValueLeaf::I64)
-                .map(Into::into),
-            ValueType::F64 => self
-                .read_from::<f64>(ref_value.val_addr)
-                .map(ReferenceValueLeaf::F64)
-                .map(Into::into),
-            ValueType::Bool => Ok(ReferenceValueLeaf::Bool(ref_value.val_addr != 0).into()),
-            ValueType::Date => self
-                .read_from::<i64>(ref_value.val_addr)
-                .map(|ts| ReferenceValueLeaf::Date(DateTime::from_timestamp_nanos(ts)))
-                .map(Into::into),
-            ValueType::IpAddr => self
-                .read_from::<u128>(ref_value.val_addr)
-                .map(|num| ReferenceValueLeaf::IpAddr(Ipv6Addr::from_u128(num)))
-                .map(Into::into),
-            ValueType::PreTokStr => self
-                .read_from::<PreTokenizedString>(ref_value.val_addr)
-                .map(Into::into)
-                .map(ReferenceValueLeaf::PreTokStr)
-                .map(Into::into),
-            ValueType::Object => Ok(ReferenceValue::Object(CompactDocObjectIter::new(
-                self,
-                ref_value.val_addr,
-            )?)),
-            ValueType::Array => Ok(ReferenceValue::Array(CompactDocArrayIter::new(
-                self,
-                ref_value.val_addr,
-            )?)),
+    /// Get CompactDocValue for address
+    fn get_compact_doc_value(&self, value_addr: ValueAddr) -> CompactDocValue<'_> {
+        CompactDocValue {
+            container: self,
+            value_addr,
        }
    }

@@ -410,7 +358,7 @@ impl PartialEq for CompactDoc {
        let convert_to_comparable_map = |doc: &CompactDoc| {
            let mut field_value_set: HashMap<Field, HashSet<String>> = Default::default();
            for field_value in doc.field_values.iter() {
-                let value: OwnedValue = doc.extract_value(field_value.value_addr).unwrap().into();
+                let value: OwnedValue = doc.get_compact_doc_value(field_value.value_addr).into();
                let value = serde_json::to_string(&value).unwrap();
                field_value_set
                    .entry(Field::from_field_id(field_value.field as u32))
@@ -444,7 +392,19 @@ impl DocumentDeserialize for CompactDoc {
 #[derive(Debug, Clone, Copy)]
 pub struct CompactDocValue<'a> {
    container: &'a CompactDoc,
-    value: ValueAddr,
+    value_addr: ValueAddr,
+}
+impl PartialEq for CompactDocValue<'_> {
+    fn eq(&self, other: &Self) -> bool {
+        let value1: OwnedValue = (*self).into();
+        let value2: OwnedValue = (*other).into();
+        value1 == value2
+    }
+}
+impl<'a> From<CompactDocValue<'a>> for OwnedValue {
+    fn from(value: CompactDocValue) -> Self {
+        value.as_value().into()
+    }
 }
 impl<'a> Value<'a> for CompactDocValue<'a> {
    type ArrayIter = CompactDocArrayIter<'a>;
@@ -452,7 +412,67 @@ impl<'a> Value<'a> for CompactDocValue<'a> {
    type ObjectIter = CompactDocObjectIter<'a>;

    fn as_value(&self) -> ReferenceValue<'a, Self> {
-        self.container.extract_value(self.value).unwrap()
+        self.get_ref_value().unwrap()
+    }
+}
+impl<'a> CompactDocValue<'a> {
+    fn get_ref_value(&self) -> io::Result<ReferenceValue<'a, CompactDocValue<'a>>> {
+        let addr = self.value_addr.val_addr;
+        match self.value_addr.type_id {
+            ValueType::Null => Ok(ReferenceValueLeaf::Null.into()),
+            ValueType::Str => {
+                let str_ref = self.container.extract_str(addr);
+                Ok(ReferenceValueLeaf::Str(str_ref).into())
+            }
+            ValueType::Facet => {
+                let str_ref = self.container.extract_str(addr);
+                Ok(ReferenceValueLeaf::Facet(str_ref).into())
+            }
+            ValueType::Bytes => {
+                let data = self.container.extract_bytes(addr);
+                Ok(ReferenceValueLeaf::Bytes(data).into())
+            }
+            ValueType::U64 => self
+                .container
+                .read_from::<u64>(addr)
+                .map(ReferenceValueLeaf::U64)
+                .map(Into::into),
+            ValueType::I64 => self
+                .container
+                .read_from::<i64>(addr)
+                .map(ReferenceValueLeaf::I64)
+                .map(Into::into),
+            ValueType::F64 => self
+                .container
+                .read_from::<f64>(addr)
+                .map(ReferenceValueLeaf::F64)
+                .map(Into::into),
+            ValueType::Bool => Ok(ReferenceValueLeaf::Bool(addr != 0).into()),
+            ValueType::Date => self
+                .container
+                .read_from::<i64>(addr)
+                .map(|ts| ReferenceValueLeaf::Date(DateTime::from_timestamp_nanos(ts)))
+                .map(Into::into),
+            ValueType::IpAddr => self
+                .container
+                .read_from::<u128>(addr)
+                .map(|num| ReferenceValueLeaf::IpAddr(Ipv6Addr::from_u128(num)))
+                .map(Into::into),
+            ValueType::PreTokStr => self
+                .container
+                .read_from::<PreTokenizedString>(addr)
+                .map(Into::into)
+                .map(ReferenceValueLeaf::PreTokStr)
+                .map(Into::into),
+            ValueType::Object => Ok(ReferenceValue::Object(CompactDocObjectIter::new(
+                self.container,
+                addr,
+            )?)),
+            ValueType::Array => Ok(ReferenceValue::Array(CompactDocArrayIter::new(
+                self.container,
+                addr,
+            )?)),
+        }
    }
 }

@@ -537,7 +557,7 @@ impl BinarySerializable for ValueType {
        } else {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
-                format!("Invalid value type id: {}", num),
+                format!("Invalid value type id: {num}"),
            ));
        };
        Ok(type_id)
@@ -601,9 +621,9 @@ impl<'a> Iterator for CompactDocObjectIter<'a> {
        let value = ValueAddr::deserialize(&mut self.node_addresses_slice).ok()?;
        let value = CompactDocValue {
            container: self.container,
-            value,
+            value_addr: value,
        };
-        return Some((key, value));
+        Some((key, value))
    }
 }

@@ -635,9 +655,9 @@ impl<'a> Iterator for CompactDocArrayIter<'a> {
        let value = ValueAddr::deserialize(&mut self.node_addresses_slice).ok()?;
        let value = CompactDocValue {
            container: self.container,
-            value,
+            value_addr: value,
        };
-        return Some(value);
+        Some(value)
    }
 }

@@ -668,7 +688,7 @@ impl<'a> Iterator for FieldValueIterRef<'a> {
                Field::from_field_id(field_value.field as u32),
                CompactDocValue::<'a> {
                    container: self.container,
-                    value: field_value.value_addr,
+                    value_addr: field_value.value_addr,
                },
            )
        })
--- a/src/schema/document/se.rs
+++ b/src/schema/document/se.rs
@@ -58,9 +58,8 @@ where W: Write
            return Err(io::Error::new(
                io::ErrorKind::Other,
                format!(
-                    "Unexpected number of entries written to serializer, expected {} entries, got \
-                     {} entries",
-                    num_field_values, actual_length,
+                    "Unexpected number of entries written to serializer, expected \
+                     {num_field_values} entries, got {actual_length} entries",
                ),
            ));
        }
--- a/src/schema/document/value.rs
+++ b/src/schema/document/value.rs
@@ -17,15 +17,6 @@ pub trait Value<'a>: Send + Sync + Debug {
    /// Returns the field value represented by an enum which borrows it's data.
    fn as_value(&self) -> ReferenceValue<'a, Self>;

-    #[inline]
-    /// Returns if the value is `null` or not.
-    fn is_null(&self) -> bool {
-        matches!(
-            self.as_value(),
-            ReferenceValue::Leaf(ReferenceValueLeaf::Null)
-        )
-    }
-
    #[inline]
    /// If the Value is a leaf, returns the associated leaf. Returns None otherwise.
    fn as_leaf(&self) -> Option<ReferenceValueLeaf<'a>> {
@@ -117,18 +108,6 @@ pub trait Value<'a>: Send + Sync + Debug {
            None
        }
    }
-
-    #[inline]
-    /// Returns true if the Value is an array.
-    fn is_array(&self) -> bool {
-        matches!(self.as_value(), ReferenceValue::Object(_))
-    }
-
-    #[inline]
-    /// Returns true if the Value is an object.
-    fn is_object(&self) -> bool {
-        matches!(self.as_value(), ReferenceValue::Object(_))
-    }
 }

 /// A enum representing a leaf value for tantivy to index.
--- a/src/schema/field_type.rs
+++ b/src/schema/field_type.rs
@@ -659,9 +659,9 @@ mod tests {
        let schema = schema_builder.build();
        let doc_json = r#"{"date": "2019-10-12T07:20:50.52+02:00"}"#;
        let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
-        let date = doc.get_first(date_field).unwrap();
+        let date = OwnedValue::from(doc.get_first(date_field).unwrap());
        // Time zone is converted to UTC
-        assert_eq!("Leaf(Date(2019-10-12T05:20:50.52Z))", format!("{date:?}"));
+        assert_eq!("Date(2019-10-12T05:20:50.52Z)", format!("{date:?}"));
    }

    #[test]
--- a/src/store/mod.rs
+++ b/src/store/mod.rs
@@ -60,7 +60,7 @@ pub mod tests {
    use crate::directory::{Directory, RamDirectory, WritePtr};
    use crate::fastfield::AliveBitSet;
    use crate::schema::{
-        self, Schema, TantivyDocument, TextFieldIndexing, TextOptions, STORED, TEXT,
+        self, Schema, TantivyDocument, TextFieldIndexing, TextOptions, Value, STORED, TEXT,
    };
    use crate::{Index, IndexWriter, Term};

@@ -122,6 +122,7 @@ pub mod tests {
                    .get::<TantivyDocument>(i)?
                    .get_first(field_title)
                    .unwrap()
+                    .as_value()
                    .as_str()
                    .unwrap(),
                format!("Doc {i}")
@@ -133,6 +134,7 @@ pub mod tests {
            let title_content = doc
                .get_first(field_title)
                .unwrap()
+                .as_value()
                .as_str()
                .unwrap()
                .to_string();
--- a/src/store/reader.rs
+++ b/src/store/reader.rs
@@ -241,23 +241,12 @@ impl StoreReader {
        &'b self,
        alive_bitset: Option<&'a AliveBitSet>,
    ) -> impl Iterator<Item = crate::Result<D>> + 'b {
-        self.enumerate(alive_bitset)
-            .map(|res| res.map(|(_, doc)| doc))
-    }
-
-    /// A variant of [`iter`][Self::iter] which also yields document ID.
-    pub fn enumerate<'a: 'b, 'b, D: DocumentDeserialize>(
-        &'b self,
-        alive_bitset: Option<&'a AliveBitSet>,
-    ) -> impl Iterator<Item = crate::Result<(DocId, D)>> + 'b {
        self.iter_raw(alive_bitset).map(|doc_bytes_res| {
-            let (doc_id, mut doc_bytes) = doc_bytes_res?;
+            let mut doc_bytes = doc_bytes_res?;

            let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
                .map_err(crate::TantivyError::from)?;
-            let doc = D::deserialize(deserializer).map_err(crate::TantivyError::from)?;
-
-            Ok((doc_id, doc))
+            D::deserialize(deserializer).map_err(crate::TantivyError::from)
        })
    }

@@ -267,7 +256,7 @@ impl StoreReader {
    pub(crate) fn iter_raw<'a: 'b, 'b>(
        &'b self,
        alive_bitset: Option<&'a AliveBitSet>,
-    ) -> impl Iterator<Item = crate::Result<(DocId, OwnedBytes)>> + 'b {
+    ) -> impl Iterator<Item = crate::Result<OwnedBytes>> + 'b {
        let last_doc_id = self
            .block_checkpoints()
            .last()
@@ -295,14 +284,14 @@ impl StoreReader {

                let alive = alive_bitset.map_or(true, |bitset| bitset.is_alive(doc_id));
                let res = if alive {
-                    Some((doc_id, curr_block.clone(), doc_pos))
+                    Some((curr_block.clone(), doc_pos))
                } else {
                    None
                };
                doc_pos += 1;
                res
            })
-            .map(move |(doc_id, block, doc_pos)| {
+            .map(move |(block, doc_pos)| {
                let block = block
                    .ok_or_else(|| {
                        DataCorruption::comment_only(
@@ -315,7 +304,7 @@ impl StoreReader {
                    })?;

                let range = block_read_index(&block, doc_pos)?;
-                Ok((doc_id, block.slice(range)))
+                Ok(block.slice(range))
            })
    }

@@ -414,7 +403,7 @@ mod tests {

    use super::*;
    use crate::directory::RamDirectory;
-    use crate::schema::{Field, TantivyDocument};
+    use crate::schema::{Field, TantivyDocument, Value};
    use crate::store::tests::write_lorem_ipsum_store;
    use crate::store::Compressor;
    use crate::Directory;
@@ -422,7 +411,7 @@ mod tests {
    const BLOCK_SIZE: usize = 16_384;

    fn get_text_field<'a>(doc: &'a TantivyDocument, field: &'a Field) -> Option<&'a str> {
-        doc.get_first(*field).and_then(|f| f.as_str())
+        doc.get_first(*field).and_then(|f| f.as_value().as_str())
    }

    #[test]
--- a/src/termdict/fst_termdict/termdict.rs
+++ b/src/termdict/fst_termdict/termdict.rs
@@ -93,7 +93,7 @@ fn open_fst_index(fst_file: FileSlice) -> io::Result<tantivy_fst::Map<OwnedBytes
    let fst = Fst::new(bytes).map_err(|err| {
        io::Error::new(
            io::ErrorKind::InvalidData,
-            format!("Fst data is corrupted: {:?}", err),
+            format!("Fst data is corrupted: {err:?}"),
        )
    })?;
    Ok(tantivy_fst::Map::from(fst))
--- a/src/termdict/tests.rs
+++ b/src/termdict/tests.rs
@@ -95,7 +95,7 @@ fn test_term_dictionary_simple() -> crate::Result<()> {
 #[test]
 fn test_term_dictionary_stream() -> crate::Result<()> {
    let ids: Vec<_> = (0u32..10_000u32)
-        .map(|i| (format!("doc{:0>6}", i), i))
+        .map(|i| (format!("doc{i:0>6}"), i))
        .collect();
    let buffer: Vec<u8> = {
        let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
@@ -156,7 +156,7 @@ fn test_stream_high_range_prefix_suffix() -> crate::Result<()> {
 #[test]
 fn test_stream_range() -> crate::Result<()> {
    let ids: Vec<_> = (0u32..10_000u32)
-        .map(|i| (format!("doc{:0>6}", i), i))
+        .map(|i| (format!("doc{i:0>6}"), i))
        .collect();
    let buffer: Vec<u8> = {
        let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
--- a/src/tokenizer/facet_tokenizer.rs
+++ b/src/tokenizer/facet_tokenizer.rs
@@ -96,7 +96,7 @@ mod tests {
        {
            let mut add_token = |token: &Token| {
                let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap();
-                tokens.push(format!("{}", facet));
+                tokens.push(format!("{facet}"));
            };
            FacetTokenizer::default()
                .token_stream(facet.encoded_str())
@@ -116,7 +116,7 @@ mod tests {
        {
            let mut add_token = |token: &Token| {
                let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); // ok test
-                tokens.push(format!("{}", facet));
+                tokens.push(format!("{facet}"));
            };
            FacetTokenizer::default()
                .token_stream(facet.encoded_str()) // ok test
Author	SHA1	Message	Date
Pascal Seitz	2ce485b8cc	skip estimate phase for merge multivalue index precompute stats for merge multivalue index + disable Line encoding for multivalue index. That combination allows to skip the first estimation pass. This gives up to 2x on merge performance on multivalue indices. This change may decrease compression as Line is very good compressible for documents, which have a fixed amount of values in each doc. The line codec should be replaced. ``` merge_multi_and_multi Avg: 22.7880ms (-47.15%) Median: 22.5469ms (-47.38%) [22.3691ms .. 25.8392ms] merge_dense_and_dense Avg: 14.4398ms (+2.18%) Median: 14.2465ms (+0.74%) [14.1620ms .. 16.1270ms] merge_sparse_and_sparse Avg: 10.6559ms (+1.10%) Median: 10.6318ms (+0.91%) [10.5527ms .. 11.2848ms] merge_sparse_and_dense Avg: 12.4886ms (+1.52%) Median: 12.4044ms (+0.84%) [12.3261ms .. 13.9439ms] merge_multi_and_dense Avg: 25.6686ms (-45.56%) Median: 25.4851ms (-45.84%) [25.1618ms .. 27.6226ms] merge_multi_and_sparse Avg: 24.3278ms (-47.00%) Median: 24.1917ms (-47.34%) [23.7159ms .. 27.0513ms] ```	2024-06-11 20:22:00 +08:00
PSeitz	c3b92a5412	fix compiler warning, cleanup (#2393 ) fix compiler warning for missing feature flag remove unused variables cleanup unused methods	2024-06-11 16:03:50 +08:00
PSeitz	2f55511064	extend indexwriter proptests (#2342 ) * index random values in proptest * add proptest with multiple docs	2024-06-11 16:02:57 +08:00
trinity-1686a	08b9fc0b31	fix de-escaping too much in query parser (#2427 ) * fix de-escaping too much in query parser	2024-06-10 11:19:01 +02:00
PSeitz	714f363d43	add bench & test for columnar merging (#2428 ) * add merge columnar proptest * add columnar merge benchmark	2024-06-10 16:26:16 +08:00
PSeitz	93ff7365b0	reduce top hits aggregation memory consumption (#2426 ) move request structure out of top hits aggregation collector and use from the passed structure instead full terms_many_with_top_hits Memory: 58.2 MB (-43.64%) Avg: 425.9680ms (-21.38%) Median: 415.1097ms (-23.56%) [395.5303ms .. 484.6325ms] dense terms_many_with_top_hits Memory: 58.2 MB (-43.64%) Avg: 440.0817ms (-19.68%) Median: 432.2286ms (-21.10%) [403.5632ms .. 497.7541ms] sparse terms_many_with_top_hits Memory: 13.1 MB (-49.31%) Avg: 33.3568ms (-32.19%) Median: 33.0834ms (-31.86%) [32.5126ms .. 35.7397ms] multivalue terms_many_with_top_hits Memory: 58.2 MB (-43.64%) Avg: 414.2340ms (-25.44%) Median: 413.4144ms (-25.64%) [403.9919ms .. 430.3170ms]	2024-06-06 22:32:58 +08:00
Adam Reichold	8151925068	Panicking in spawned Rayon tasks will abort the process by default. (#2409 )	2024-06-04 17:04:30 +09:00
dependabot[bot]	b960e40bc8	Update sketches-ddsketch requirement from 0.2.1 to 0.3.0 (#2423 ) Updates the requirements on [sketches-ddsketch](https://github.com/mheffner/rust-sketches-ddsketch) to permit the latest version. - [Release notes](https://github.com/mheffner/rust-sketches-ddsketch/releases) - [Commits](https://github.com/mheffner/rust-sketches-ddsketch/compare/v0.2.1...v0.3.0) --- updated-dependencies: - dependency-name: sketches-ddsketch dependency-type: direct:production ... Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-06-04 15:50:23 +08:00
giovannicuccu	1095c9b073	Issue 1787 extended stats (#2247 ) * first version of extended stats along with its tests * using IntermediateExtendStats instead of IntermediateStats with all tests passing * Created struct for request and response * first test with extended_stats * kahan summation and tests with approximate equality * version ready for merge * removed approx dependency * refactor for using ExtendedStats only when needed * interim version * refined version with code formatted * refactored a struct * cosmetic refactor * fix after merge * fix format * added extended_stat bench * merge and new benchmark for extended stats * split stat segment collectors * wrapped intermediate extended stat with a box to limit memory usage * Revert "wrapped intermediate extended stat with a box to limit memory usage" This reverts commit `5b4aa9f393`. * some code reformat, commented kahan summation * refactor after review * refactor after code review * fix after incorrectly restoring kahan summation * modifications for code review + bug fix in merge_fruit * refactor assert_nearly_equals macro * update after code review --------- Co-authored-by: Giovanni Cuccu <gcuccu@imolainformatica.it>	2024-06-04 14:25:17 +08:00
PSeitz	c0686515a9	update one_shot (#2420 )	2024-05-31 11:07:35 +08:00
trinity-1686a	455156f51c	improve query parser (#2416 ) * support escape sequence in more place and fix bug with singlequoted strings * add query parser test for range query on default field	2024-05-30 17:29:27 +02:00
Meng Zhang	4143d31865	chore: fix build as the rev is gone (#2417 )	2024-05-29 09:49:16 +08:00
Hamir Mahal	0c634adbe1	style: simplify strings with string interpolation (#2412 ) * style: simplify strings with string interpolation * fix: formatting	2024-05-27 09:16:47 +02:00
PSeitz	2e3641c2ae	return CompactDocValue instead of trait (#2410 ) The CompactDocValue is easier to handle than the trait in some cases like comparison and conversion	2024-05-27 07:33:50 +02:00