Compare commits

..

19 Commits

Author SHA1 Message Date
Pascal Seitz
2ce485b8cc skip estimate phase for merge multivalue index
precompute stats for the merged multivalue index and disable Line encoding for
the multivalue index. That combination allows skipping the first estimation
pass and gives up to 2x merge performance on multivalue indices.

This change may decrease compression, since the Line codec compresses very well
for documents that have a fixed number of values per doc. The Line codec should
eventually be replaced.

```
merge_multi_and_multi          Avg: 22.7880ms (-47.15%)    Median: 22.5469ms (-47.38%)    [22.3691ms .. 25.8392ms]
merge_dense_and_dense          Avg: 14.4398ms (+2.18%)     Median: 14.2465ms (+0.74%)     [14.1620ms .. 16.1270ms]
merge_sparse_and_sparse        Avg: 10.6559ms (+1.10%)     Median: 10.6318ms (+0.91%)     [10.5527ms .. 11.2848ms]
merge_sparse_and_dense         Avg: 12.4886ms (+1.52%)     Median: 12.4044ms (+0.84%)     [12.3261ms .. 13.9439ms]
merge_multi_and_dense          Avg: 25.6686ms (-45.56%)    Median: 25.4851ms (-45.84%)    [25.1618ms .. 27.6226ms]
merge_multi_and_sparse         Avg: 24.3278ms (-47.00%)    Median: 24.1917ms (-47.34%)    [23.7159ms .. 27.0513ms]
```
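For illustration, a minimal sketch of how the stats for the stacked multivalue start-offset column can be computed without scanning the values. The field values mirror `merge_column_index_stacked` in the diffs below; the helper and the locally mirrored struct are hypothetical and only keep the sketch self-contained.

```rust
use std::num::NonZeroU64;

// Local mirror of the crate-internal `ColumnStats` (see the column_values diff
// below); defined here only to keep the sketch self-contained.
struct ColumnStats {
    gcd: NonZeroU64,
    min_value: u64,
    max_value: u64,
    num_rows: u32,
}

// Hypothetical helper: the start offsets of a multivalue index grow
// monotonically from 0 to the total number of values, so their stats are known
// up front and the estimation pass over the column can be skipped.
fn precomputed_multivalue_stats(num_values: u32, num_docs: u32) -> ColumnStats {
    ColumnStats {
        gcd: NonZeroU64::new(1).unwrap(),
        // The values in the multivalue index are the positions of the values.
        min_value: 0,
        max_value: num_values as u64,
        // One start offset per doc, plus one trailing end offset.
        num_rows: num_docs + 1,
    }
}
```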
2024-06-11 20:22:00 +08:00
PSeitz
c3b92a5412 fix compiler warning, cleanup (#2393)
fix compiler warning for missing feature flag
remove unused variables
cleanup unused methods
2024-06-11 16:03:50 +08:00
PSeitz
2f55511064 extend indexwriter proptests (#2342)
* index random values in proptest

* add proptest with multiple docs
2024-06-11 16:02:57 +08:00
trinity-1686a
08b9fc0b31 fix de-escaping too much in query parser (#2427)
* fix de-escaping too much in query parser
2024-06-10 11:19:01 +02:00
PSeitz
714f363d43 add bench & test for columnar merging (#2428)
* add merge columnar proptest

* add columnar merge benchmark
2024-06-10 16:26:16 +08:00
PSeitz
93ff7365b0 reduce top hits aggregation memory consumption (#2426)
move the request structure out of the top hits aggregation segment collector and
use the request passed in by reference instead

```
full
terms_many_with_top_hits    Memory: 58.2 MB (-43.64%)    Avg: 425.9680ms (-21.38%)    Median: 415.1097ms (-23.56%)    [395.5303ms .. 484.6325ms]
dense
terms_many_with_top_hits    Memory: 58.2 MB (-43.64%)    Avg: 440.0817ms (-19.68%)    Median: 432.2286ms (-21.10%)    [403.5632ms .. 497.7541ms]
sparse
terms_many_with_top_hits    Memory: 13.1 MB (-49.31%)    Avg: 33.3568ms (-32.19%)    Median: 33.0834ms (-31.86%)    [32.5126ms .. 35.7397ms]
multivalue
terms_many_with_top_hits    Memory: 58.2 MB (-43.64%)    Avg: 414.2340ms (-25.44%)    Median: 413.4144ms (-25.64%)    [403.9919ms .. 430.3170ms]
```
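The underlying pattern, visible in the top_hits diff further down: the per-segment collector no longer stores a clone of the request and instead borrows it at collect time. A rough sketch with simplified stand-in types, not the crate's real ones:

```rust
// Simplified stand-in types, only to show the shape of the refactor: the
// request lives once per aggregation, and segment collectors borrow it instead
// of each holding a clone.
struct TopHitsRequest {
    size: usize,
}

struct TopHitsSegmentCollector {
    accessor_idx: usize, // identifies which request/accessors this collector uses
    top_docs: Vec<u32>,  // previously the collector also stored a cloned request
}

impl TopHitsSegmentCollector {
    // The request is passed by reference at collect time, so it is no longer
    // cloned into every segment collector; that is where the memory saving in
    // the numbers above comes from.
    fn collect_with(&mut self, doc_id: u32, req: &TopHitsRequest) {
        if self.top_docs.len() < req.size {
            self.top_docs.push(doc_id);
        }
    }
}
```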
2024-06-06 22:32:58 +08:00
Adam Reichold
8151925068 Panicking in spawned Rayon tasks will abort the process by default. (#2409) 2024-06-04 17:04:30 +09:00
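For context, one generic way to get "a panic in a spawned task aborts the process" semantics; this is an illustrative pattern only, not necessarily what #2409 does:

```rust
use std::panic::{catch_unwind, AssertUnwindSafe};

// Illustrative only: run the task and abort the whole process if it panics,
// instead of leaving the thread pool in an undefined state.
fn spawn_abort_on_panic<F: FnOnce() + Send + 'static>(pool: &rayon::ThreadPool, task: F) {
    pool.spawn(move || {
        if catch_unwind(AssertUnwindSafe(task)).is_err() {
            std::process::abort();
        }
    });
}
```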
dependabot[bot]
b960e40bc8 Update sketches-ddsketch requirement from 0.2.1 to 0.3.0 (#2423)
Updates the requirements on [sketches-ddsketch](https://github.com/mheffner/rust-sketches-ddsketch) to permit the latest version.
- [Release notes](https://github.com/mheffner/rust-sketches-ddsketch/releases)
- [Commits](https://github.com/mheffner/rust-sketches-ddsketch/compare/v0.2.1...v0.3.0)

---
updated-dependencies:
- dependency-name: sketches-ddsketch
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-06-04 15:50:23 +08:00
giovannicuccu
1095c9b073 Issue 1787 extended stats (#2247)
* first version of extended stats along with its tests

* using IntermediateExtendStats instead of IntermediateStats with all tests passing

* Created struct for request and response

* first test with extended_stats

* kahan summation and tests with approximate equality

* version ready for merge

* removed approx dependency

* refactor for using ExtendedStats only when needed

* interim version

* refined version with code formatted

* refactored a struct

* cosmetic refactor

* fix after merge

* fix format

* added extended_stat bench

* merge and new benchmark for extended stats

* split stat segment collectors

* wrapped intermediate extended stat with a box to limit memory usage

* Revert "wrapped intermediate extended stat with a box to limit memory usage"

This reverts commit 5b4aa9f393.

* some code reformat, commented kahan summation

* refactor after review

* refactor after code review

* fix after incorrectly restoring kahan summation

* modifications for code review + bug fix in merge_fruit

* refactor assert_nearly_equals macro

* update after code review

---------

Co-authored-by: Giovanni Cuccu <gcuccu@imolainformatica.it>
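For reference, an extended_stats request has the same shape as the other metric aggregations. This minimal sketch mirrors the `extendedstats_f64` benchmark added in the diffs below ("score_f64" is that benchmark's field name; the wrapper function is only for illustration):

```rust
use serde_json::json;

// An `extended_stats` aggregation request, mirroring the `extendedstats_f64`
// benchmark added in the diffs below.
fn extended_stats_request() -> serde_json::Value {
    json!({
        "extendedstats_f64": { "extended_stats": { "field": "score_f64" } }
    })
}
```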
2024-06-04 14:25:17 +08:00
PSeitz
c0686515a9 update one_shot (#2420) 2024-05-31 11:07:35 +08:00
trinity-1686a
455156f51c improve query parser (#2416)
* support escape sequences in more places

and fix a bug with single-quoted strings

* add query parser test for range query on default field
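The behaviour is easiest to see from the parser tests added in this range; the calls below are copied from those tests (`test_parse_query_to_ast_helper` is the existing test helper, the right-hand side is the expected AST rendering, and the wrapping test name is only illustrative):

```rust
#[test]
fn escape_and_default_field_range_examples() {
    // Range queries on the default field:
    test_parse_query_to_ast_helper(">a", "{\"a\" TO \"*\"}");
    test_parse_query_to_ast_helper("<=a", "{\"*\" TO \"a\"]");
    // An escaped ':' is de-escaped inside a bare term:
    test_parse_query_to_ast_helper(r#"abc\:def"#, r#"abc:def"#);
    // '*' does not require escaping, so the escape sequence is kept verbatim:
    test_parse_query_to_ast_helper(r#"abc\*"#, r#"abc\*"#);
}
```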
2024-05-30 17:29:27 +02:00
Meng Zhang
4143d31865 chore: fix build as the rev is gone (#2417) 2024-05-29 09:49:16 +08:00
Hamir Mahal
0c634adbe1 style: simplify strings with string interpolation (#2412)
* style: simplify strings with string interpolation

* fix: formatting
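The change itself is mechanical; a minimal before/after pair in the spirit of the hunks below:

```rust
fn main() {
    let num = 42;
    // before: positional format argument
    let old = format!("author{}", num);
    // after: the identifier is captured directly in the format string
    let new = format!("author{num}");
    assert_eq!(old, new);
}
```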
2024-05-27 09:16:47 +02:00
PSeitz
2e3641c2ae return CompactDocValue instead of trait (#2410)
The CompactDocValue is easier to handle than the trait in some cases like comparison
and conversion
2024-05-27 07:33:50 +02:00
Paul Masurel
b806122c81 Fixing flaky test (#2407) 2024-05-22 10:10:55 +09:00
PSeitz
e1679f3fb9 compact doc (#2402)
* compact doc

* add any value type

* pass references when building CompactDoc

* remove OwnedValue from API

* clippy

* clippy

* fail on large documents

* fmt

* cleanup

* cleanup

* implement Value for different types

fix serde_json date Value implementation

* fmt

* cleanup

* fmt

* cleanup

* store positions instead of pos+len

* remove nodes array

* remove mediumvec

* cleanup

* infallible serialize into vec

* remove positions indirection

* remove 24MB limitation in document

use u32 for Addr
Remove the 3-byte addressing limitation and use VInt instead (an illustrative
vint sketch follows at the end of this list)

* cleanup

* extend test

* cleanup, add comments

* rename, remove pub
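For context, a generic sketch of variable-length integer (vint) encoding for a u32 address, in the LEB128 style with 7 payload bits per byte. This illustrates the idea only and is not necessarily the crate's exact wire format:

```rust
/// Encode a u32 as a variable-length integer: 7 payload bits per byte, with the
/// high bit set on every byte except the last.
fn vint_encode_u32(mut val: u32, out: &mut Vec<u8>) {
    loop {
        let byte = (val & 0x7F) as u8;
        val >>= 7;
        if val == 0 {
            out.push(byte);
            return;
        }
        out.push(byte | 0x80);
    }
}

/// Decode a vint-encoded u32, returning the value and the number of bytes read.
fn vint_decode_u32(data: &[u8]) -> (u32, usize) {
    let mut result = 0u32;
    for (i, &b) in data.iter().enumerate() {
        result |= ((b & 0x7F) as u32) << (7 * i);
        if b & 0x80 == 0 {
            return (result, i + 1);
        }
    }
    panic!("truncated vint");
}
```

Small addresses stay compact while the full u32 range becomes addressable, which removes the previous fixed-width limit.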
2024-05-21 10:16:08 +02:00
dependabot[bot]
5a80420b10 --- (#2406)
updated-dependencies:
- dependency-name: binggan
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-05-21 04:36:32 +02:00
dependabot[bot]
aa26ff5029 Update binggan requirement from 0.6.2 to 0.7.0 (#2401)
---
updated-dependencies:
- dependency-name: binggan
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-05-17 02:53:25 +02:00
dependabot[bot]
e197b59258 Update itertools requirement from 0.12.0 to 0.13.0 (#2400)
Updates the requirements on [itertools](https://github.com/rust-itertools/itertools) to permit the latest version.
- [Changelog](https://github.com/rust-itertools/itertools/blob/master/CHANGELOG.md)
- [Commits](https://github.com/rust-itertools/itertools/compare/v0.12.0...v0.13.0)

---
updated-dependencies:
- dependency-name: itertools
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-05-17 02:53:02 +02:00
71 changed files with 3141 additions and 686 deletions

View File

@@ -15,7 +15,6 @@ rust-version = "1.63"
exclude = ["benches/*.json", "benches/*.txt"]
[dependencies]
# Switch back to the non-forked oneshot crate once https://github.com/faern/oneshot/pull/35 is merged
oneshot = "0.1.7"
base64 = "0.22.0"
byteorder = "1.4.3"
@@ -53,7 +52,7 @@ smallvec = "1.8.0"
rayon = "1.5.2"
lru = "0.12.0"
fastdivide = "0.4.0"
itertools = "0.12.0"
itertools = "0.13.0"
measure_time = "0.8.2"
arc-swap = "1.5.0"
@@ -64,7 +63,7 @@ query-grammar = { version = "0.22.0", path = "./query-grammar", package = "tanti
tantivy-bitpacker = { version = "0.6", path = "./bitpacker" }
common = { version = "0.7", path = "./common/", package = "tantivy-common" }
tokenizer-api = { version = "0.3", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
sketches-ddsketch = { version = "0.2.1", features = ["use_serde"] }
sketches-ddsketch = { version = "0.3.0", features = ["use_serde"] }
futures-util = { version = "0.3.28", optional = true }
fnv = "1.0.7"
@@ -72,7 +71,7 @@ fnv = "1.0.7"
winapi = "0.3.9"
[dev-dependencies]
binggan = "0.6.2"
binggan = "0.8.0"
rand = "0.8.5"
maplit = "1.0.2"
matches = "0.1.9"

View File

@@ -47,6 +47,7 @@ fn bench_agg(mut group: InputGroup<Index>) {
register!(group, average_f64);
register!(group, average_f64_u64);
register!(group, stats_f64);
register!(group, extendedstats_f64);
register!(group, percentiles_f64);
register!(group, terms_few);
register!(group, terms_many);
@@ -105,7 +106,12 @@ fn stats_f64(index: &Index) {
});
exec_term_with_agg(index, agg_req)
}
fn extendedstats_f64(index: &Index) {
let agg_req = json!({
"extendedstats_f64": { "extended_stats": { "field": "score_f64", } }
});
exec_term_with_agg(index, agg_req)
}
fn percentiles_f64(index: &Index) {
let agg_req = json!({
"mypercentiles": {
@@ -349,7 +355,7 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap();
let many_terms_data = (0..150_000)
.map(|num| format!("author{}", num))
.map(|num| format!("author{num}"))
.collect::<Vec<_>>();
{
let mut rng = StdRng::from_seed([1u8; 32]);

View File

@@ -18,7 +18,7 @@ fn benchmark(
benchmark_dynamic_json(b, input, schema, commit, parse_json)
} else {
_benchmark(b, input, schema, commit, parse_json, |schema, doc_json| {
TantivyDocument::parse_json(&schema, doc_json).unwrap()
TantivyDocument::parse_json(schema, doc_json).unwrap()
})
}
}
@@ -90,8 +90,7 @@ fn benchmark_dynamic_json(
) {
let json_field = schema.get_field("json").unwrap();
_benchmark(b, input, schema, commit, parse_json, |_schema, doc_json| {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let json_val: serde_json::Value = serde_json::from_str(doc_json).unwrap();
tantivy::doc!(json_field=>json_val)
})
}
@@ -138,15 +137,16 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
for (prefix, schema, is_dynamic) in benches {
for commit in [false, true] {
let suffix = if commit { "with-commit" } else { "no-commit" };
for parse_json in [false] {
{
let parse_json = false;
// for parse_json in [false, true] {
let suffix = if parse_json {
format!("{}-with-json-parsing", suffix)
format!("{suffix}-with-json-parsing")
} else {
format!("{}", suffix)
suffix.to_string()
};
let bench_name = format!("{}{}", prefix, suffix);
let bench_name = format!("{prefix}{suffix}");
group.bench_function(bench_name, |b| {
benchmark(b, HDFS_LOGS, schema.clone(), commit, parse_json, is_dynamic)
});

View File

@@ -9,7 +9,7 @@ description = "column oriented storage for tantivy"
categories = ["database-implementations", "data-structures", "compression"]
[dependencies]
itertools = "0.12.0"
itertools = "0.13.0"
fastdivide = "0.4.0"
stacker = { version= "0.3", path = "../stacker", package="tantivy-stacker"}
@@ -23,6 +23,12 @@ downcast-rs = "1.2.0"
proptest = "1"
more-asserts = "0.3.1"
rand = "0.8"
binggan = "0.8.1"
[[bench]]
name = "bench_merge"
harness = false
[features]
unstable = []

View File

@@ -0,0 +1,97 @@
#![feature(test)]
extern crate test;
use core::fmt;
use std::fmt::{Display, Formatter};
use binggan::{black_box, BenchRunner};
use tantivy_columnar::*;
enum Card {
Multi,
Sparse,
Dense,
}
impl Display for Card {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
match self {
Card::Multi => write!(f, "multi"),
Card::Sparse => write!(f, "sparse"),
Card::Dense => write!(f, "dense"),
}
}
}
const NUM_DOCS: u32 = 1_000_000;
fn generate_columnar(card: Card, num_docs: u32) -> ColumnarReader {
use tantivy_columnar::ColumnarWriter;
let mut columnar_writer = ColumnarWriter::default();
match card {
Card::Multi => {
columnar_writer.record_numerical(0, "price", 10u64);
columnar_writer.record_numerical(0, "price", 10u64);
}
_ => {}
}
for i in 0..num_docs {
match card {
Card::Multi | Card::Sparse => {
if i % 8 == 0 {
columnar_writer.record_numerical(i, "price", i as u64);
}
}
Card::Dense => {
if i % 6 == 0 {
columnar_writer.record_numerical(i, "price", i as u64);
}
}
}
}
let mut wrt: Vec<u8> = Vec::new();
columnar_writer.serialize(num_docs, None, &mut wrt).unwrap();
ColumnarReader::open(wrt).unwrap()
}
fn main() {
let mut inputs = Vec::new();
let mut add_combo = |card1: Card, card2: Card| {
inputs.push((
format!("merge_{card1}_and_{card2}"),
vec![
generate_columnar(card1, NUM_DOCS),
generate_columnar(card2, NUM_DOCS),
],
));
};
add_combo(Card::Multi, Card::Multi);
add_combo(Card::Dense, Card::Dense);
add_combo(Card::Sparse, Card::Sparse);
add_combo(Card::Sparse, Card::Dense);
add_combo(Card::Multi, Card::Dense);
add_combo(Card::Multi, Card::Sparse);
let runner: BenchRunner = BenchRunner::new();
let mut group = runner.new_group();
for (input_name, columnar_readers) in inputs.iter() {
group.register_with_input(
input_name,
columnar_readers,
move |columnar_readers: &Vec<ColumnarReader>| {
let mut out = vec![];
let columnar_readers = columnar_readers.iter().collect::<Vec<_>>();
let merge_row_order = StackMergeOrder::stack(&columnar_readers[..]);
merge_columnar(&columnar_readers, &[], merge_row_order.into(), &mut out).unwrap();
black_box(out);
},
);
}
group.run();
}

View File

@@ -73,14 +73,18 @@ fn detect_cardinality(
pub fn merge_column_index<'a>(
columns: &'a [ColumnIndex],
merge_row_order: &'a MergeRowOrder,
num_values: u32,
) -> SerializableColumnIndex<'a> {
// For simplification, we do not try to detect whether the cardinality could be
// downgraded thanks to deletes.
let cardinality_after_merge = detect_cardinality(columns, merge_row_order);
match merge_row_order {
MergeRowOrder::Stack(stack_merge_order) => {
merge_column_index_stacked(columns, cardinality_after_merge, stack_merge_order)
}
MergeRowOrder::Stack(stack_merge_order) => merge_column_index_stacked(
columns,
cardinality_after_merge,
stack_merge_order,
num_values,
),
MergeRowOrder::Shuffled(complex_merge_order) => {
merge_column_index_shuffled(columns, cardinality_after_merge, complex_merge_order)
}
@@ -167,8 +171,12 @@ mod tests {
],
)
.into();
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order, 3);
let SerializableColumnIndex::Multivalued {
indices: start_index_iterable,
..
} = merged_column_index
else {
panic!("Excpected a multivalued index")
};
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
@@ -200,8 +208,12 @@ mod tests {
],
)
.into();
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order, 6);
let SerializableColumnIndex::Multivalued {
indices: start_index_iterable,
..
} = merged_column_index
else {
panic!("Excpected a multivalued index")
};
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();

View File

@@ -22,7 +22,10 @@ pub fn merge_column_index_shuffled<'a>(
Cardinality::Multivalued => {
let multivalue_start_index =
merge_column_index_shuffled_multivalued(column_indexes, shuffle_merge_order);
SerializableColumnIndex::Multivalued(multivalue_start_index)
SerializableColumnIndex::Multivalued {
indices: multivalue_start_index,
stats: None,
}
}
}
}

View File

@@ -1,6 +1,8 @@
use std::iter;
use std::num::NonZeroU64;
use crate::column_index::{SerializableColumnIndex, Set};
use crate::column_values::ColumnStats;
use crate::iterable::Iterable;
use crate::{Cardinality, ColumnIndex, RowId, StackMergeOrder};
@@ -12,6 +14,7 @@ pub fn merge_column_index_stacked<'a>(
columns: &'a [ColumnIndex],
cardinality_after_merge: Cardinality,
stack_merge_order: &'a StackMergeOrder,
num_values: u32,
) -> SerializableColumnIndex<'a> {
match cardinality_after_merge {
Cardinality::Full => SerializableColumnIndex::Full,
@@ -27,7 +30,17 @@ pub fn merge_column_index_stacked<'a>(
columns,
stack_merge_order,
};
SerializableColumnIndex::Multivalued(Box::new(stacked_multivalued_index))
SerializableColumnIndex::Multivalued {
indices: Box::new(stacked_multivalued_index),
stats: Some(ColumnStats {
gcd: NonZeroU64::new(1).unwrap(),
// The values in the multivalue index are the positions of the values
min_value: 0,
max_value: num_values as u64,
// This is num docs, but it starts at 0 so we need +1
num_rows: stack_merge_order.num_rows() + 1,
}),
}
}
}
}

View File

@@ -6,20 +6,29 @@ use std::sync::Arc;
use common::OwnedBytes;
use crate::column_values::{
load_u64_based_column_values, serialize_u64_based_column_values, CodecType, ColumnValues,
load_u64_based_column_values, serialize_u64_based_column_values,
serialize_u64_with_codec_and_stats, CodecType, ColumnStats, ColumnValues,
};
use crate::iterable::Iterable;
use crate::{DocId, RowId};
pub fn serialize_multivalued_index(
multivalued_index: &dyn Iterable<RowId>,
stats: Option<ColumnStats>,
output: &mut impl Write,
) -> io::Result<()> {
serialize_u64_based_column_values(
multivalued_index,
&[CodecType::Bitpacked, CodecType::Linear],
output,
)?;
if let Some(stats) = stats {
// TODO: Add something with higher compression that doesn't require a full scan upfront
let estimator = CodecType::Bitpacked.estimator();
assert!(!estimator.requires_full_scan());
serialize_u64_with_codec_and_stats(multivalued_index, estimator, stats, output)?;
} else {
serialize_u64_based_column_values(
multivalued_index,
&[CodecType::Bitpacked, CodecType::Linear],
output,
)?;
}
Ok(())
}
@@ -52,7 +61,7 @@ impl From<Arc<dyn ColumnValues<RowId>>> for MultiValueIndex {
impl MultiValueIndex {
pub fn for_test(start_offsets: &[RowId]) -> MultiValueIndex {
let mut buffer = Vec::new();
serialize_multivalued_index(&start_offsets, &mut buffer).unwrap();
serialize_multivalued_index(&start_offsets, None, &mut buffer).unwrap();
let bytes = OwnedBytes::new(buffer);
open_multivalued_index(bytes).unwrap()
}

View File

@@ -196,6 +196,7 @@ impl Set<RowId> for OptionalIndex {
} = row_addr_from_row_id(doc_id);
let block_meta = self.block_metas[block_id as usize];
let block = self.block(block_meta);
let block_offset_row_id = match block {
Block::Dense(dense_block) => dense_block.rank(in_block_row_id),
Block::Sparse(sparse_block) => sparse_block.rank(in_block_row_id),

View File

@@ -6,6 +6,7 @@ use common::{CountingWriter, OwnedBytes};
use crate::column_index::multivalued_index::serialize_multivalued_index;
use crate::column_index::optional_index::serialize_optional_index;
use crate::column_index::ColumnIndex;
use crate::column_values::ColumnStats;
use crate::iterable::Iterable;
use crate::{Cardinality, RowId};
@@ -15,9 +16,12 @@ pub enum SerializableColumnIndex<'a> {
non_null_row_ids: Box<dyn Iterable<RowId> + 'a>,
num_rows: RowId,
},
// TODO remove the Arc<dyn> apart from serialization this is not
// dynamic at all.
Multivalued(Box<dyn Iterable<RowId> + 'a>),
Multivalued {
/// Iterator emitting the indices for the index
indices: Box<dyn Iterable<RowId> + 'a>,
/// In the merge case we can precompute the column stats
stats: Option<ColumnStats>,
},
}
impl<'a> SerializableColumnIndex<'a> {
@@ -25,7 +29,7 @@ impl<'a> SerializableColumnIndex<'a> {
match self {
SerializableColumnIndex::Full => Cardinality::Full,
SerializableColumnIndex::Optional { .. } => Cardinality::Optional,
SerializableColumnIndex::Multivalued(_) => Cardinality::Multivalued,
SerializableColumnIndex::Multivalued { .. } => Cardinality::Multivalued,
}
}
}
@@ -44,9 +48,10 @@ pub fn serialize_column_index(
non_null_row_ids,
num_rows,
} => serialize_optional_index(non_null_row_ids.as_ref(), num_rows, &mut output)?,
SerializableColumnIndex::Multivalued(multivalued_index) => {
serialize_multivalued_index(&*multivalued_index, &mut output)?
}
SerializableColumnIndex::Multivalued {
indices: multivalued_index,
stats,
} => serialize_multivalued_index(&*multivalued_index, stats, &mut output)?,
}
let column_index_num_bytes = output.written_bytes() as u32;
Ok(column_index_num_bytes)

View File

@@ -32,7 +32,8 @@ pub use u128_based::{
};
pub use u64_based::{
load_u64_based_column_values, serialize_and_load_u64_based_column_values,
serialize_u64_based_column_values, CodecType, ALL_U64_CODEC_TYPES,
serialize_u64_based_column_values, serialize_u64_with_codec_and_stats, CodecType,
ALL_U64_CODEC_TYPES,
};
pub use vec_column::VecColumn;

View File

@@ -128,6 +128,9 @@ impl ColumnCodecEstimator for BitpackedCodecEstimator {
bit_packer.close(wrt)?;
Ok(())
}
fn codec_type(&self) -> super::CodecType {
super::CodecType::Bitpacked
}
}
pub struct BitpackedCodec;

View File

@@ -163,6 +163,10 @@ impl ColumnCodecEstimator for BlockwiseLinearEstimator {
Ok(())
}
fn codec_type(&self) -> super::CodecType {
super::CodecType::BlockwiseLinear
}
}
pub struct BlockwiseLinearCodec;

View File

@@ -153,6 +153,12 @@ impl ColumnCodecEstimator for LinearCodecEstimator {
self.collect_before_line_estimation(value);
}
}
fn requires_full_scan(&self) -> bool {
true
}
fn codec_type(&self) -> super::CodecType {
super::CodecType::Linear
}
}
impl LinearCodecEstimator {

View File

@@ -37,7 +37,11 @@ pub trait ColumnCodecEstimator<T = u64>: 'static {
/// This method will be called for each element of the column during
/// `estimation`.
fn collect(&mut self, value: u64);
/// Finalizes the first pass phase.
/// Returns true if the estimator needs a full pass over the column before serialization
fn requires_full_scan(&self) -> bool {
false
}
fn codec_type(&self) -> CodecType;
fn finalize(&mut self) {}
/// Returns an accurate estimation of the number of bytes that will
/// be used to represent this column.
@@ -150,34 +154,45 @@ pub fn serialize_u64_based_column_values<T: MonotonicallyMappableToU64>(
wrt: &mut dyn Write,
) -> io::Result<()> {
let mut stats_collector = StatsCollector::default();
let mut estimators: Vec<(CodecType, Box<dyn ColumnCodecEstimator>)> =
Vec::with_capacity(codec_types.len());
let mut estimators: Vec<Box<dyn ColumnCodecEstimator>> = Vec::with_capacity(codec_types.len());
for &codec_type in codec_types {
estimators.push((codec_type, codec_type.estimator()));
estimators.push(codec_type.estimator());
}
for val in vals.boxed_iter() {
let val_u64 = val.to_u64();
stats_collector.collect(val_u64);
for (_, estimator) in &mut estimators {
for estimator in &mut estimators {
estimator.collect(val_u64);
}
}
for (_, estimator) in &mut estimators {
for estimator in &mut estimators {
estimator.finalize();
}
let stats = stats_collector.stats();
let (_, best_codec, best_codec_estimator) = estimators
let (_, best_codec) = estimators
.into_iter()
.flat_map(|(codec_type, estimator)| {
.flat_map(|estimator| {
let num_bytes = estimator.estimate(&stats)?;
Some((num_bytes, codec_type, estimator))
Some((num_bytes, estimator))
})
.min_by_key(|(num_bytes, _, _)| *num_bytes)
.min_by_key(|(num_bytes, _)| *num_bytes)
.ok_or_else(|| {
io::Error::new(io::ErrorKind::InvalidData, "No available applicable codec.")
})?;
best_codec.to_code().serialize(wrt)?;
best_codec_estimator.serialize(
serialize_u64_with_codec_and_stats(vals, best_codec, stats, wrt)?;
Ok(())
}
/// Serializes a given column of u64-mapped values.
/// The codec estimator needs to be collected fully for the Line codec before calling this.
pub fn serialize_u64_with_codec_and_stats<T: MonotonicallyMappableToU64>(
vals: &dyn Iterable<T>,
codec: Box<dyn ColumnCodecEstimator>,
stats: ColumnStats,
wrt: &mut dyn Write,
) -> io::Result<()> {
codec.codec_type().to_code().serialize(wrt)?;
codec.serialize(
&stats,
&mut vals.boxed_iter().map(MonotonicallyMappableToU64::to_u64),
wrt,

View File

@@ -3,7 +3,7 @@ mod merge_mapping;
mod term_merger;
use std::collections::{BTreeMap, HashSet};
use std::io;
use std::io::{self};
use std::net::Ipv6Addr;
use std::sync::Arc;
@@ -156,8 +156,15 @@ fn merge_column(
column_values.push(None);
}
}
let merged_column_index =
crate::column_index::merge_column_index(&column_indexes[..], merge_row_order);
let num_values: u32 = column_values
.iter()
.map(|vals| vals.as_ref().map(|idx| idx.num_vals()).unwrap_or(0))
.sum();
let merged_column_index = crate::column_index::merge_column_index(
&column_indexes[..],
merge_row_order,
num_values,
);
let merge_column_values = MergedColumnValues {
column_indexes: &column_indexes[..],
column_values: &column_values[..],
@@ -183,8 +190,15 @@ fn merge_column(
}
}
let merged_column_index =
crate::column_index::merge_column_index(&column_indexes[..], merge_row_order);
let num_values: u32 = column_values
.iter()
.map(|vals| vals.as_ref().map(|idx| idx.num_vals()).unwrap_or(0))
.sum();
let merged_column_index = crate::column_index::merge_column_index(
&column_indexes[..],
merge_row_order,
num_values,
);
let merge_column_values = MergedColumnValues {
column_indexes: &column_indexes[..],
column_values: &column_values,
@@ -214,8 +228,19 @@ fn merge_column(
}
}
}
let merged_column_index =
crate::column_index::merge_column_index(&column_indexes[..], merge_row_order);
let num_values: u32 = bytes_columns
.iter()
.map(|vals| {
vals.as_ref()
.map(|idx| idx.term_ord_column.values.num_vals())
.unwrap_or(0)
})
.sum();
let merged_column_index = crate::column_index::merge_column_index(
&column_indexes[..],
merge_row_order,
num_values,
);
merge_bytes_or_str_column(merged_column_index, &bytes_columns, merge_row_order, wrt)?;
}
}

View File

@@ -644,7 +644,10 @@ fn send_to_serialize_column_mappable_to_u128<
let multivalued_index_builder = value_index_builders.borrow_multivalued_index_builder();
consume_operation_iterator(op_iterator, multivalued_index_builder, values);
let multivalued_index = multivalued_index_builder.finish(num_rows);
SerializableColumnIndex::Multivalued(Box::new(multivalued_index))
SerializableColumnIndex::Multivalued {
indices: Box::new(multivalued_index),
stats: Default::default(), // TODO: implement stats for u128
}
}
};
crate::column::serialize_column_mappable_to_u128(
@@ -699,7 +702,10 @@ fn send_to_serialize_column_mappable_to_u64(
if sort_values_within_row {
sort_values_within_row_in_place(multivalued_index, values);
}
SerializableColumnIndex::Multivalued(Box::new(multivalued_index))
SerializableColumnIndex::Multivalued {
indices: Box::new(multivalued_index),
stats: None,
}
}
};
crate::column::serialize_column_mappable_to_u64(

View File

@@ -738,35 +738,22 @@ proptest! {
#![proptest_config(ProptestConfig::with_cases(1000))]
#[test]
fn test_columnar_merge_proptest(columnar_docs in proptest::collection::vec(columnar_docs_strategy(), 2..=3)) {
let columnar_readers: Vec<ColumnarReader> = columnar_docs.iter()
.map(|docs| build_columnar(&docs[..]))
.collect::<Vec<_>>();
let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
let mut output: Vec<u8> = Vec::new();
let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]).into();
crate::merge_columnar(&columnar_readers_arr[..], &[], stack_merge_order, &mut output).unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> = columnar_docs.iter().flatten().cloned().collect();
let expected_merged_columnar = build_columnar(&concat_rows[..]);
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
test_columnar_docs(columnar_docs);
}
}
#[test]
fn test_columnar_merging_empty_columnar() {
let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> =
vec![vec![], vec![vec![("c1", ColumnValue::Str("a"))]]];
fn test_columnar_docs(columnar_docs: Vec<Vec<Vec<(&'static str, ColumnValue)>>>) {
let columnar_readers: Vec<ColumnarReader> = columnar_docs
.iter()
.map(|docs| build_columnar(&docs[..]))
.collect::<Vec<_>>();
let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
let mut output: Vec<u8> = Vec::new();
let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]);
let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]).into();
crate::merge_columnar(
&columnar_readers_arr[..],
&[],
crate::MergeRowOrder::Stack(stack_merge_order),
stack_merge_order,
&mut output,
)
.unwrap();
@@ -777,6 +764,24 @@ fn test_columnar_merging_empty_columnar() {
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
}
#[test]
fn test_columnar_merging_empty_columnar() {
let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> =
vec![vec![], vec![vec![("c1", ColumnValue::Str("a"))]]];
test_columnar_docs(columnar_docs);
}
#[test]
fn test_columnar_merging_simple() {
let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> = vec![
vec![],
vec![vec![
("c1", ColumnValue::Numerical(0u64.into())),
("c1", ColumnValue::Numerical(0u64.into())),
]],
];
test_columnar_docs(columnar_docs);
}
#[test]
fn test_columnar_merging_number_columns() {
let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> = vec![
@@ -793,25 +798,7 @@ fn test_columnar_merging_number_columns() {
vec![("c2", ColumnValue::Numerical(u64::MAX.into()))],
],
];
let columnar_readers: Vec<ColumnarReader> = columnar_docs
.iter()
.map(|docs| build_columnar(&docs[..]))
.collect::<Vec<_>>();
let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
let mut output: Vec<u8> = Vec::new();
let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]);
crate::merge_columnar(
&columnar_readers_arr[..],
&[],
crate::MergeRowOrder::Stack(stack_merge_order),
&mut output,
)
.unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> =
columnar_docs.iter().flatten().cloned().collect();
let expected_merged_columnar = build_columnar(&concat_rows[..]);
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
test_columnar_docs(columnar_docs);
}
// TODO add non trivial remap and merge

View File

@@ -22,3 +22,6 @@ serde = { version = "1.0.136", features = ["derive"] }
[dev-dependencies]
proptest = "1.0.0"
rand = "0.8.4"
[features]
unstable = [] # useful for benches.

View File

@@ -151,7 +151,7 @@ pub fn read_u32_vint_no_advance(data: &[u8]) -> (u32, usize) {
(result, vlen)
}
/// Write a `u32` as a vint payload.
pub fn write_u32_vint<W: io::Write>(val: u32, writer: &mut W) -> io::Result<()> {
pub fn write_u32_vint<W: io::Write + ?Sized>(val: u32, writer: &mut W) -> io::Result<()> {
let mut buf = [0u8; 8];
let data = serialize_vint_u32(val, &mut buf);
writer.write_all(data)

View File

@@ -4,7 +4,7 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{DateOptions, Document, OwnedValue, Schema, INDEXED, STORED, STRING};
use tantivy::schema::{DateOptions, Document, Schema, Value, INDEXED, STORED, STRING};
use tantivy::{Index, IndexWriter, TantivyDocument};
fn main() -> tantivy::Result<()> {
@@ -61,10 +61,12 @@ fn main() -> tantivy::Result<()> {
assert_eq!(count_docs.len(), 1);
for (_score, doc_address) in count_docs {
let retrieved_doc = searcher.doc::<TantivyDocument>(doc_address)?;
assert!(matches!(
retrieved_doc.get_first(occurred_at),
Some(OwnedValue::Date(_))
));
assert!(retrieved_doc
.get_first(occurred_at)
.unwrap()
.as_value()
.as_datetime()
.is_some(),);
assert_eq!(
retrieved_doc.to_json(&schema),
r#"{"event":["comment"],"occurred_at":["2022-06-22T13:00:00.22Z"]}"#

View File

@@ -51,7 +51,7 @@ fn main() -> tantivy::Result<()> {
let reader = index.reader()?;
let searcher = reader.searcher();
{
let facets = vec![
let facets = [
Facet::from("/ingredient/egg"),
Facet::from("/ingredient/oil"),
Facet::from("/ingredient/garlic"),
@@ -94,9 +94,8 @@ fn main() -> tantivy::Result<()> {
.doc::<TantivyDocument>(*doc_id)
.unwrap()
.get_first(title)
.and_then(|v| v.as_str())
.and_then(|v| v.as_str().map(|el| el.to_string()))
.unwrap()
.to_owned()
})
.collect();
assert_eq!(titles, vec!["Fried egg", "Egg rolls"]);

View File

@@ -61,7 +61,7 @@ fn main() -> tantivy::Result<()> {
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
))?;
println!("add doc {} from thread 1 - opstamp {}", i, opstamp);
println!("add doc {i} from thread 1 - opstamp {opstamp}");
thread::sleep(Duration::from_millis(20));
}
Result::<(), TantivyError>::Ok(())
@@ -82,7 +82,7 @@ fn main() -> tantivy::Result<()> {
body => "Some great book description..."
))?
};
println!("add doc {} from thread 2 - opstamp {}", i, opstamp);
println!("add doc {i} from thread 2 - opstamp {opstamp}");
thread::sleep(Duration::from_millis(10));
}
Result::<(), TantivyError>::Ok(())

View File

@@ -1,3 +1,4 @@
use std::borrow::Cow;
use std::iter::once;
use nom::branch::alt;
@@ -19,7 +20,7 @@ use crate::Occur;
// Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to
// special characters.
const SPECIAL_CHARS: &[char] = &[
'+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '!', '\\', '*', ' ',
'+', '^', '`', ':', '{', '}', '"', '\'', '[', ']', '(', ')', '!', '\\', '*', ' ',
];
/// consume a field name followed by colon. Return the field name with escape sequence
@@ -41,36 +42,92 @@ fn field_name(inp: &str) -> IResult<&str, String> {
)(inp)
}
const ESCAPE_IN_WORD: &[char] = &['^', '`', ':', '{', '}', '"', '\'', '[', ']', '(', ')', '\\'];
fn interpret_escape(source: &str) -> String {
let mut res = String::with_capacity(source.len());
let mut in_escape = false;
let require_escape = |c: char| c.is_whitespace() || ESCAPE_IN_WORD.contains(&c) || c == '-';
for c in source.chars() {
if in_escape {
if !require_escape(c) {
// we re-add the escape sequence
res.push('\\');
}
res.push(c);
in_escape = false;
} else if c == '\\' {
in_escape = true;
} else {
res.push(c);
}
}
res
}
/// Consume a word outside of any context.
// TODO should support escape sequences
fn word(inp: &str) -> IResult<&str, &str> {
fn word(inp: &str) -> IResult<&str, Cow<str>> {
map_res(
recognize(tuple((
satisfy(|c| {
!c.is_whitespace()
&& !['-', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')'].contains(&c)
}),
many0(satisfy(|c: char| {
!c.is_whitespace() && ![':', '^', '{', '}', '"', '[', ']', '(', ')'].contains(&c)
})),
alt((
preceded(char('\\'), anychar),
satisfy(|c| !c.is_whitespace() && !ESCAPE_IN_WORD.contains(&c) && c != '-'),
)),
many0(alt((
preceded(char('\\'), anychar),
satisfy(|c: char| !c.is_whitespace() && !ESCAPE_IN_WORD.contains(&c)),
))),
))),
|s| match s {
"OR" | "AND" | "NOT" | "IN" => Err(Error::new(inp, ErrorKind::Tag)),
_ => Ok(s),
s if s.contains('\\') => Ok(Cow::Owned(interpret_escape(s))),
s => Ok(Cow::Borrowed(s)),
},
)(inp)
}
fn word_infallible(delimiter: &str) -> impl Fn(&str) -> JResult<&str, Option<&str>> + '_ {
|inp| {
opt_i_err(
preceded(
multispace0,
recognize(many1(satisfy(|c| {
!c.is_whitespace() && !delimiter.contains(c)
}))),
fn word_infallible(
delimiter: &str,
emit_error: bool,
) -> impl Fn(&str) -> JResult<&str, Option<Cow<str>>> + '_ {
// `emit_error` is set in contexts where an unescaped `:` should emit an error
move |inp| {
map(
opt_i_err(
preceded(
multispace0,
recognize(many1(alt((
preceded(char::<&str, _>('\\'), anychar),
satisfy(|c| !c.is_whitespace() && !delimiter.contains(c)),
)))),
),
"expected word",
),
"expected word",
|(opt_s, mut errors)| match opt_s {
Some(s) => {
if emit_error
&& (s
.as_bytes()
.windows(2)
.any(|window| window[0] != b'\\' && window[1] == b':')
|| s.starts_with(':'))
{
errors.push(LenientErrorInternal {
pos: inp.len(),
message: "parsed possible invalid field as term".to_string(),
});
}
if s.contains('\\') {
(Some(Cow::Owned(interpret_escape(s))), errors)
} else {
(Some(Cow::Borrowed(s)), errors)
}
}
None => (None, errors),
},
)(inp)
}
}
@@ -159,7 +216,7 @@ fn simple_term_infallible(
(value((), char('\'')), simple_quotes),
),
// numbers are parsed with words in this case, as we allow string starting with a -
map(word_infallible(delimiter), |(text, errors)| {
map(word_infallible(delimiter, true), |(text, errors)| {
(text.map(|text| (Delimiter::None, text.to_string())), errors)
}),
)(inp)
@@ -322,15 +379,6 @@ fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>>
|((field_name, _, leaf), mut errors)| {
(
leaf.map(|leaf| {
if matches!(&leaf, UserInputLeaf::Literal(literal)
if literal.phrase.contains(':') && literal.delimiter == Delimiter::None)
&& field_name.is_none()
{
errors.push(LenientErrorInternal {
pos: inp.len(),
message: "parsed possible invalid field as term".to_string(),
});
}
if matches!(&leaf, UserInputLeaf::Literal(literal)
if literal.phrase == "NOT" && literal.delimiter == Delimiter::None)
&& field_name.is_none()
@@ -449,20 +497,20 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
tuple_infallible((
opt_i(anychar),
space0_infallible,
word_infallible("]}"),
word_infallible("]}", false),
space1_infallible,
opt_i_err(
terminated(tag("TO"), alt((value((), multispace1), value((), eof)))),
"missing keyword TO",
),
word_infallible("]}"),
word_infallible("]}", false),
opt_i_err(one_of("]}"), "missing range delimiter"),
)),
|(
(lower_bound_kind, _multispace0, lower, _multispace1, to, upper, upper_bound_kind),
errs,
)| {
let lower_bound = match (lower_bound_kind, lower) {
let lower_bound = match (lower_bound_kind, lower.as_deref()) {
(_, Some("*")) => UserInputBound::Unbounded,
(_, None) => UserInputBound::Unbounded,
// if it is some, TO was actually the bound (i.e. [TO TO something])
@@ -471,7 +519,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
(Some('{'), Some(bound)) => UserInputBound::Exclusive(bound.to_string()),
_ => unreachable!("precondition failed, range did not start with [ or {{"),
};
let upper_bound = match (upper_bound_kind, upper) {
let upper_bound = match (upper_bound_kind, upper.as_deref()) {
(_, Some("*")) => UserInputBound::Unbounded,
(_, None) => UserInputBound::Unbounded,
(Some(']'), Some(bound)) => UserInputBound::Inclusive(bound.to_string()),
@@ -488,7 +536,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
(
(
value((), tag(">=")),
map(word_infallible(""), |(bound, err)| {
map(word_infallible("", false), |(bound, err)| {
(
(
bound
@@ -502,7 +550,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
),
(
value((), tag("<=")),
map(word_infallible(""), |(bound, err)| {
map(word_infallible("", false), |(bound, err)| {
(
(
UserInputBound::Unbounded,
@@ -516,7 +564,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
),
(
value((), tag(">")),
map(word_infallible(""), |(bound, err)| {
map(word_infallible("", false), |(bound, err)| {
(
(
bound
@@ -530,7 +578,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
),
(
value((), tag("<")),
map(word_infallible(""), |(bound, err)| {
map(word_infallible("", false), |(bound, err)| {
(
(
UserInputBound::Unbounded,
@@ -1157,6 +1205,12 @@ mod test {
test_parse_query_to_ast_helper("weight: <= 70", "\"weight\":{\"*\" TO \"70\"]");
test_parse_query_to_ast_helper("weight: <= 70.5", "\"weight\":{\"*\" TO \"70.5\"]");
test_parse_query_to_ast_helper(">a", "{\"a\" TO \"*\"}");
test_parse_query_to_ast_helper(">=a", "[\"a\" TO \"*\"}");
test_parse_query_to_ast_helper("<a", "{\"*\" TO \"a\"}");
test_parse_query_to_ast_helper("<=a", "{\"*\" TO \"a\"]");
test_parse_query_to_ast_helper("<=bsd", "{\"*\" TO \"bsd\"]");
}
#[test]
@@ -1590,5 +1644,21 @@ mod test {
r#"myfield:'hello\"happy\'tax'"#,
r#""myfield":'hello"happy'tax'"#,
);
// we don't process escape sequence for chars which don't require it
test_parse_query_to_ast_helper(r#"abc\*"#, r#"abc\*"#);
}
#[test]
fn test_queries_with_colons() {
test_parse_query_to_ast_helper(r#""abc:def""#, r#""abc:def""#);
test_parse_query_to_ast_helper(r#"'abc:def'"#, r#"'abc:def'"#);
test_parse_query_to_ast_helper(r#"abc\:def"#, r#"abc:def"#);
test_parse_query_to_ast_helper(r#""abc\:def""#, r#""abc:def""#);
test_parse_query_to_ast_helper(r#"'abc\:def'"#, r#"'abc:def'"#);
}
#[test]
fn test_invalid_field() {
test_is_parse_err(r#"!bc:def"#, "!bc:def");
}
}

View File

@@ -34,7 +34,7 @@ use super::bucket::{
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
};
use super::metric::{
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation,
AverageAggregation, CountAggregation, ExtendedStatsAggregation, MaxAggregation, MinAggregation,
PercentilesAggregationReq, StatsAggregation, SumAggregation, TopHitsAggregation,
};
@@ -146,6 +146,11 @@ pub enum AggregationVariants {
/// extracted values.
#[serde(rename = "stats")]
Stats(StatsAggregation),
/// Computes a collection of extended statistics (`min`, `max`, `sum`, `count`, `avg`,
/// `sum_of_squares`, `variance`, `variance_sampling`, `std_deviation`,
/// `std_deviation_sampling`) over the extracted values.
#[serde(rename = "extended_stats")]
ExtendedStats(ExtendedStatsAggregation),
/// Computes the sum of the extracted values.
#[serde(rename = "sum")]
Sum(SumAggregation),
@@ -170,6 +175,7 @@ impl AggregationVariants {
AggregationVariants::Max(max) => vec![max.field_name()],
AggregationVariants::Min(min) => vec![min.field_name()],
AggregationVariants::Stats(stats) => vec![stats.field_name()],
AggregationVariants::ExtendedStats(extended_stats) => vec![extended_stats.field_name()],
AggregationVariants::Sum(sum) => vec![sum.field_name()],
AggregationVariants::Percentiles(per) => vec![per.field_name()],
AggregationVariants::TopHits(top_hits) => top_hits.field_names(),
@@ -197,6 +203,12 @@ impl AggregationVariants {
_ => None,
}
}
pub(crate) fn as_top_hits(&self) -> Option<&TopHitsAggregation> {
match &self {
AggregationVariants::TopHits(top_hits) => Some(top_hits),
_ => None,
}
}
pub(crate) fn as_percentile(&self) -> Option<&PercentilesAggregationReq> {
match &self {

View File

@@ -11,8 +11,8 @@ use super::bucket::{
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
};
use super::metric::{
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation, StatsAggregation,
SumAggregation,
AverageAggregation, CountAggregation, ExtendedStatsAggregation, MaxAggregation, MinAggregation,
StatsAggregation, SumAggregation,
};
use super::segment_agg_result::AggregationLimits;
use super::VecWithNames;
@@ -276,6 +276,10 @@ impl AggregationWithAccessor {
field: ref field_name,
..
})
| ExtendedStats(ExtendedStatsAggregation {
field: ref field_name,
..
})
| Sum(SumAggregation {
field: ref field_name,
..
@@ -335,8 +339,8 @@ fn get_missing_val(
}
_ => {
return Err(crate::TantivyError::InvalidArgument(format!(
"Missing value {:?} for field {} is not supported for column type {:?}",
missing, field_name, column_type
"Missing value {missing:?} for field {field_name} is not supported for column \
type {column_type:?}"
)));
}
};
@@ -403,7 +407,7 @@ fn get_dynamic_columns(
.iter()
.map(|h| h.open())
.collect::<io::Result<_>>()?;
assert!(!ff_fields.is_empty(), "field {} not found", field_name);
assert!(!ff_fields.is_empty(), "field {field_name} not found");
Ok(cols)
}

View File

@@ -8,7 +8,9 @@ use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
use super::bucket::GetDocCount;
use super::metric::{PercentilesMetricResult, SingleMetricResult, Stats, TopHitsMetricResult};
use super::metric::{
ExtendedStats, PercentilesMetricResult, SingleMetricResult, Stats, TopHitsMetricResult,
};
use super::{AggregationError, Key};
use crate::TantivyError;
@@ -88,6 +90,8 @@ pub enum MetricResult {
Min(SingleMetricResult),
/// Stats metric result.
Stats(Stats),
/// ExtendedStats metric result.
ExtendedStats(Box<ExtendedStats>),
/// Sum metric result.
Sum(SingleMetricResult),
/// Percentiles metric result.
@@ -104,6 +108,7 @@ impl MetricResult {
MetricResult::Max(max) => Ok(max.value),
MetricResult::Min(min) => Ok(min.value),
MetricResult::Stats(stats) => stats.get_value(agg_property),
MetricResult::ExtendedStats(extended_stats) => extended_stats.get_value(agg_property),
MetricResult::Sum(sum) => Ok(sum.value),
MetricResult::Percentiles(_) => Err(TantivyError::AggregationError(
AggregationError::InvalidRequest("percentiles can't be used to order".to_string()),

View File

@@ -357,8 +357,7 @@ impl SegmentTermCollector {
) -> crate::Result<Self> {
if field_type == ColumnType::Bytes {
return Err(TantivyError::InvalidArgument(format!(
"terms aggregation is not supported for column type {:?}",
field_type
"terms aggregation is not supported for column type {field_type:?}"
)));
}
let term_buckets = TermBuckets::default();

View File

@@ -19,8 +19,8 @@ use super::bucket::{
GetDocCount, Order, OrderTarget, RangeAggregation, TermsAggregation,
};
use super::metric::{
IntermediateAverage, IntermediateCount, IntermediateMax, IntermediateMin, IntermediateStats,
IntermediateSum, PercentilesCollector, TopHitsTopNComputer,
IntermediateAverage, IntermediateCount, IntermediateExtendedStats, IntermediateMax,
IntermediateMin, IntermediateStats, IntermediateSum, PercentilesCollector, TopHitsTopNComputer,
};
use super::segment_agg_result::AggregationLimits;
use super::{format_date, AggregationError, Key, SerializedKey};
@@ -215,6 +215,9 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
Stats(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Stats(
IntermediateStats::default(),
)),
ExtendedStats(_) => IntermediateAggregationResult::Metric(
IntermediateMetricResult::ExtendedStats(IntermediateExtendedStats::default()),
),
Sum(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Sum(
IntermediateSum::default(),
)),
@@ -222,7 +225,7 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
IntermediateMetricResult::Percentiles(PercentilesCollector::default()),
),
TopHits(ref req) => IntermediateAggregationResult::Metric(
IntermediateMetricResult::TopHits(TopHitsTopNComputer::new(req.clone())),
IntermediateMetricResult::TopHits(TopHitsTopNComputer::new(req)),
),
}
}
@@ -282,6 +285,8 @@ pub enum IntermediateMetricResult {
Min(IntermediateMin),
/// Intermediate stats result.
Stats(IntermediateStats),
/// Intermediate stats result.
ExtendedStats(IntermediateExtendedStats),
/// Intermediate sum result.
Sum(IntermediateSum),
/// Intermediate top_hits result
@@ -306,6 +311,9 @@ impl IntermediateMetricResult {
IntermediateMetricResult::Stats(intermediate_stats) => {
MetricResult::Stats(intermediate_stats.finalize())
}
IntermediateMetricResult::ExtendedStats(intermediate_stats) => {
MetricResult::ExtendedStats(intermediate_stats.finalize())
}
IntermediateMetricResult::Sum(intermediate_sum) => {
MetricResult::Sum(intermediate_sum.finalize().into())
}
@@ -346,6 +354,12 @@ impl IntermediateMetricResult {
) => {
stats_left.merge_fruits(stats_right);
}
(
IntermediateMetricResult::ExtendedStats(extended_stats_left),
IntermediateMetricResult::ExtendedStats(extended_stats_right),
) => {
extended_stats_left.merge_fruits(extended_stats_right);
}
(IntermediateMetricResult::Sum(sum_left), IntermediateMetricResult::Sum(sum_right)) => {
sum_left.merge_fruits(sum_right);
}

File diff suppressed because it is too large

View File

@@ -18,6 +18,7 @@
mod average;
mod count;
mod extended_stats;
mod max;
mod min;
mod percentiles;
@@ -29,6 +30,7 @@ use std::collections::HashMap;
pub use average::*;
pub use count::*;
pub use extended_stats::*;
pub use max::*;
pub use min::*;
pub use percentiles::*;

View File

@@ -1,3 +1,5 @@
use std::fmt::Debug;
use serde::{Deserialize, Serialize};
use super::*;
@@ -85,13 +87,15 @@ impl Stats {
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct IntermediateStats {
/// The number of extracted values.
count: u64,
pub(crate) count: u64,
/// The sum of the extracted values.
sum: f64,
pub(crate) sum: f64,
/// delta for sum needed for [Kahan algorithm for summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm)
pub(crate) delta: f64,
/// The min value.
min: f64,
pub(crate) min: f64,
/// The max value.
max: f64,
pub(crate) max: f64,
}
impl Default for IntermediateStats {
@@ -99,6 +103,7 @@ impl Default for IntermediateStats {
Self {
count: 0,
sum: 0.0,
delta: 0.0,
min: f64::MAX,
max: f64::MIN,
}
@@ -109,7 +114,13 @@ impl IntermediateStats {
/// Merges the other stats intermediate result into self.
pub fn merge_fruits(&mut self, other: IntermediateStats) {
self.count += other.count;
self.sum += other.sum;
// kahan algorithm for sum
let y = other.sum - (self.delta + other.delta);
let t = self.sum + y;
self.delta = (t - self.sum) - y;
self.sum = t;
self.min = self.min.min(other.min);
self.max = self.max.max(other.max);
}
@@ -141,9 +152,15 @@ impl IntermediateStats {
}
#[inline]
fn collect(&mut self, value: f64) {
pub(in crate::aggregation::metric) fn collect(&mut self, value: f64) {
self.count += 1;
self.sum += value;
// kahan algorithm for sum
let y = value - self.delta;
let t = self.sum + y;
self.delta = (t - self.sum) - y;
self.sum = t;
self.min = self.min.min(value);
self.max = self.max.max(value);
}
@@ -288,7 +305,6 @@ impl SegmentAggregationCollector for SegmentStatsCollector {
#[cfg(test)]
mod tests {
use serde_json::Value;
use crate::aggregation::agg_req::{Aggregation, Aggregations};
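The hunks above replace the plain running sum in IntermediateStats with Kahan (compensated) summation. A standalone illustration, not taken from the change itself, of why the `delta` compensation term matters when many small values are added to a large sum:

```rust
fn main() {
    // Naive summation: adding 1.0 to 1e16 is lost entirely, because 1.0 is
    // below the precision of f64 at that magnitude.
    let mut naive = 1e16f64;
    // Kahan summation keeps a compensation term (`delta` in the hunks above)
    // that accumulates the low-order bits lost by each addition.
    let (mut sum, mut delta) = (1e16f64, 0.0f64);
    for _ in 0..10_000_000 {
        naive += 1.0;
        let y = 1.0 - delta;
        let t = sum + y;
        delta = (t - sum) - y;
        sum = t;
    }
    // The compensated sum recovers the ten million additions; the naive one does not.
    println!("naive: {naive}, kahan: {sum}");
}
```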

View File

@@ -1,7 +1,7 @@
use std::collections::HashMap;
use std::net::Ipv6Addr;
use columnar::{ColumnarReader, DynamicColumn};
use columnar::{Column, ColumnType, ColumnarReader, DynamicColumn};
use common::json_path_writer::JSON_PATH_SEGMENT_SEP_STR;
use common::DateTime;
use regex::Regex;
@@ -131,8 +131,8 @@ impl<'de> Deserialize<'de> for KeyOrder {
))?;
if key_order.next().is_some() {
return Err(serde::de::Error::custom(format!(
"Expected exactly one key-value pair in sort parameter of top_hits, found {:?}",
key_order
"Expected exactly one key-value pair in sort parameter of top_hits, found \
{key_order:?}"
)));
}
Ok(Self { field, order })
@@ -144,27 +144,22 @@ fn globbed_string_to_regex(glob: &str) -> Result<Regex, crate::TantivyError> {
// Replace `*` glob with `.*` regex
let sanitized = format!("^{}$", regex::escape(glob).replace(r"\*", ".*"));
Regex::new(&sanitized.replace('*', ".*")).map_err(|e| {
crate::TantivyError::SchemaError(format!(
"Invalid regex '{}' in docvalue_fields: {}",
glob, e
))
crate::TantivyError::SchemaError(format!("Invalid regex '{glob}' in docvalue_fields: {e}"))
})
}
fn use_doc_value_fields_err(parameter: &str) -> crate::Result<()> {
Err(crate::TantivyError::AggregationError(
AggregationError::InvalidRequest(format!(
"The `{}` parameter is not supported, only `docvalue_fields` is supported in \
`top_hits` aggregation",
parameter
"The `{parameter}` parameter is not supported, only `docvalue_fields` is supported in \
`top_hits` aggregation"
)),
))
}
fn unsupported_err(parameter: &str) -> crate::Result<()> {
Err(crate::TantivyError::AggregationError(
AggregationError::InvalidRequest(format!(
"The `{}` parameter is not supported in the `top_hits` aggregation",
parameter
"The `{parameter}` parameter is not supported in the `top_hits` aggregation"
)),
))
}
@@ -217,8 +212,7 @@ impl TopHitsAggregation {
.collect::<Vec<_>>();
assert!(
!fields.is_empty(),
"No fields matched the glob '{}' in docvalue_fields",
field
"No fields matched the glob '{field}' in docvalue_fields"
);
Ok(fields)
})
@@ -254,7 +248,7 @@ impl TopHitsAggregation {
.map(|field| {
let accessors = accessors
.get(field)
.unwrap_or_else(|| panic!("field '{}' not found in accessors", field));
.unwrap_or_else(|| panic!("field '{field}' not found in accessors"));
let values: Vec<FastFieldValue> = accessors
.iter()
@@ -449,10 +443,10 @@ impl std::cmp::PartialEq for TopHitsTopNComputer {
impl TopHitsTopNComputer {
/// Create a new TopHitsCollector
pub fn new(req: TopHitsAggregation) -> Self {
pub fn new(req: &TopHitsAggregation) -> Self {
Self {
top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
req,
req: req.clone(),
}
}
@@ -497,7 +491,6 @@ impl TopHitsTopNComputer {
pub(crate) struct TopHitsSegmentCollector {
segment_ordinal: SegmentOrdinal,
accessor_idx: usize,
req: TopHitsAggregation,
top_n: TopNComputer<Vec<DocValueAndOrder>, DocAddress, false>,
}
@@ -508,7 +501,6 @@ impl TopHitsSegmentCollector {
segment_ordinal: SegmentOrdinal,
) -> Self {
Self {
req: req.clone(),
top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
segment_ordinal,
accessor_idx,
@@ -517,14 +509,13 @@ impl TopHitsSegmentCollector {
fn into_top_hits_collector(
self,
value_accessors: &HashMap<String, Vec<DynamicColumn>>,
req: &TopHitsAggregation,
) -> TopHitsTopNComputer {
let mut top_hits_computer = TopHitsTopNComputer::new(self.req.clone());
let mut top_hits_computer = TopHitsTopNComputer::new(req);
let top_results = self.top_n.into_vec();
for res in top_results {
let doc_value_fields = self
.req
.get_document_field_data(value_accessors, res.doc.doc_id);
let doc_value_fields = req.get_document_field_data(value_accessors, res.doc.doc_id);
top_hits_computer.collect(
DocSortValuesAndFields {
sorts: res.feature,
@@ -536,34 +527,15 @@ impl TopHitsSegmentCollector {
top_hits_computer
}
}
impl SegmentAggregationCollector for TopHitsSegmentCollector {
fn add_intermediate_aggregation_result(
self: Box<Self>,
agg_with_accessor: &crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
results: &mut crate::aggregation::intermediate_agg_result::IntermediateAggregationResults,
) -> crate::Result<()> {
let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
let value_accessors = &agg_with_accessor.aggs.values[self.accessor_idx].value_accessors;
let intermediate_result =
IntermediateMetricResult::TopHits(self.into_top_hits_collector(value_accessors));
results.push(
name,
IntermediateAggregationResult::Metric(intermediate_result),
)
}
fn collect(
/// TODO add a specialized variant for a single sort field
fn collect_with(
&mut self,
doc_id: crate::DocId,
agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
req: &TopHitsAggregation,
accessors: &[(Column<u64>, ColumnType)],
) -> crate::Result<()> {
let accessors = &agg_with_accessor.aggs.values[self.accessor_idx].accessors;
let sorts: Vec<DocValueAndOrder> = self
.req
let sorts: Vec<DocValueAndOrder> = req
.sort
.iter()
.enumerate()
@@ -588,15 +560,62 @@ impl SegmentAggregationCollector for TopHitsSegmentCollector {
);
Ok(())
}
}
impl SegmentAggregationCollector for TopHitsSegmentCollector {
fn add_intermediate_aggregation_result(
self: Box<Self>,
agg_with_accessor: &crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
results: &mut crate::aggregation::intermediate_agg_result::IntermediateAggregationResults,
) -> crate::Result<()> {
let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
let value_accessors = &agg_with_accessor.aggs.values[self.accessor_idx].value_accessors;
let tophits_req = &agg_with_accessor.aggs.values[self.accessor_idx]
.agg
.agg
.as_top_hits()
.expect("aggregation request must be of type top hits");
let intermediate_result = IntermediateMetricResult::TopHits(
self.into_top_hits_collector(value_accessors, tophits_req),
);
results.push(
name,
IntermediateAggregationResult::Metric(intermediate_result),
)
}
/// TODO: Consider a caching layer to reduce the call overhead
fn collect(
&mut self,
doc_id: crate::DocId,
agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
) -> crate::Result<()> {
let tophits_req = &agg_with_accessor.aggs.values[self.accessor_idx]
.agg
.agg
.as_top_hits()
.expect("aggregation request must be of type top hits");
let accessors = &agg_with_accessor.aggs.values[self.accessor_idx].accessors;
self.collect_with(doc_id, tophits_req, accessors)?;
Ok(())
}
fn collect_block(
&mut self,
docs: &[crate::DocId],
agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
) -> crate::Result<()> {
let tophits_req = &agg_with_accessor.aggs.values[self.accessor_idx]
.agg
.agg
.as_top_hits()
.expect("aggregation request must be of type top hits");
let accessors = &agg_with_accessor.aggs.values[self.accessor_idx].accessors;
// TODO: Consider getting fields with the column block accessor.
for doc in docs {
self.collect(*doc, agg_with_accessor)?;
self.collect_with(*doc, tophits_req, accessors)?;
}
Ok(())
}

View File

@@ -158,15 +158,14 @@ use serde::de::{self, Visitor};
use serde::{Deserialize, Deserializer, Serialize};
fn parse_str_into_f64<E: de::Error>(value: &str) -> Result<f64, E> {
let parsed = value.parse::<f64>().map_err(|_err| {
de::Error::custom(format!("Failed to parse f64 from string: {:?}", value))
})?;
let parsed = value
.parse::<f64>()
.map_err(|_err| de::Error::custom(format!("Failed to parse f64 from string: {value:?}")))?;
// Check if the parsed value is NaN or infinity
if parsed.is_nan() || parsed.is_infinite() {
Err(de::Error::custom(format!(
"Value is not a valid f64 (NaN or Infinity): {:?}",
value
"Value is not a valid f64 (NaN or Infinity): {value:?}"
)))
} else {
Ok(parsed)
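
For context, a helper like `parse_str_into_f64` is typically wired into serde via `deserialize_with`. The following is a hedged, self-contained sketch of that wiring; the struct and field names are hypothetical and only illustrate the rejection of NaN/Infinity:

```
use serde::de::Error as _;
use serde::{Deserialize, Deserializer};

// Hypothetical field-level deserializer mirroring the helper above:
// parse an f64 from a string and reject NaN/Infinity.
fn f64_from_str<'de, D: Deserializer<'de>>(deserializer: D) -> Result<f64, D::Error> {
    let s = String::deserialize(deserializer)?;
    let parsed: f64 = s
        .parse()
        .map_err(|_| D::Error::custom(format!("Failed to parse f64 from string: {s:?}")))?;
    if parsed.is_nan() || parsed.is_infinite() {
        return Err(D::Error::custom(format!(
            "Value is not a valid f64 (NaN or Infinity): {s:?}"
        )));
    }
    Ok(parsed)
}

// Hypothetical request struct using the helper.
#[derive(Deserialize)]
struct AggParams {
    #[serde(deserialize_with = "f64_from_str")]
    sigma: f64,
}
```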

View File

@@ -11,12 +11,12 @@ use super::agg_req_with_accessor::{AggregationWithAccessor, AggregationsWithAcce
use super::bucket::{SegmentHistogramCollector, SegmentRangeCollector, SegmentTermCollector};
use super::intermediate_agg_result::IntermediateAggregationResults;
use super::metric::{
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation,
AverageAggregation, CountAggregation, ExtendedStatsAggregation, MaxAggregation, MinAggregation,
SegmentPercentilesCollector, SegmentStatsCollector, SegmentStatsType, StatsAggregation,
SumAggregation,
};
use crate::aggregation::bucket::TermMissingAgg;
use crate::aggregation::metric::TopHitsSegmentCollector;
use crate::aggregation::metric::{SegmentExtendedStatsCollector, TopHitsSegmentCollector};
pub(crate) trait SegmentAggregationCollector: CollectorClone + Debug {
fn add_intermediate_aggregation_result(
@@ -148,6 +148,9 @@ pub(crate) fn build_single_agg_segment_collector(
accessor_idx,
*missing,
))),
ExtendedStats(ExtendedStatsAggregation { missing, sigma, .. }) => Ok(Box::new(
SegmentExtendedStatsCollector::from_req(req.field_type, *sigma, accessor_idx, *missing),
)),
Sum(SumAggregation { missing, .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
req.field_type,
SegmentStatsType::Sum,

View File

@@ -598,7 +598,7 @@ mod tests {
let mid = n % 4;
n /= 4;
let leaf = n % 5;
Facet::from(&format!("/top{}/mid{}/leaf{}", top, mid, leaf))
Facet::from(&format!("/top{top}/mid{mid}/leaf{leaf}"))
})
.collect();
for i in 0..num_facets * 10 {
@@ -737,7 +737,7 @@ mod tests {
vec![("a", 10), ("b", 100), ("c", 7), ("d", 12), ("e", 21)]
.into_iter()
.flat_map(|(c, count)| {
let facet = Facet::from(&format!("/facet/{}", c));
let facet = Facet::from(&format!("/facet/{c}"));
let doc = doc!(facet_field => facet);
iter::repeat(doc).take(count)
})
@@ -785,7 +785,7 @@ mod tests {
let docs: Vec<TantivyDocument> = vec![("b", 2), ("a", 2), ("c", 4)]
.into_iter()
.flat_map(|(c, count)| {
let facet = Facet::from(&format!("/facet/{}", c));
let facet = Facet::from(&format!("/facet/{c}"));
let doc = doc!(facet_field => facet);
iter::repeat(doc).take(count)
})

View File

@@ -871,7 +871,10 @@ mod tests {
use crate::schema::{Field, Schema, FAST, STORED, TEXT};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
use crate::{DateTime, DocAddress, DocId, Index, IndexWriter, Order, Score, SegmentReader};
use crate::{
assert_nearly_equals, DateTime, DocAddress, DocId, Index, IndexWriter, Order, Score,
SegmentReader,
};
fn make_index() -> crate::Result<Index> {
let mut schema_builder = Schema::builder();

View File

@@ -184,28 +184,56 @@ mod tests {
fn test_cancel_cpu_intensive_tasks() {
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use std::time::Duration;
let counter: Arc<AtomicU64> = Default::default();
let other_counter: Arc<AtomicU64> = Default::default();
let mut futures = Vec::new();
let mut other_futures = Vec::new();
let (tx, rx) = crossbeam_channel::bounded::<()>(0);
let rx = Arc::new(rx);
let executor = Executor::multi_thread(3, "search-test").unwrap();
for _ in 0..1_000 {
let counter_clone = counter.clone();
for _ in 0..1000 {
let counter_clone: Arc<AtomicU64> = counter.clone();
let other_counter_clone: Arc<AtomicU64> = other_counter.clone();
let rx_clone = rx.clone();
let rx_clone2 = rx.clone();
let fut = executor.spawn_blocking(move || {
std::thread::sleep(Duration::from_millis(4));
counter_clone.fetch_add(1, Ordering::SeqCst)
counter_clone.fetch_add(1, Ordering::SeqCst);
let _ = rx_clone.recv();
});
futures.push(fut);
let other_fut = executor.spawn_blocking(move || {
other_counter_clone.fetch_add(1, Ordering::SeqCst);
let _ = rx_clone2.recv();
});
other_futures.push(other_fut);
}
std::thread::sleep(Duration::from_millis(5));
// The first few num_cores tasks should run, but the other should get cancelled.
drop(futures);
while Arc::strong_count(&counter) > 1 {
std::thread::sleep(Duration::from_millis(10));
// We execute 100 futures.
for _ in 0..100 {
tx.send(()).unwrap();
}
// with ideal timing, we expect the result to always be 6, but as long as we run some, and
// cancelled most, the test is a success
assert!(counter.load(Ordering::SeqCst) > 0);
assert!(counter.load(Ordering::SeqCst) < 50);
let counter_val = counter.load(Ordering::SeqCst);
let other_counter_val = other_counter.load(Ordering::SeqCst);
assert!(counter_val >= 30);
assert!(other_counter_val >= 30);
drop(other_futures);
// We execute 100 futures.
for _ in 0..100 {
tx.send(()).unwrap();
}
let counter_val2 = counter.load(Ordering::SeqCst);
assert!(counter_val2 >= counter_val + 100 - 6);
let other_counter_val2 = other_counter.load(Ordering::SeqCst);
assert!(other_counter_val2 <= other_counter_val + 6);
}
}

View File

@@ -338,14 +338,14 @@ mod tests {
let mut term = Term::from_field_json_path(field, "attributes.color", false);
term.append_type_and_str("red");
assert_eq!(
format!("{:?}", term),
format!("{term:?}"),
"Term(field=1, type=Json, path=attributes.color, type=Str, \"red\")"
);
let mut term = Term::from_field_json_path(field, "attributes.dimensions.width", false);
term.append_type_and_fast_value(400i64);
assert_eq!(
format!("{:?}", term),
format!("{term:?}"),
"Term(field=1, type=Json, path=attributes.dimensions.width, type=I64, 400)"
);
}

View File

@@ -566,7 +566,7 @@ mod tests {
let mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
let num_paths = 10;
let paths: Vec<PathBuf> = (0..num_paths)
.map(|i| PathBuf::from(&*format!("file_{}", i)))
.map(|i| PathBuf::from(&*format!("file_{i}")))
.collect();
{
for path in &paths {

View File

@@ -62,8 +62,7 @@ impl FacetReader {
#[cfg(test)]
mod tests {
use crate::schema::document::Value;
use crate::schema::{Facet, FacetOptions, SchemaBuilder, STORED};
use crate::schema::{Facet, FacetOptions, SchemaBuilder, Value, STORED};
use crate::{DocAddress, Index, IndexWriter, TantivyDocument};
#[test]
@@ -89,7 +88,9 @@ mod tests {
let doc = searcher
.doc::<TantivyDocument>(DocAddress::new(0u32, 0u32))
.unwrap();
let value = doc.get_first(facet_field).and_then(|v| v.as_facet());
let value = doc
.get_first(facet_field)
.and_then(|v| v.as_value().as_facet());
assert_eq!(value, None);
}

View File

@@ -252,9 +252,8 @@ impl IndexBuilder {
let field_type = entry.field_type().value_type();
if !supported_field_types.contains(&field_type) {
return Err(TantivyError::InvalidArgument(format!(
"Unsupported field type in sort_by_field: {:?}. Supported field types: \
{:?} ",
field_type, supported_field_types,
"Unsupported field type in sort_by_field: {field_type:?}. Supported field \
types: {supported_field_types:?} ",
)));
}
}

View File

@@ -318,14 +318,14 @@ impl SegmentReader {
if create_canonical {
// Without expand dots enabled dots need to be escaped.
let escaped_json_path = json_path.replace('.', "\\.");
let full_path = format!("{}.{}", field_name, escaped_json_path);
let full_path = format!("{field_name}.{escaped_json_path}");
let full_path_unescaped = format!("{}.{}", field_name, &json_path);
map_to_canonical.insert(full_path_unescaped, full_path.to_string());
full_path
} else {
// With expand dots enabled, we can use '.' instead of '\u{1}'.
json_path_sep_to_dot(&mut json_path);
format!("{}.{}", field_name, json_path)
format!("{field_name}.{json_path}")
}
};
indexed_fields.extend(

View File

@@ -306,12 +306,10 @@ mod tests_indexsorting {
let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher();
{
assert_eq!(
searcher
.doc::<TantivyDocument>(DocAddress::new(0, 0))?
.get_first(my_string_field),
None
);
assert!(searcher
.doc::<TantivyDocument>(DocAddress::new(0, 0))?
.get_first(my_string_field)
.is_none());
assert_eq!(
searcher
.doc::<TantivyDocument>(DocAddress::new(0, 3))?
@@ -344,7 +342,7 @@ mod tests_indexsorting {
Some("blublub")
);
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 4))?;
assert_eq!(doc.get_first(my_string_field), None);
assert!(doc.get_first(my_string_field).is_none());
}
// sort by field desc
let index = create_test_index(

View File

@@ -808,16 +808,15 @@ mod tests {
use proptest::prop_oneof;
use super::super::operation::UserOperation;
use crate::collector::TopDocs;
use crate::collector::{Count, TopDocs};
use crate::directory::error::LockError;
use crate::error::*;
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
use crate::indexer::NoMergePolicy;
use crate::query::{BooleanQuery, Occur, Query, QueryParser, TermQuery};
use crate::schema::document::Value;
use crate::schema::{
self, Facet, FacetOptions, IndexRecordOption, IpAddrOptions, NumericOptions, Schema,
TextFieldIndexing, TextOptions, FAST, INDEXED, STORED, STRING, TEXT,
TextFieldIndexing, TextOptions, Value, FAST, INDEXED, STORED, STRING, TEXT,
};
use crate::store::DOCSTORE_CACHE_CAPACITY;
use crate::{
@@ -1573,20 +1572,74 @@ mod tests {
Ok(())
}
#[derive(Debug, Clone, Copy)]
#[derive(Debug, Clone)]
enum IndexingOp {
AddDoc { id: u64 },
DeleteDoc { id: u64 },
DeleteDocQuery { id: u64 },
AddMultipleDoc {
id: u64,
num_docs: u64,
value: IndexValue,
},
AddDoc {
id: u64,
value: IndexValue,
},
DeleteDoc {
id: u64,
},
DeleteDocQuery {
id: u64,
},
Commit,
Merge,
}
impl IndexingOp {
fn add(id: u64) -> Self {
IndexingOp::AddDoc {
id,
value: IndexValue::F64(id as f64),
}
}
}
use serde::Serialize;
#[derive(Debug, Clone, Serialize)]
#[serde(untagged)]
enum IndexValue {
Str(String),
F64(f64),
U64(u64),
I64(i64),
}
impl Default for IndexValue {
fn default() -> Self {
IndexValue::F64(0.0)
}
}
fn value_strategy() -> impl Strategy<Value = IndexValue> {
prop_oneof![
any::<f64>().prop_map(IndexValue::F64),
any::<u64>().prop_map(IndexValue::U64),
any::<i64>().prop_map(IndexValue::I64),
any::<String>().prop_map(IndexValue::Str),
]
}
fn balanced_operation_strategy() -> impl Strategy<Value = IndexingOp> {
prop_oneof![
(0u64..20u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
(0u64..20u64).prop_map(|id| IndexingOp::DeleteDocQuery { id }),
(0u64..20u64).prop_map(|id| IndexingOp::AddDoc { id }),
(0u64..20u64, value_strategy())
.prop_map(move |(id, value)| IndexingOp::AddDoc { id, value }),
((0u64..20u64), (1u64..100), value_strategy()).prop_map(
move |(id, num_docs, value)| {
IndexingOp::AddMultipleDoc {
id,
num_docs,
value,
}
}
),
(0u64..1u64).prop_map(|_| IndexingOp::Commit),
(0u64..1u64).prop_map(|_| IndexingOp::Merge),
]
@@ -1596,7 +1649,17 @@ mod tests {
prop_oneof![
5 => (0u64..100u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
5 => (0u64..100u64).prop_map(|id| IndexingOp::DeleteDocQuery { id }),
50 => (0u64..100u64).prop_map(|id| IndexingOp::AddDoc { id }),
50 => (0u64..100u64, value_strategy())
.prop_map(move |(id, value)| IndexingOp::AddDoc { id, value }),
50 => (0u64..100u64, (1u64..100), value_strategy()).prop_map(
move |(id, num_docs, value)| {
IndexingOp::AddMultipleDoc {
id,
num_docs,
value,
}
}
),
2 => (0u64..1u64).prop_map(|_| IndexingOp::Commit),
1 => (0u64..1u64).prop_map(|_| IndexingOp::Merge),
]
@@ -1605,19 +1668,27 @@ mod tests {
fn expected_ids(ops: &[IndexingOp]) -> (HashMap<u64, u64>, HashSet<u64>) {
let mut existing_ids = HashMap::new();
let mut deleted_ids = HashSet::new();
for &op in ops {
for op in ops {
match op {
IndexingOp::AddDoc { id } => {
*existing_ids.entry(id).or_insert(0) += 1;
deleted_ids.remove(&id);
IndexingOp::AddDoc { id, value: _ } => {
*existing_ids.entry(*id).or_insert(0) += 1;
deleted_ids.remove(id);
}
IndexingOp::AddMultipleDoc {
id,
num_docs,
value: _,
} => {
*existing_ids.entry(*id).or_insert(0) += num_docs;
deleted_ids.remove(id);
}
IndexingOp::DeleteDoc { id } => {
existing_ids.remove(&id);
deleted_ids.insert(id);
deleted_ids.insert(*id);
}
IndexingOp::DeleteDocQuery { id } => {
existing_ids.remove(&id);
deleted_ids.insert(id);
deleted_ids.insert(*id);
}
_ => {}
}
@@ -1627,16 +1698,19 @@ mod tests {
fn get_id_list(ops: &[IndexingOp]) -> Vec<u64> {
let mut id_list = Vec::new();
for &op in ops {
for op in ops {
match op {
IndexingOp::AddDoc { id } => {
id_list.push(id);
IndexingOp::AddDoc { id, value: _ } => {
id_list.push(*id);
}
IndexingOp::AddMultipleDoc { id, .. } => {
id_list.push(*id);
}
IndexingOp::DeleteDoc { id } => {
id_list.retain(|el| *el != id);
id_list.retain(|el| el != id);
}
IndexingOp::DeleteDocQuery { id } => {
id_list.retain(|el| *el != id);
id_list.retain(|el| el != id);
}
_ => {}
}
@@ -1717,42 +1791,59 @@ mod tests {
let ip_from_id = |id| Ipv6Addr::from_u128(id as u128);
for &op in ops {
match op {
IndexingOp::AddDoc { id } => {
let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
let ip = ip_from_id(id);
if !id_is_full_doc(id) {
// every 3rd doc has no ip field
index_writer.add_document(doc!(
id_field=>id,
))?;
} else {
let json = json!({"date1": format!("2022-{id}-01T00:00:01Z"), "date2": format!("{id}-05-01T00:00:01Z"), "id": id, "ip": ip.to_string()});
index_writer.add_document(doc!(id_field=>id,
json_field=>json,
bytes_field => id.to_le_bytes().as_slice(),
id_opt_field => id,
ip_field => ip,
ips_field => ip,
ips_field => ip,
multi_numbers=> id,
multi_numbers => id,
bool_field => (id % 2u64) != 0,
i64_field => id as i64,
f64_field => id as f64,
date_field => DateTime::from_timestamp_secs(id as i64),
multi_bools => (id % 2u64) != 0,
multi_bools => (id % 2u64) == 0,
text_field => id.to_string(),
facet_field => facet,
large_text_field => LOREM,
multi_text_fields => multi_text_field_text1,
multi_text_fields => multi_text_field_text2,
multi_text_fields => multi_text_field_text3,
))?;
}
let add_docs = |index_writer: &mut IndexWriter,
id: u64,
value: IndexValue,
num: u64|
-> crate::Result<()> {
let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
let ip = ip_from_id(id);
let doc = if !id_is_full_doc(id) {
// every 3rd doc has no ip field
doc!(
id_field=>id,
)
} else {
let json = json!({"date1": format!("2022-{id}-01T00:00:01Z"), "date2": format!("{id}-05-01T00:00:01Z"), "id": id, "ip": ip.to_string(), "val": value});
doc!(id_field=>id,
json_field=>json,
bytes_field => id.to_le_bytes().as_slice(),
id_opt_field => id,
ip_field => ip,
ips_field => ip,
ips_field => ip,
multi_numbers=> id,
multi_numbers => id,
bool_field => (id % 2u64) != 0,
i64_field => id as i64,
f64_field => id as f64,
date_field => DateTime::from_timestamp_secs(id as i64),
multi_bools => (id % 2u64) != 0,
multi_bools => (id % 2u64) == 0,
text_field => id.to_string(),
facet_field => facet,
large_text_field => LOREM,
multi_text_fields => multi_text_field_text1,
multi_text_fields => multi_text_field_text2,
multi_text_fields => multi_text_field_text3,
)
};
for _ in 0..num {
index_writer.add_document(doc.clone())?;
}
Ok(())
};
for op in ops {
match op.clone() {
IndexingOp::AddMultipleDoc {
id,
num_docs,
value,
} => {
add_docs(&mut index_writer, id, value, num_docs)?;
}
IndexingOp::AddDoc { id, value } => {
add_docs(&mut index_writer, id, value, 1)?;
}
IndexingOp::DeleteDoc { id } => {
index_writer.delete_term(Term::from_field_u64(id_field, id));
@@ -1980,7 +2071,13 @@ mod tests {
.unwrap();
// test store iterator
for doc in store_reader.iter::<TantivyDocument>(segment_reader.alive_bitset()) {
let id = doc.unwrap().get_first(id_field).unwrap().as_u64().unwrap();
let id = doc
.unwrap()
.get_first(id_field)
.unwrap()
.as_value()
.as_u64()
.unwrap();
assert!(expected_ids_and_num_occurrences.contains_key(&id));
}
// test store random access
@@ -2013,7 +2110,7 @@ mod tests {
let mut bool2 = doc.get_all(multi_bools);
assert_eq!(bool, bool2.next().unwrap().as_bool().unwrap());
assert_ne!(bool, bool2.next().unwrap().as_bool().unwrap());
assert_eq!(None, bool2.next())
assert!(bool2.next().is_none())
}
}
}
@@ -2027,18 +2124,22 @@ mod tests {
top_docs.iter().map(|el| el.1).collect::<Vec<_>>()
};
let count_search = |term: &str, field| {
let query = QueryParser::for_index(&index, vec![field])
.parse_query(term)
.unwrap();
searcher.search(&query, &Count).unwrap()
};
let do_search2 = |term: Term| {
let count_search2 = |term: Term| {
let query = TermQuery::new(term, IndexRecordOption::Basic);
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(1000)).unwrap();
top_docs.iter().map(|el| el.1).collect::<Vec<_>>()
searcher.search(&query, &Count).unwrap()
};
for (id, count) in &expected_ids_and_num_occurrences {
// skip expensive queries
let (existing_id, count) = (*id, *count);
let get_num_hits = |field| do_search(&existing_id.to_string(), field).len() as u64;
let get_num_hits = |field| count_search(&existing_id.to_string(), field) as u64;
assert_eq!(get_num_hits(id_field), count);
if !id_is_full_doc(existing_id) {
continue;
@@ -2048,29 +2149,31 @@ mod tests {
assert_eq!(get_num_hits(f64_field), count);
// Test multi text
assert_eq!(
do_search("\"test1 test2\"", multi_text_fields).len(),
num_docs_with_values
);
assert_eq!(
do_search("\"test2 test3\"", multi_text_fields).len(),
num_docs_with_values
);
if num_docs_with_values < 1000 {
assert_eq!(
do_search("\"test1 test2\"", multi_text_fields).len(),
num_docs_with_values
);
assert_eq!(
do_search("\"test2 test3\"", multi_text_fields).len(),
num_docs_with_values
);
}
// Test bytes
let term = Term::from_field_bytes(bytes_field, existing_id.to_le_bytes().as_slice());
assert_eq!(do_search2(term).len() as u64, count);
assert_eq!(count_search2(term) as u64, count);
// Test date
let term = Term::from_field_date(
date_field,
DateTime::from_timestamp_secs(existing_id as i64),
);
assert_eq!(do_search2(term).len() as u64, count);
assert_eq!(count_search2(term) as u64, count);
}
for deleted_id in deleted_ids {
let assert_field = |field| {
assert_eq!(do_search(&deleted_id.to_string(), field).len() as u64, 0);
assert_eq!(count_search(&deleted_id.to_string(), field) as u64, 0);
};
assert_field(text_field);
assert_field(f64_field);
@@ -2079,12 +2182,12 @@ mod tests {
// Test bytes
let term = Term::from_field_bytes(bytes_field, deleted_id.to_le_bytes().as_slice());
assert_eq!(do_search2(term).len() as u64, 0);
assert_eq!(count_search2(term), 0);
// Test date
let term =
Term::from_field_date(date_field, DateTime::from_timestamp_secs(deleted_id as i64));
assert_eq!(do_search2(term).len() as u64, 0);
assert_eq!(count_search2(term), 0);
}
// search ip address
//
@@ -2093,13 +2196,13 @@ mod tests {
if !id_is_full_doc(existing_id) {
continue;
}
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
let do_search_ip_field = |term: &str| count_search(term, ip_field) as u64;
let ip_addr = Ipv6Addr::from_u128(existing_id as u128);
// Test incoming ip as ipv6
assert_eq!(do_search_ip_field(&format!("\"{ip_addr}\"")), count);
let term = Term::from_field_ip_addr(ip_field, ip_addr);
assert_eq!(do_search2(term).len() as u64, count);
assert_eq!(count_search2(term) as u64, count);
// Test incoming ip as ipv4
if let Some(ip_addr) = ip_addr.to_ipv4_mapped() {
@@ -2116,7 +2219,7 @@ mod tests {
if !sample.is_empty() {
let (left_sample, right_sample) = sample.split_at(sample.len() / 2);
let expected_count = |sample: &[(&u64, &u64)]| {
let calc_expected_count = |sample: &[(&u64, &u64)]| {
sample
.iter()
.filter(|(id, _)| id_is_full_doc(**id))
@@ -2132,18 +2235,17 @@ mod tests {
}
// Query first half
if !left_sample.is_empty() {
let expected_count = expected_count(left_sample);
let expected_count = calc_expected_count(left_sample);
if !left_sample.is_empty() && expected_count < 1000 {
let start_range = *left_sample[0].0;
let end_range = *left_sample.last().unwrap().0;
let query = gen_query_inclusive("id_opt", start_range, end_range);
assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count);
assert_eq!(count_search(&query, id_opt_field) as u64, expected_count);
// Range query on ip field
let ip1 = ip_from_id(start_range);
let ip2 = ip_from_id(end_range);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
let do_search_ip_field = |term: &str| count_search(term, ip_field) as u64;
let query = gen_query_inclusive("ip", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ip", "*", ip2);
@@ -2155,19 +2257,19 @@ mod tests {
assert_eq!(do_search_ip_field(&query), expected_count);
}
// Query second half
if !right_sample.is_empty() {
let expected_count = expected_count(right_sample);
let expected_count = calc_expected_count(right_sample);
if !right_sample.is_empty() && expected_count < 1000 {
let start_range = *right_sample[0].0;
let end_range = *right_sample.last().unwrap().0;
// Range query on id opt field
let query =
gen_query_inclusive("id_opt", start_range.to_string(), end_range.to_string());
assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count);
assert_eq!(count_search(&query, id_opt_field) as u64, expected_count);
// Range query on ip field
let ip1 = ip_from_id(start_range);
let ip2 = ip_from_id(end_range);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
let do_search_ip_field = |term: &str| count_search(term, ip_field) as u64;
let query = gen_query_inclusive("ip", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ip", ip1, "*");
@@ -2192,7 +2294,7 @@ mod tests {
};
let ip = ip_from_id(existing_id);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
let do_search_ip_field = |term: &str| count_search(term, ip_field) as u64;
// Range query on single value field
let query = gen_query_inclusive("ip", ip, ip);
assert_eq!(do_search_ip_field(&query), count);
@@ -2252,7 +2354,7 @@ mod tests {
#[test]
fn test_fast_field_range() {
let ops: Vec<_> = (0..1000).map(|id| IndexingOp::AddDoc { id }).collect();
let ops: Vec<_> = (0..1000).map(|id| IndexingOp::add(id)).collect();
assert!(test_operation_strategy(&ops, false, true).is_ok());
}
@@ -2260,8 +2362,8 @@ mod tests {
fn test_sort_index_on_opt_field_regression() {
assert!(test_operation_strategy(
&[
IndexingOp::AddDoc { id: 81 },
IndexingOp::AddDoc { id: 70 },
IndexingOp::add(81),
IndexingOp::add(70),
IndexingOp::DeleteDoc { id: 70 }
],
true,
@@ -2270,14 +2372,45 @@ mod tests {
.is_ok());
}
#[test]
fn test_simple_multiple_doc() {
assert!(test_operation_strategy(
&[
IndexingOp::AddMultipleDoc {
id: 7,
num_docs: 800,
value: IndexValue::U64(0),
},
IndexingOp::AddMultipleDoc {
id: 92,
num_docs: 800,
value: IndexValue::U64(0),
},
IndexingOp::AddMultipleDoc {
id: 30,
num_docs: 800,
value: IndexValue::U64(0),
},
IndexingOp::AddMultipleDoc {
id: 33,
num_docs: 800,
value: IndexValue::U64(0),
},
],
true,
false
)
.is_ok());
}
#[test]
fn test_ip_range_query_multivalue_bug() {
assert!(test_operation_strategy(
&[
IndexingOp::AddDoc { id: 2 },
IndexingOp::add(2),
IndexingOp::Commit,
IndexingOp::AddDoc { id: 1 },
IndexingOp::AddDoc { id: 1 },
IndexingOp::add(1),
IndexingOp::add(1),
IndexingOp::Commit,
IndexingOp::Merge
],
@@ -2291,11 +2424,11 @@ mod tests {
fn test_ff_num_ips_regression() {
assert!(test_operation_strategy(
&[
IndexingOp::AddDoc { id: 13 },
IndexingOp::AddDoc { id: 1 },
IndexingOp::add(13),
IndexingOp::add(1),
IndexingOp::Commit,
IndexingOp::DeleteDocQuery { id: 13 },
IndexingOp::AddDoc { id: 1 },
IndexingOp::add(1),
IndexingOp::Commit,
],
false,
@@ -2307,7 +2440,7 @@ mod tests {
#[test]
fn test_minimal_sort_force_end_merge() {
assert!(test_operation_strategy(
&[IndexingOp::AddDoc { id: 23 }, IndexingOp::AddDoc { id: 13 },],
&[IndexingOp::add(23), IndexingOp::add(13),],
false,
false
)
@@ -2368,8 +2501,8 @@ mod tests {
fn test_minimal_sort_force_end_merge_with_delete() {
assert!(test_operation_strategy(
&[
IndexingOp::AddDoc { id: 23 },
IndexingOp::AddDoc { id: 13 },
IndexingOp::add(23),
IndexingOp::add(13),
IndexingOp::DeleteDoc { id: 13 }
],
true,
@@ -2382,8 +2515,8 @@ mod tests {
fn test_minimal_no_sort_no_force_end_merge() {
assert!(test_operation_strategy(
&[
IndexingOp::AddDoc { id: 23 },
IndexingOp::AddDoc { id: 13 },
IndexingOp::add(23),
IndexingOp::add(13),
IndexingOp::DeleteDoc { id: 13 }
],
false,
@@ -2394,7 +2527,7 @@ mod tests {
#[test]
fn test_minimal_sort_merge() {
assert!(test_operation_strategy(&[IndexingOp::AddDoc { id: 3 },], true, true).is_ok());
assert!(test_operation_strategy(&[IndexingOp::add(3),], true, true).is_ok());
}
use proptest::prelude::*;
@@ -2490,14 +2623,14 @@ mod tests {
fn test_delete_bug_reproduction_ip_addr() {
use IndexingOp::*;
let ops = &[
AddDoc { id: 1 },
AddDoc { id: 2 },
IndexingOp::add(1),
IndexingOp::add(2),
Commit,
AddDoc { id: 3 },
IndexingOp::add(3),
DeleteDoc { id: 1 },
Commit,
Merge,
AddDoc { id: 4 },
IndexingOp::add(4),
Commit,
];
test_operation_strategy(&ops[..], false, true).unwrap();
@@ -2506,7 +2639,13 @@ mod tests {
#[test]
fn test_merge_regression_1() {
use IndexingOp::*;
let ops = &[AddDoc { id: 15 }, Commit, AddDoc { id: 9 }, Commit, Merge];
let ops = &[
IndexingOp::add(15),
Commit,
IndexingOp::add(9),
Commit,
Merge,
];
test_operation_strategy(&ops[..], false, true).unwrap();
}
@@ -2514,9 +2653,9 @@ mod tests {
fn test_range_query_bug_1() {
use IndexingOp::*;
let ops = &[
AddDoc { id: 9 },
AddDoc { id: 0 },
AddDoc { id: 13 },
IndexingOp::add(9),
IndexingOp::add(0),
IndexingOp::add(13),
Commit,
];
test_operation_strategy(&ops[..], false, true).unwrap();
@@ -2524,12 +2663,11 @@ mod tests {
#[test]
fn test_range_query_bug_2() {
use IndexingOp::*;
let ops = &[
AddDoc { id: 3 },
AddDoc { id: 6 },
AddDoc { id: 9 },
AddDoc { id: 10 },
IndexingOp::add(3),
IndexingOp::add(6),
IndexingOp::add(9),
IndexingOp::add(10),
];
test_operation_strategy(&ops[..], false, false).unwrap();
}
@@ -2551,7 +2689,7 @@ mod tests {
assert!(test_operation_strategy(
&[
IndexingOp::DeleteDoc { id: 0 },
IndexingOp::AddDoc { id: 6 },
IndexingOp::add(6),
IndexingOp::DeleteDocQuery { id: 11 },
IndexingOp::Commit,
IndexingOp::Merge,
@@ -2568,10 +2706,13 @@ mod tests {
fn test_bug_1617_2() {
assert!(test_operation_strategy(
&[
IndexingOp::AddDoc { id: 13 },
IndexingOp::AddDoc {
id: 13,
value: Default::default()
},
IndexingOp::DeleteDoc { id: 13 },
IndexingOp::Commit,
IndexingOp::AddDoc { id: 30 },
IndexingOp::add(30),
IndexingOp::Commit,
IndexingOp::Merge,
],

View File

@@ -787,6 +787,8 @@ impl IndexMerger {
mod tests {
use columnar::Column;
use proptest::prop_oneof;
use proptest::strategy::Strategy;
use schema::FAST;
use crate::collector::tests::{
@@ -794,11 +796,11 @@ mod tests {
};
use crate::collector::{Count, FacetCollector};
use crate::index::{Index, SegmentId};
use crate::indexer::NoMergePolicy;
use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery};
use crate::schema::document::Value;
use crate::schema::{
Facet, FacetOptions, IndexRecordOption, NumericOptions, TantivyDocument, Term,
TextFieldIndexing, INDEXED, TEXT,
TextFieldIndexing, Value, INDEXED, TEXT,
};
use crate::time::OffsetDateTime;
use crate::{
@@ -910,15 +912,24 @@ mod tests {
}
{
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 0))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("af b"));
assert_eq!(
doc.get_first(text_field).unwrap().as_value().as_str(),
Some("af b")
);
}
{
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 1))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c"));
assert_eq!(
doc.get_first(text_field).unwrap().as_value().as_str(),
Some("a b c")
);
}
{
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 2))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c d"));
assert_eq!(
doc.get_first(text_field).unwrap().as_value().as_str(),
Some("a b c d")
);
}
{
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 3))?;
@@ -1523,6 +1534,112 @@ mod tests {
Ok(())
}
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
enum IndexingOp {
ZeroVal,
OneVal { val: u64 },
TwoVal { val: u64 },
Commit,
}
fn balanced_operation_strategy() -> impl Strategy<Value = IndexingOp> {
prop_oneof![
(0u64..1u64).prop_map(|_| IndexingOp::ZeroVal),
(0u64..1u64).prop_map(|val| IndexingOp::OneVal { val }),
(0u64..1u64).prop_map(|val| IndexingOp::TwoVal { val }),
(0u64..1u64).prop_map(|_| IndexingOp::Commit),
]
}
use proptest::prelude::*;
proptest! {
#[test]
fn test_merge_columnar_int_proptest(ops in proptest::collection::vec(balanced_operation_strategy(), 1..20)) {
assert!(test_merge_int_fields(&ops[..]).is_ok());
}
}
fn test_merge_int_fields(ops: &[IndexingOp]) -> crate::Result<()> {
if ops.iter().all(|op| *op == IndexingOp::Commit) {
return Ok(());
}
let expected_doc_and_vals: Vec<(u32, Vec<u64>)> = ops
.iter()
.filter(|op| *op != &IndexingOp::Commit)
.map(|op| match op {
IndexingOp::ZeroVal => vec![],
IndexingOp::OneVal { val } => vec![*val],
IndexingOp::TwoVal { val } => vec![*val, *val],
IndexingOp::Commit => unreachable!(),
})
.enumerate()
.map(|(id, val)| (id as u32, val))
.collect();
let mut schema_builder = schema::Schema::builder();
let int_options = NumericOptions::default().set_fast().set_indexed();
let int_field = schema_builder.add_u64_field("intvals", int_options);
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| {
let mut doc = TantivyDocument::default();
for &val in int_vals {
doc.add_u64(int_field, val);
}
index_writer.add_document(doc).unwrap();
};
for op in ops {
match op {
IndexingOp::ZeroVal => index_doc(&mut index_writer, &[]),
IndexingOp::OneVal { val } => index_doc(&mut index_writer, &[*val]),
IndexingOp::TwoVal { val } => index_doc(&mut index_writer, &[*val, *val]),
IndexingOp::Commit => {
index_writer.commit().expect("commit failed");
}
}
}
index_writer.commit().expect("commit failed");
}
{
let mut segment_ids = index.searchable_segment_ids()?;
segment_ids.sort();
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
let reader = index.reader()?;
reader.reload()?;
let mut vals: Vec<u64> = Vec::new();
let mut test_vals = move |col: &Column<u64>, doc: DocId, expected: &[u64]| {
vals.clear();
vals.extend(col.values_for_doc(doc));
assert_eq!(&vals[..], expected);
};
let mut test_col = move |col: &Column<u64>, column_expected: &[(u32, Vec<u64>)]| {
for (doc_id, vals) in column_expected.iter() {
test_vals(col, *doc_id, vals);
}
};
{
let searcher = reader.searcher();
let segment = searcher.segment_reader(0u32);
let col = segment
.fast_fields()
.column_opt::<u64>("intvals")
.unwrap()
.unwrap();
test_col(&col, &expected_doc_and_vals);
}
Ok(())
}
#[test]
fn test_merge_multivalued_int_fields_simple() -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder();

View File

@@ -5,10 +5,9 @@ mod tests {
use crate::index::Index;
use crate::postings::Postings;
use crate::query::QueryParser;
use crate::schema::document::Value;
use crate::schema::{
self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions,
TextFieldIndexing, TextOptions,
TextFieldIndexing, TextOptions, Value,
};
use crate::{
DocAddress, DocSet, IndexSettings, IndexSortByField, IndexWriter, Order, TantivyDocument,
@@ -281,13 +280,16 @@ mod tests {
.doc::<TantivyDocument>(DocAddress::new(0, blubber_pos))
.unwrap();
assert_eq!(
doc.get_first(my_text_field).unwrap().as_str(),
doc.get_first(my_text_field).unwrap().as_value().as_str(),
Some("blubber")
);
let doc = searcher
.doc::<TantivyDocument>(DocAddress::new(0, 0))
.unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1000));
assert_eq!(
doc.get_first(int_field).unwrap().as_value().as_u64(),
Some(1000)
);
}
}

View File

@@ -216,7 +216,7 @@ mod tests_mmap {
let test_query = |query_str: &str| {
let query = parse_query.parse_query(query_str).unwrap();
let num_docs = searcher.search(&query, &Count).unwrap();
assert_eq!(num_docs, 1, "{}", query_str);
assert_eq!(num_docs, 1, "{query_str}");
};
test_query(format!("json.{field_name_out}:test1").as_str());
test_query(format!("json.a{field_name_out}:test2").as_str());
@@ -590,10 +590,10 @@ mod tests_mmap {
let query_parser = QueryParser::for_index(&index, vec![]);
// Test if field name can be queried
for (indexed_field, val) in fields_and_vals.iter() {
let query_str = &format!("{}:{}", indexed_field, val);
let query_str = &format!("{indexed_field}:{val}");
let query = query_parser.parse_query(query_str).unwrap();
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2)).unwrap();
assert!(!count_docs.is_empty(), "{}:{}", indexed_field, val);
assert!(!count_docs.is_empty(), "{indexed_field}:{val}");
}
// Test if field name can be used for aggregation
for (field_name, val) in fields_and_vals.iter() {

View File

@@ -202,9 +202,8 @@ impl SegmentWriter {
match field_entry.field_type() {
FieldType::Facet(_) => {
let mut facet_tokenizer = FacetTokenizer::default(); // this can be global
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
let facet_str = value.as_facet().ok_or_else(make_schema_error)?;
let mut facet_tokenizer = facet_tokenizer.token_stream(facet_str);
@@ -220,15 +219,14 @@ impl SegmentWriter {
}
FieldType::Str(_) => {
let mut indexing_position = IndexingPosition::default();
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
let mut token_stream = if let Some(text) = value.as_str() {
let text_analyzer =
&mut self.per_field_text_analyzers[field.field_id() as usize];
text_analyzer.token_stream(text)
} else if let Some(tok_str) = value.as_pre_tokenized_text() {
} else if let Some(tok_str) = value.into_pre_tokenized_text() {
BoxTokenStream::new(PreTokenizedStream::from(*tok_str.clone()))
} else {
continue;
@@ -250,9 +248,8 @@ impl SegmentWriter {
}
FieldType::U64(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
num_vals += 1;
let u64_val = value.as_u64().ok_or_else(make_schema_error)?;
@@ -265,10 +262,8 @@ impl SegmentWriter {
}
FieldType::Date(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value_access = value_access as D::Value<'_>;
let value = value_access.as_value();
for value in values {
let value = value.as_value();
num_vals += 1;
let date_val = value.as_datetime().ok_or_else(make_schema_error)?;
@@ -282,9 +277,8 @@ impl SegmentWriter {
}
FieldType::I64(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
num_vals += 1;
let i64_val = value.as_i64().ok_or_else(make_schema_error)?;
@@ -297,10 +291,8 @@ impl SegmentWriter {
}
FieldType::F64(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
num_vals += 1;
let f64_val = value.as_f64().ok_or_else(make_schema_error)?;
term_buffer.set_f64(f64_val);
@@ -312,10 +304,8 @@ impl SegmentWriter {
}
FieldType::Bool(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
num_vals += 1;
let bool_val = value.as_bool().ok_or_else(make_schema_error)?;
term_buffer.set_bool(bool_val);
@@ -327,10 +317,8 @@ impl SegmentWriter {
}
FieldType::Bytes(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
num_vals += 1;
let bytes = value.as_bytes().ok_or_else(make_schema_error)?;
term_buffer.set_bytes(bytes);
@@ -364,9 +352,8 @@ impl SegmentWriter {
}
FieldType::IpAddr(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
num_vals += 1;
let ip_addr = value.as_ip_addr().ok_or_else(make_schema_error)?;
@@ -499,10 +486,9 @@ mod tests {
use crate::fastfield::FastValue;
use crate::postings::{Postings, TermInfo};
use crate::query::{PhraseQuery, QueryParser};
use crate::schema::document::Value;
use crate::schema::{
Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, STORED,
STRING, TEXT,
Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, Value,
STORED, STRING, TEXT,
};
use crate::store::{Compressor, StoreReader, StoreWriter};
use crate::time::format_description::well_known::Rfc3339;
@@ -555,9 +541,15 @@ mod tests {
let reader = StoreReader::open(directory.open_read(path).unwrap(), 0).unwrap();
let doc = reader.get::<TantivyDocument>(0).unwrap();
assert_eq!(doc.field_values().len(), 2);
assert_eq!(doc.field_values()[0].value().as_str(), Some("A"));
assert_eq!(doc.field_values()[1].value().as_str(), Some("title"));
assert_eq!(doc.field_values().count(), 2);
assert_eq!(
doc.get_all(text_field).next().unwrap().as_value().as_str(),
Some("A")
);
assert_eq!(
doc.get_all(text_field).nth(1).unwrap().as_value().as_str(),
Some("title")
);
}
#[test]
fn test_simple_json_indexing() {
@@ -641,7 +633,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", STORED | TEXT);
let schema = schema_builder.build();
let json_val: serde_json::Map<String, serde_json::Value> = serde_json::from_str(
let json_val: serde_json::Value = serde_json::from_str(
r#"{
"toto": "titi",
"float": -0.2,
@@ -669,14 +661,10 @@ mod tests {
doc_id: 0u32,
})
.unwrap();
let serdeser_json_val = serde_json::from_str::<serde_json::Map<String, serde_json::Value>>(
&doc.to_json(&schema),
)
.unwrap()
.get("json")
.unwrap()[0]
.as_object()
let serdeser_json_val = serde_json::from_str::<serde_json::Value>(&doc.to_json(&schema))
.unwrap()
.get("json")
.unwrap()[0]
.clone();
assert_eq!(json_val, serdeser_json_val);
let segment_reader = searcher.segment_reader(0u32);
@@ -840,7 +828,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", STRING);
let schema = schema_builder.build();
let json_val: serde_json::Map<String, serde_json::Value> =
let json_val: serde_json::Value =
serde_json::from_str(r#"{"mykey": "two tokens"}"#).unwrap();
let doc = doc!(json_field=>json_val);
let index = Index::create_in_ram(schema);
@@ -880,7 +868,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", TEXT);
let schema = schema_builder.build();
let json_val: serde_json::Map<String, serde_json::Value> = serde_json::from_str(
let json_val: serde_json::Value = serde_json::from_str(
r#"{"mykey": [{"field": "hello happy tax payer"}, {"field": "nothello"}]}"#,
)
.unwrap();

View File

@@ -397,16 +397,20 @@ pub mod tests {
#[macro_export]
macro_rules! assert_nearly_equals {
($left:expr, $right:expr) => {{
match (&$left, &$right) {
(left_val, right_val) => {
assert_nearly_equals!($left, $right, 0.0005);
}};
($left:expr, $right:expr, $epsilon:expr) => {{
match (&$left, &$right, &$epsilon) {
(left_val, right_val, epsilon_val) => {
let diff = (left_val - right_val).abs();
let add = left_val.abs() + right_val.abs();
if diff > 0.0005 * add {
if diff > *epsilon_val {
panic!(
r#"assertion failed: `(left ~= right)`
left: `{:?}`,
right: `{:?}`"#,
&*left_val, &*right_val
r#"assertion failed: `abs(left-right)>epsilon`
left: `{:?}`,
right: `{:?}`,
epsilon: `{:?}`"#,
&*left_val, &*right_val, &*epsilon_val
)
}
}
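
The reworked macro accepts an optional third argument and compares the absolute difference against it directly, falling back to `0.0005` when the epsilon is omitted. A short usage sketch (values chosen only for illustration):

```
#[test]
fn nearly_equals_usage() {
    // Two-argument form: uses the default epsilon of 0.0005.
    assert_nearly_equals!(0.3333f64, 1.0 / 3.0);

    // Three-argument form: explicit absolute tolerance.
    assert_nearly_equals!(10.0f64, 10.004, 0.01);

    // This one would panic: the difference (0.1) exceeds the epsilon (0.01).
    // assert_nearly_equals!(10.0f64, 10.1, 0.01);
}
```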
@@ -935,7 +939,7 @@ pub mod tests {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", STORED | TEXT);
let schema = schema_builder.build();
let json_val: serde_json::Map<String, serde_json::Value> = serde_json::from_str(
let json_val: serde_json::Value = serde_json::from_str(
r#"{
"signed": 2,
"float": 2.0,
@@ -1025,13 +1029,16 @@ pub mod tests {
text_field => "some other value",
other_text_field => "short");
assert_eq!(document.len(), 3);
let values: Vec<&OwnedValue> = document.get_all(text_field).collect();
let values: Vec<OwnedValue> = document.get_all(text_field).map(OwnedValue::from).collect();
assert_eq!(values.len(), 2);
assert_eq!(values[0].as_str(), Some("tantivy"));
assert_eq!(values[1].as_str(), Some("some other value"));
let values: Vec<&OwnedValue> = document.get_all(other_text_field).collect();
assert_eq!(values[0].as_ref().as_str(), Some("tantivy"));
assert_eq!(values[1].as_ref().as_str(), Some("some other value"));
let values: Vec<OwnedValue> = document
.get_all(other_text_field)
.map(OwnedValue::from)
.collect();
assert_eq!(values.len(), 1);
assert_eq!(values[0].as_str(), Some("short"));
assert_eq!(values[0].as_ref().as_str(), Some("short"));
}
#[test]

View File

@@ -41,6 +41,7 @@
/// );
/// # }
/// ```
#[macro_export]
macro_rules! doc(
() => {
@@ -52,7 +53,7 @@ macro_rules! doc(
{
let mut document = $crate::TantivyDocument::default();
$(
document.add_field_value($field, $value);
document.add_field_value($field, &$value);
)*
document
}

View File

@@ -138,8 +138,7 @@ impl FuzzyTermQuery {
if json_path_type != Type::Str {
return Err(InvalidArgument(format!(
"The fuzzy term query requires a string path type for a json term. Found \
{:?}",
json_path_type
{json_path_type:?}"
)));
}
}

View File

@@ -2,7 +2,7 @@ use std::fmt;
use std::ops::Bound;
use crate::query::Occur;
use crate::schema::{Field, Term, Type};
use crate::schema::{Term, Type};
use crate::Score;
#[derive(Clone)]
@@ -20,8 +20,6 @@ pub enum LogicalLiteral {
upper: Bound<Term>,
},
Set {
field: Field,
value_type: Type,
elements: Vec<Term>,
},
All,

View File

@@ -832,17 +832,11 @@ impl QueryParser {
let (field, json_path) = try_tuple!(self
.split_full_path(&full_path)
.ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone())));
let field_entry = self.schema.get_field_entry(field);
let value_type = field_entry.field_type().value_type();
let (elements, errors) = elements
.into_iter()
.map(|element| self.compute_boundary_term(field, json_path, &element))
.partition_result();
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Set {
elements,
field,
value_type,
}));
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Set { elements }));
(Some(logical_ast), errors)
}
UserInputLeaf::Exists { .. } => (

View File

@@ -185,7 +185,7 @@ mod test {
Err(crate::TantivyError::InvalidArgument(msg)) => {
assert!(msg.contains("error: unclosed group"))
}
res => panic!("unexpected result: {:?}", res),
res => panic!("unexpected result: {res:?}"),
}
}
}

View File

@@ -1,93 +1,64 @@
use std::collections::{BTreeMap, HashMap, HashSet};
use std::io::{self, Read, Write};
use std::net::Ipv6Addr;
use common::DateTime;
use columnar::MonotonicallyMappableToU128;
use common::{read_u32_vint_no_advance, serialize_vint_u32, BinarySerializable, DateTime, VInt};
use serde_json::Map;
pub use CompactDoc as TantivyDocument;
use super::{ReferenceValue, ReferenceValueLeaf, Value};
use crate::schema::document::{
DeserializeError, Document, DocumentDeserialize, DocumentDeserializer,
};
use crate::schema::field_type::ValueParsingError;
use crate::schema::field_value::FieldValueIter;
use crate::schema::{Facet, Field, FieldValue, NamedFieldDocument, OwnedValue, Schema};
use crate::schema::{Facet, Field, NamedFieldDocument, OwnedValue, Schema};
use crate::tokenizer::PreTokenizedString;
/// TantivyDocument provides a default implementation of the `Document` trait.
/// It is the object that can be indexed and then searched for.
///
/// Documents are fundamentally a collection of unordered couples `(field, value)`.
/// In this list, one field may appear more than once.
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize, Default)]
pub struct TantivyDocument {
field_values: Vec<FieldValue>,
#[repr(packed)]
#[derive(Debug, Clone)]
/// A field value pair in the compact tantivy document
struct FieldValueAddr {
pub field: u16,
pub value_addr: ValueAddr,
}
impl Document for TantivyDocument {
type Value<'a> = &'a OwnedValue;
type FieldsValuesIter<'a> = FieldValueIter<'a>;
#[derive(Debug, Clone)]
/// The default document in tantivy. It encodes data in a compact form.
pub struct CompactDoc {
/// `node_data` is a vec of bytes, where each value is serialized into bytes and stored. It
/// includes all the data of the document and also metadata like where the nodes are located
/// in an object or array.
pub node_data: Vec<u8>,
/// The root (Field, Value) pairs
field_values: Vec<FieldValueAddr>,
}
fn iter_fields_and_values(&self) -> Self::FieldsValuesIter<'_> {
FieldValueIter(self.field_values.iter())
impl Default for CompactDoc {
fn default() -> Self {
Self::new()
}
}
impl DocumentDeserialize for TantivyDocument {
fn deserialize<'de, D>(mut deserializer: D) -> Result<Self, DeserializeError>
where D: DocumentDeserializer<'de> {
let mut field_values = Vec::with_capacity(deserializer.size_hint());
while let Some((field, value)) = deserializer.next_field()? {
field_values.push(FieldValue::new(field, value));
}
Ok(Self { field_values })
}
}
impl From<Vec<FieldValue>> for TantivyDocument {
fn from(field_values: Vec<FieldValue>) -> Self {
Self { field_values }
}
}
impl PartialEq for TantivyDocument {
fn eq(&self, other: &Self) -> bool {
// super slow, but only here for tests
let convert_to_comparable_map = |field_values: &[FieldValue]| {
let mut field_value_set: HashMap<Field, HashSet<String>> = Default::default();
for field_value in field_values.iter() {
let value = serde_json::to_string(field_value.value()).unwrap();
field_value_set
.entry(field_value.field())
.or_default()
.insert(value);
}
field_value_set
};
let self_field_values: HashMap<Field, HashSet<String>> =
convert_to_comparable_map(&self.field_values);
let other_field_values: HashMap<Field, HashSet<String>> =
convert_to_comparable_map(&other.field_values);
self_field_values.eq(&other_field_values)
}
}
impl Eq for TantivyDocument {}
impl IntoIterator for TantivyDocument {
type Item = FieldValue;
type IntoIter = std::vec::IntoIter<FieldValue>;
fn into_iter(self) -> Self::IntoIter {
self.field_values.into_iter()
}
}
impl TantivyDocument {
impl CompactDoc {
/// Creates a new, empty document object
pub fn new() -> TantivyDocument {
TantivyDocument::default()
/// The reserved capacity is for the total serialized data
pub fn with_capacity(bytes: usize) -> CompactDoc {
CompactDoc {
node_data: Vec::with_capacity(bytes),
field_values: Vec::with_capacity(4),
}
}
/// Creates a new, empty document object
pub fn new() -> CompactDoc {
CompactDoc::with_capacity(1024)
}
/// Shrinks the capacity of the document to fit the data
pub fn shrink_to_fit(&mut self) {
self.node_data.shrink_to_fit();
self.field_values.shrink_to_fit();
}
/// Returns the length of the document.
@@ -99,83 +70,111 @@ impl TantivyDocument {
pub fn add_facet<F>(&mut self, field: Field, path: F)
where Facet: From<F> {
let facet = Facet::from(path);
let value = OwnedValue::Facet(facet);
self.add_field_value(field, value);
self.add_leaf_field_value(field, ReferenceValueLeaf::Facet(facet.encoded_str()));
}
/// Add a text field.
pub fn add_text<S: ToString>(&mut self, field: Field, text: S) {
let value = OwnedValue::Str(text.to_string());
self.add_field_value(field, value);
pub fn add_text<S: AsRef<str>>(&mut self, field: Field, text: S) {
self.add_leaf_field_value(field, ReferenceValueLeaf::Str(text.as_ref()));
}
/// Add a pre-tokenized text field.
pub fn add_pre_tokenized_text(&mut self, field: Field, pre_tokenized_text: PreTokenizedString) {
self.add_field_value(field, pre_tokenized_text);
self.add_leaf_field_value(field, pre_tokenized_text);
}
/// Add a u64 field
pub fn add_u64(&mut self, field: Field, value: u64) {
self.add_field_value(field, value);
self.add_leaf_field_value(field, value);
}
/// Add a IP address field. Internally only Ipv6Addr is used.
pub fn add_ip_addr(&mut self, field: Field, value: Ipv6Addr) {
self.add_field_value(field, value);
self.add_leaf_field_value(field, value);
}
/// Add a i64 field
pub fn add_i64(&mut self, field: Field, value: i64) {
self.add_field_value(field, value);
self.add_leaf_field_value(field, value);
}
/// Add a f64 field
pub fn add_f64(&mut self, field: Field, value: f64) {
self.add_field_value(field, value);
self.add_leaf_field_value(field, value);
}
/// Add a bool field
pub fn add_bool(&mut self, field: Field, value: bool) {
self.add_field_value(field, value);
self.add_leaf_field_value(field, value);
}
/// Add a date field with unspecified time zone offset
pub fn add_date(&mut self, field: Field, value: DateTime) {
self.add_field_value(field, value);
self.add_leaf_field_value(field, value);
}
/// Add a bytes field
pub fn add_bytes<T: Into<Vec<u8>>>(&mut self, field: Field, value: T) {
self.add_field_value(field, value.into());
pub fn add_bytes(&mut self, field: Field, value: &[u8]) {
self.add_leaf_field_value(field, value);
}
/// Add a dynamic object field
pub fn add_object(&mut self, field: Field, object: BTreeMap<String, OwnedValue>) {
self.add_field_value(field, object);
self.add_field_value(field, &OwnedValue::from(object));
}
/// Add a (field, value) to the document.
pub fn add_field_value<T: Into<OwnedValue>>(&mut self, field: Field, typed_val: T) {
///
/// `OwnedValue` implements Value, which should be easiest to use, but is not the most
/// performant.
pub fn add_field_value<'a, V: Value<'a>>(&mut self, field: Field, value: V) {
let field_value = FieldValueAddr {
field: field
.field_id()
.try_into()
.expect("support only up to u16::MAX field ids"),
value_addr: self.add_value(value),
};
self.field_values.push(field_value);
}
/// Add a (field, leaf value) to the document.
/// Leaf values don't have nested values.
pub fn add_leaf_field_value<'a, T: Into<ReferenceValueLeaf<'a>>>(
&mut self,
field: Field,
typed_val: T,
) {
let value = typed_val.into();
let field_value = FieldValue { field, value };
let field_value = FieldValueAddr {
field: field
.field_id()
.try_into()
.expect("support only up to u16::MAX field ids"),
value_addr: self.add_value_leaf(value),
};
self.field_values.push(field_value);
}
/// field_values accessor
pub fn field_values(&self) -> &[FieldValue] {
&self.field_values
pub fn field_values(&self) -> impl Iterator<Item = (Field, CompactDocValue<'_>)> {
self.field_values.iter().map(|field_val| {
let field = Field::from_field_id(field_val.field as u32);
let val = self.get_compact_doc_value(field_val.value_addr);
(field, val)
})
}
/// Returns all of the `FieldValue`s associated the given field
pub fn get_all(&self, field: Field) -> impl Iterator<Item = &OwnedValue> {
/// Returns all of the `ReferenceValue`s associated the given field
pub fn get_all(&self, field: Field) -> impl Iterator<Item = CompactDocValue<'_>> + '_ {
self.field_values
.iter()
.filter(move |field_value| field_value.field() == field)
.map(FieldValue::value)
.filter(move |field_value| Field::from_field_id(field_value.field as u32) == field)
.map(|val| self.get_compact_doc_value(val.value_addr))
}
/// Returns the first `FieldValue` associated the given field
pub fn get_first(&self, field: Field) -> Option<&OwnedValue> {
/// Returns the first `ReferenceValue` associated the given field
pub fn get_first(&self, field: Field) -> Option<CompactDocValue<'_>> {
self.get_all(field).next()
}
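
Taken together, the accessors above change how callers read values back: `get_first`/`get_all` now return lightweight `CompactDocValue` handles instead of `&OwnedValue`, and the payload is reached through `as_value()`. A hedged usage sketch based only on the methods visible in this diff (the schema setup is illustrative):

```
// Illustrative only: assumes a schema with a text field named "title".
use tantivy::schema::{Schema, Value, TEXT};
use tantivy::TantivyDocument;

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let _schema = schema_builder.build();

    // TantivyDocument is now an alias for CompactDoc; values are serialized
    // into its internal `node_data` buffer as they are added.
    let mut doc = TantivyDocument::new();
    doc.add_text(title, "hello");
    doc.add_text(title, "world");

    // Reading back goes through CompactDocValue and the Value trait.
    let first = doc.get_first(title).unwrap();
    assert_eq!(first.as_value().as_str(), Some("hello"));
    assert_eq!(doc.get_all(title).count(), 2);
}
```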
@@ -183,12 +182,12 @@ impl TantivyDocument {
pub fn convert_named_doc(
schema: &Schema,
named_doc: NamedFieldDocument,
) -> Result<TantivyDocument, DocParsingError> {
let mut document = TantivyDocument::new();
) -> Result<Self, DocParsingError> {
let mut document = Self::new();
for (field_name, values) in named_doc.0 {
if let Ok(field) = schema.get_field(&field_name) {
for value in values {
document.add_field_value(field, value);
document.add_field_value(field, &value);
}
}
}
@@ -196,7 +195,7 @@ impl TantivyDocument {
}
/// Build a document object from a json-object.
pub fn parse_json(schema: &Schema, doc_json: &str) -> Result<TantivyDocument, DocParsingError> {
pub fn parse_json(schema: &Schema, doc_json: &str) -> Result<Self, DocParsingError> {
let json_obj: Map<String, serde_json::Value> =
serde_json::from_str(doc_json).map_err(|_| DocParsingError::invalid_json(doc_json))?;
Self::from_json_object(schema, json_obj)
@@ -206,8 +205,8 @@ impl TantivyDocument {
pub fn from_json_object(
schema: &Schema,
json_obj: Map<String, serde_json::Value>,
) -> Result<TantivyDocument, DocParsingError> {
let mut doc = TantivyDocument::default();
) -> Result<Self, DocParsingError> {
let mut doc = Self::default();
for (field_name, json_value) in json_obj {
if let Ok(field) = schema.get_field(&field_name) {
let field_entry = schema.get_field_entry(field);
@@ -218,20 +217,482 @@ impl TantivyDocument {
let value = field_type
.value_from_json(json_item)
.map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
doc.add_field_value(field, value);
doc.add_field_value(field, &value);
}
}
_ => {
let value = field_type
.value_from_json(json_value)
.map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
doc.add_field_value(field, value);
doc.add_field_value(field, &value);
}
}
}
}
Ok(doc)
}
fn add_value_leaf(&mut self, leaf: ReferenceValueLeaf) -> ValueAddr {
let type_id = ValueType::from(&leaf);
// Write into `node_data` and return u32 position as its address
// Null and bool are inlined into the address
let val_addr = match leaf {
ReferenceValueLeaf::Null => 0,
ReferenceValueLeaf::Str(bytes) => {
write_bytes_into(&mut self.node_data, bytes.as_bytes())
}
ReferenceValueLeaf::Facet(bytes) => {
write_bytes_into(&mut self.node_data, bytes.as_bytes())
}
ReferenceValueLeaf::Bytes(bytes) => write_bytes_into(&mut self.node_data, bytes),
ReferenceValueLeaf::U64(num) => write_into(&mut self.node_data, num),
ReferenceValueLeaf::I64(num) => write_into(&mut self.node_data, num),
ReferenceValueLeaf::F64(num) => write_into(&mut self.node_data, num),
ReferenceValueLeaf::Bool(b) => b as u32,
ReferenceValueLeaf::Date(date) => {
write_into(&mut self.node_data, date.into_timestamp_nanos())
}
ReferenceValueLeaf::IpAddr(num) => write_into(&mut self.node_data, num.to_u128()),
ReferenceValueLeaf::PreTokStr(pre_tok) => write_into(&mut self.node_data, *pre_tok),
};
ValueAddr { type_id, val_addr }
}
/// Adds a value and returns its address into `node_data`
fn add_value<'a, V: Value<'a>>(&mut self, value: V) -> ValueAddr {
let value = value.as_value();
let type_id = ValueType::from(&value);
match value {
ReferenceValue::Leaf(leaf) => self.add_value_leaf(leaf),
ReferenceValue::Array(elements) => {
// addresses of the elements in node_data
// Reusing a vec would be nicer, but it's not easy because of the recursion
// A global vec would work if every writer got its own discriminator
let mut addresses = Vec::new();
for elem in elements {
let value_addr = self.add_value(elem);
write_into(&mut addresses, value_addr);
}
ValueAddr {
type_id,
val_addr: write_bytes_into(&mut self.node_data, &addresses),
}
}
ReferenceValue::Object(entries) => {
// addresses of the elements in node_data
let mut addresses = Vec::new();
for (key, value) in entries {
let key_addr = self.add_value_leaf(ReferenceValueLeaf::Str(key));
let value_addr = self.add_value(value);
write_into(&mut addresses, key_addr);
write_into(&mut addresses, value_addr);
}
ValueAddr {
type_id,
val_addr: write_bytes_into(&mut self.node_data, &addresses),
}
}
}
}
/// Get CompactDocValue for address
fn get_compact_doc_value(&self, value_addr: ValueAddr) -> CompactDocValue<'_> {
CompactDocValue {
container: self,
value_addr,
}
}
/// get &[u8] reference from node_data
fn extract_bytes(&self, addr: Addr) -> &[u8] {
binary_deserialize_bytes(self.get_slice(addr))
}
/// get &str reference from node_data
fn extract_str(&self, addr: Addr) -> &str {
let data = self.extract_bytes(addr);
// Utf-8 checks would have a noticeable performance overhead here
unsafe { std::str::from_utf8_unchecked(data) }
}
/// deserialized owned value from node_data
fn read_from<T: BinarySerializable>(&self, addr: Addr) -> io::Result<T> {
let data_slice = &self.node_data[addr as usize..];
let mut cursor = std::io::Cursor::new(data_slice);
T::deserialize(&mut cursor)
}
/// get slice from address. The returned slice is open ended
fn get_slice(&self, addr: Addr) -> &[u8] {
&self.node_data[addr as usize..]
}
}
/// BinarySerializable alternative to read references
fn binary_deserialize_bytes(data: &[u8]) -> &[u8] {
let (len, bytes_read) = read_u32_vint_no_advance(data);
&data[bytes_read..bytes_read + len as usize]
}
/// Write bytes and return the position of the written data.
///
/// `BinarySerializable` alternative that writes a byte-slice reference
fn write_bytes_into(vec: &mut Vec<u8>, data: &[u8]) -> u32 {
let pos = vec.len() as u32;
let mut buf = [0u8; 8];
let len_vint_bytes = serialize_vint_u32(data.len() as u32, &mut buf);
vec.extend_from_slice(len_vint_bytes);
vec.extend_from_slice(data);
pos
}
/// Serialize and return the position
fn write_into<T: BinarySerializable>(vec: &mut Vec<u8>, value: T) -> u32 {
let pos = vec.len() as u32;
value.serialize(vec).unwrap();
pos
}
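All payloads end up in one append-only `Vec<u8>` and are addressed by their starting offset; variable-length values get a vint length prefix. The standalone sketch below (not part of the diff) illustrates that layout with a plain little-endian `u32` length prefix standing in for the vint, so the helper names and encoding here are simplifications:

```rust
// Sketch of the append-only buffer layout used by CompactDoc: every value is
// written at the end of one Vec<u8> and addressed by its starting offset.
// A fixed u32 length prefix stands in for tantivy's vint encoding.

fn write_bytes(buf: &mut Vec<u8>, data: &[u8]) -> u32 {
    let pos = buf.len() as u32; // the address handed back to the caller
    buf.extend_from_slice(&(data.len() as u32).to_le_bytes()); // simplified length prefix
    buf.extend_from_slice(data); // payload
    pos
}

fn read_bytes(buf: &[u8], addr: u32) -> &[u8] {
    let start = addr as usize;
    let len = u32::from_le_bytes(buf[start..start + 4].try_into().unwrap()) as usize;
    &buf[start + 4..start + 4 + len]
}

fn main() {
    let mut node_data = Vec::new();
    let title_addr = write_bytes(&mut node_data, b"title");
    let body_addr = write_bytes(&mut node_data, b"My first document");
    assert_eq!(read_bytes(&node_data, title_addr), b"title");
    assert_eq!(read_bytes(&node_data, body_addr), b"My first document");
}
```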
impl PartialEq for CompactDoc {
fn eq(&self, other: &Self) -> bool {
// super slow, but only here for tests
let convert_to_comparable_map = |doc: &CompactDoc| {
let mut field_value_set: HashMap<Field, HashSet<String>> = Default::default();
for field_value in doc.field_values.iter() {
let value: OwnedValue = doc.get_compact_doc_value(field_value.value_addr).into();
let value = serde_json::to_string(&value).unwrap();
field_value_set
.entry(Field::from_field_id(field_value.field as u32))
.or_default()
.insert(value);
}
field_value_set
};
let self_field_values: HashMap<Field, HashSet<String>> = convert_to_comparable_map(self);
let other_field_values: HashMap<Field, HashSet<String>> = convert_to_comparable_map(other);
self_field_values.eq(&other_field_values)
}
}
impl Eq for CompactDoc {}
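Equality reduces each document to a `HashMap<Field, HashSet<String>>` of JSON-serialized values, so the comparison ignores insertion order (and duplicates) per field. A small illustration through the public API, assuming `TantivyDocument` is the `CompactDoc` alias and the usual `tantivy::schema` exports:

```rust
use tantivy::schema::{Schema, TEXT};
use tantivy::TantivyDocument;

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);

    let mut a = TantivyDocument::default();
    a.add_text(title, "x");
    a.add_text(title, "y");

    let mut b = TantivyDocument::default();
    b.add_text(title, "y");
    b.add_text(title, "x");

    // Per-field values are compared as sets of serialized values,
    // so insertion order does not affect equality.
    assert!(a == b);
}
```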
impl DocumentDeserialize for CompactDoc {
fn deserialize<'de, D>(mut deserializer: D) -> Result<Self, DeserializeError>
where D: DocumentDeserializer<'de> {
let mut doc = CompactDoc::default();
// TODO: Deserializing into OwnedValue is wasteful. The deserializer should be able to work
// on slices and referenced data.
while let Some((field, value)) = deserializer.next_field::<OwnedValue>()? {
doc.add_field_value(field, &value);
}
Ok(doc)
}
}
/// A value of a `CompactDoc`; it needs a reference to the container to extract its payload
#[derive(Debug, Clone, Copy)]
pub struct CompactDocValue<'a> {
container: &'a CompactDoc,
value_addr: ValueAddr,
}
impl PartialEq for CompactDocValue<'_> {
fn eq(&self, other: &Self) -> bool {
let value1: OwnedValue = (*self).into();
let value2: OwnedValue = (*other).into();
value1 == value2
}
}
impl<'a> From<CompactDocValue<'a>> for OwnedValue {
fn from(value: CompactDocValue) -> Self {
value.as_value().into()
}
}
impl<'a> Value<'a> for CompactDocValue<'a> {
type ArrayIter = CompactDocArrayIter<'a>;
type ObjectIter = CompactDocObjectIter<'a>;
fn as_value(&self) -> ReferenceValue<'a, Self> {
self.get_ref_value().unwrap()
}
}
impl<'a> CompactDocValue<'a> {
fn get_ref_value(&self) -> io::Result<ReferenceValue<'a, CompactDocValue<'a>>> {
let addr = self.value_addr.val_addr;
match self.value_addr.type_id {
ValueType::Null => Ok(ReferenceValueLeaf::Null.into()),
ValueType::Str => {
let str_ref = self.container.extract_str(addr);
Ok(ReferenceValueLeaf::Str(str_ref).into())
}
ValueType::Facet => {
let str_ref = self.container.extract_str(addr);
Ok(ReferenceValueLeaf::Facet(str_ref).into())
}
ValueType::Bytes => {
let data = self.container.extract_bytes(addr);
Ok(ReferenceValueLeaf::Bytes(data).into())
}
ValueType::U64 => self
.container
.read_from::<u64>(addr)
.map(ReferenceValueLeaf::U64)
.map(Into::into),
ValueType::I64 => self
.container
.read_from::<i64>(addr)
.map(ReferenceValueLeaf::I64)
.map(Into::into),
ValueType::F64 => self
.container
.read_from::<f64>(addr)
.map(ReferenceValueLeaf::F64)
.map(Into::into),
ValueType::Bool => Ok(ReferenceValueLeaf::Bool(addr != 0).into()),
ValueType::Date => self
.container
.read_from::<i64>(addr)
.map(|ts| ReferenceValueLeaf::Date(DateTime::from_timestamp_nanos(ts)))
.map(Into::into),
ValueType::IpAddr => self
.container
.read_from::<u128>(addr)
.map(|num| ReferenceValueLeaf::IpAddr(Ipv6Addr::from_u128(num)))
.map(Into::into),
ValueType::PreTokStr => self
.container
.read_from::<PreTokenizedString>(addr)
.map(Into::into)
.map(ReferenceValueLeaf::PreTokStr)
.map(Into::into),
ValueType::Object => Ok(ReferenceValue::Object(CompactDocObjectIter::new(
self.container,
addr,
)?)),
ValueType::Array => Ok(ReferenceValue::Array(CompactDocArrayIter::new(
self.container,
addr,
)?)),
}
}
}
/// The address in the vec
type Addr = u32;
#[derive(Clone, Copy, Default)]
#[repr(packed)]
/// The value type and the address to its payload in the container.
struct ValueAddr {
type_id: ValueType,
/// This is the address to the value in the vec, except for bool and null, which are inlined
val_addr: Addr,
}
impl BinarySerializable for ValueAddr {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
self.type_id.serialize(writer)?;
VInt(self.val_addr as u64).serialize(writer)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let type_id = ValueType::deserialize(reader)?;
let val_addr = VInt::deserialize(reader)?.0 as u32;
Ok(ValueAddr { type_id, val_addr })
}
}
impl std::fmt::Debug for ValueAddr {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let val_addr = self.val_addr;
f.write_fmt(format_args!("{:?} at {:?}", self.type_id, val_addr))
}
}
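`ValueAddr` packs a one-byte type tag next to a `u32` offset. Without `#[repr(packed)]` the `u32` field's alignment would pad the struct to 8 bytes; packing keeps each stored value at 5 bytes. A standalone sketch of that size difference (the local types are illustrative copies, not the real ones):

```rust
// Illustrative copies of the two layouts; not the real tantivy types.
#[allow(dead_code)]
#[derive(Clone, Copy)]
#[repr(u8)]
enum TypeTag { Null = 0 }

#[allow(dead_code)]
#[derive(Clone, Copy)]
struct Padded { type_id: TypeTag, val_addr: u32 }

#[allow(dead_code)]
#[derive(Clone, Copy)]
#[repr(packed)]
struct Packed { type_id: TypeTag, val_addr: u32 }

fn main() {
    // With the default layout the u8 tag is padded so the u32 stays 4-byte aligned.
    assert_eq!(std::mem::size_of::<Padded>(), 8);
    // #[repr(packed)] drops the padding: 1 byte tag + 4 byte address.
    assert_eq!(std::mem::size_of::<Packed>(), 5);
}
```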
/// An enum representing a value for tantivy to index.
///
/// Any changes need to be reflected in `BinarySerializable` for `ValueType`
///
/// We can't use [schema::Type] or [columnar::ColumnType] here, because they are missing
/// some items like Array and PreTokStr.
#[derive(Default, Clone, Copy, Debug, PartialEq)]
#[repr(u8)]
pub enum ValueType {
/// A null value.
#[default]
Null = 0,
/// The str type is used for any text information.
Str = 1,
/// Unsigned 64-bits Integer `u64`
U64 = 2,
/// Signed 64-bits Integer `i64`
I64 = 3,
/// 64-bits Float `f64`
F64 = 4,
/// Date/time with nanoseconds precision
Date = 5,
/// Facet
Facet = 6,
/// Arbitrarily sized byte array
Bytes = 7,
/// IPv6 address. Internally there is no IPv4; such addresses need to be converted to `Ipv6Addr`.
IpAddr = 8,
/// Bool value
Bool = 9,
/// Pre-tokenized str type.
PreTokStr = 10,
/// Object
Object = 11,
/// Array of values.
Array = 12,
}
impl BinarySerializable for ValueType {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
(*self as u8).serialize(writer)?;
Ok(())
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let num = u8::deserialize(reader)?;
let type_id = if (0..=12).contains(&num) {
unsafe { std::mem::transmute(num) }
} else {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!("Invalid value type id: {num}"),
));
};
Ok(type_id)
}
}
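The `deserialize` above relies on the `0..=12` range check to keep the `transmute` sound; that range has to be updated whenever a variant is added. A safe, equivalent pattern is an exhaustive match on the discriminant, sketched here with a trimmed-down enum:

```rust
// Safe alternative to the range-checked transmute: map the discriminant back
// to the enum with an exhaustive match. Trimmed-down enum for illustration.

#[derive(Debug, PartialEq, Clone, Copy)]
#[repr(u8)]
enum ValueType { Null = 0, Str = 1, U64 = 2 }

fn value_type_from_u8(num: u8) -> std::io::Result<ValueType> {
    match num {
        0 => Ok(ValueType::Null),
        1 => Ok(ValueType::Str),
        2 => Ok(ValueType::U64),
        _ => Err(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            format!("Invalid value type id: {num}"),
        )),
    }
}

fn main() {
    assert_eq!(value_type_from_u8(1).unwrap(), ValueType::Str);
    assert!(value_type_from_u8(42).is_err());
}
```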
impl<'a, V: Value<'a>> From<&ReferenceValue<'a, V>> for ValueType {
fn from(value: &ReferenceValue<'a, V>) -> Self {
match value {
ReferenceValue::Leaf(leaf) => leaf.into(),
ReferenceValue::Array(_) => ValueType::Array,
ReferenceValue::Object(_) => ValueType::Object,
}
}
}
impl<'a> From<&ReferenceValueLeaf<'a>> for ValueType {
fn from(value: &ReferenceValueLeaf<'a>) -> Self {
match value {
ReferenceValueLeaf::Null => ValueType::Null,
ReferenceValueLeaf::Str(_) => ValueType::Str,
ReferenceValueLeaf::U64(_) => ValueType::U64,
ReferenceValueLeaf::I64(_) => ValueType::I64,
ReferenceValueLeaf::F64(_) => ValueType::F64,
ReferenceValueLeaf::Bool(_) => ValueType::Bool,
ReferenceValueLeaf::Date(_) => ValueType::Date,
ReferenceValueLeaf::IpAddr(_) => ValueType::IpAddr,
ReferenceValueLeaf::PreTokStr(_) => ValueType::PreTokStr,
ReferenceValueLeaf::Facet(_) => ValueType::Facet,
ReferenceValueLeaf::Bytes(_) => ValueType::Bytes,
}
}
}
#[derive(Debug, Clone)]
/// Iterator over the key/value entries of an object value in the compact document
pub struct CompactDocObjectIter<'a> {
container: &'a CompactDoc,
node_addresses_slice: &'a [u8],
}
impl<'a> CompactDocObjectIter<'a> {
fn new(container: &'a CompactDoc, addr: Addr) -> io::Result<Self> {
// Objects are `&[ValueAddr]` serialized into bytes
let node_addresses_slice = container.extract_bytes(addr);
Ok(Self {
container,
node_addresses_slice,
})
}
}
impl<'a> Iterator for CompactDocObjectIter<'a> {
type Item = (&'a str, CompactDocValue<'a>);
fn next(&mut self) -> Option<Self::Item> {
if self.node_addresses_slice.is_empty() {
return None;
}
let key_addr = ValueAddr::deserialize(&mut self.node_addresses_slice).ok()?;
let key = self.container.extract_str(key_addr.val_addr);
let value = ValueAddr::deserialize(&mut self.node_addresses_slice).ok()?;
let value = CompactDocValue {
container: self.container,
value_addr: value,
};
Some((key, value))
}
}
#[derive(Debug, Clone)]
/// Iterator over the elements of an array value in the compact document
pub struct CompactDocArrayIter<'a> {
container: &'a CompactDoc,
node_addresses_slice: &'a [u8],
}
impl<'a> CompactDocArrayIter<'a> {
fn new(container: &'a CompactDoc, addr: Addr) -> io::Result<Self> {
// Arrays are &[ValueAddr] serialized into bytes
let node_addresses_slice = container.extract_bytes(addr);
Ok(Self {
container,
node_addresses_slice,
})
}
}
impl<'a> Iterator for CompactDocArrayIter<'a> {
type Item = CompactDocValue<'a>;
fn next(&mut self) -> Option<Self::Item> {
if self.node_addresses_slice.is_empty() {
return None;
}
let value = ValueAddr::deserialize(&mut self.node_addresses_slice).ok()?;
let value = CompactDocValue {
container: self.container,
value_addr: value,
};
Some(value)
}
}
impl Document for CompactDoc {
type Value<'a> = CompactDocValue<'a>;
type FieldsValuesIter<'a> = FieldValueIterRef<'a>;
fn iter_fields_and_values(&self) -> Self::FieldsValuesIter<'_> {
FieldValueIterRef {
slice: self.field_values.iter(),
container: self,
}
}
}
/// A helper wrapper for creating an iterator over the field values
pub struct FieldValueIterRef<'a> {
slice: std::slice::Iter<'a, FieldValueAddr>,
container: &'a CompactDoc,
}
impl<'a> Iterator for FieldValueIterRef<'a> {
type Item = (Field, CompactDocValue<'a>);
fn next(&mut self) -> Option<Self::Item> {
self.slice.next().map(|field_value| {
(
Field::from_field_id(field_value.field as u32),
CompactDocValue::<'a> {
container: self.container,
value_addr: field_value.value_addr,
},
)
})
}
}
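With this change the accessors hand back lightweight `CompactDocValue` borrows rather than `&OwnedValue`, which is why the test hunks that follow convert explicitly with `OwnedValue::from`. A short sketch of the new calling convention, assuming the usual public exports (`Schema`, `TantivyDocument`, `OwnedValue`, `TEXT`):

```rust
use tantivy::schema::{OwnedValue, Schema, TEXT};
use tantivy::TantivyDocument;

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);

    let mut doc = TantivyDocument::default();
    doc.add_text(title, "first");
    doc.add_text(title, "second");

    // get_first/get_all now return CompactDocValue borrows; convert to
    // OwnedValue when an owned, comparable value is needed.
    let first: OwnedValue = doc.get_first(title).unwrap().into();
    assert_eq!(first, OwnedValue::Str("first".to_string()));

    let all: Vec<OwnedValue> = doc.get_all(title).map(OwnedValue::from).collect();
    assert_eq!(all.len(), 2);
}
```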
/// Error that may happen when deserializing
@@ -264,7 +725,40 @@ mod tests {
let text_field = schema_builder.add_text_field("title", TEXT);
let mut doc = TantivyDocument::default();
doc.add_text(text_field, "My title");
assert_eq!(doc.field_values().len(), 1);
assert_eq!(doc.field_values().count(), 1);
let schema = schema_builder.build();
let _val = doc.get_first(text_field).unwrap();
let _json = doc.to_named_doc(&schema);
}
#[test]
fn test_json_value() {
let json_str = r#"{
"toto": "titi",
"float": -0.2,
"bool": true,
"unsigned": 1,
"signed": -2,
"complexobject": {
"field.with.dot": 1
},
"date": "1985-04-12T23:20:50.52Z",
"my_arr": [2, 3, {"my_key": "two tokens"}, 4, {"nested_array": [2, 5, 6, [7, 8, {"a": [{"d": {"e":[99]}}, 9000]}, 9, 10], [5, 5]]}]
}"#;
let json_val: std::collections::BTreeMap<String, OwnedValue> =
serde_json::from_str(json_str).unwrap();
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", TEXT);
let mut doc = TantivyDocument::default();
doc.add_object(json_field, json_val);
let schema = schema_builder.build();
let json = doc.to_json(&schema);
let actual_json: serde_json::Value = serde_json::from_str(&json).unwrap();
let expected_json: serde_json::Value = serde_json::from_str(json_str).unwrap();
assert_eq!(actual_json["json"][0], expected_json);
}
// TODO: Should this be re-added with the serialize method

View File

@@ -5,21 +5,39 @@
//! and don't care about some of the more specialised types or only want to customise
//! part of the document structure.
use std::collections::{btree_map, hash_map, BTreeMap, HashMap};
use std::iter::Empty;
use std::net::Ipv6Addr;
use common::DateTime;
use serde_json::Number;
use time::format_description::well_known::Rfc3339;
use time::OffsetDateTime;
use super::facet::Facet;
use super::ReferenceValueLeaf;
use crate::schema::document::{
ArrayAccess, DeserializeError, Document, DocumentDeserialize, DocumentDeserializer,
ObjectAccess, ReferenceValue, Value, ValueDeserialize, ValueDeserializer, ValueVisitor,
};
use crate::schema::Field;
use crate::tokenizer::PreTokenizedString;
// Serde compatibility support.
pub fn can_be_rfc3339_date_time(text: &str) -> bool {
if let Some(&first_byte) = text.as_bytes().first() {
if first_byte.is_ascii_digit() {
return true;
}
}
false
}
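`can_be_rfc3339_date_time` is just a cheap first-byte filter: RFC 3339 timestamps always start with a digit, so most plain strings skip the expensive parse entirely, and a failed parse still falls back to indexing the value as text. A compilable sketch of that flow, assuming the `time` crate with its `parsing` feature; the helper is re-declared locally so the example stands alone:

```rust
use time::format_description::well_known::Rfc3339;
use time::OffsetDateTime;

// Local copy of the first-byte filter so the sketch is self-contained.
fn can_be_rfc3339_date_time(text: &str) -> bool {
    matches!(text.as_bytes().first(), Some(b) if b.is_ascii_digit())
}

fn main() {
    for text in ["1985-04-12T23:20:50.52Z", "titi", "12 monkeys"] {
        // Only digit-initial strings pay for the RFC 3339 parse; a failed
        // parse falls back to treating the value as plain text.
        let indexed_as = if can_be_rfc3339_date_time(text)
            && OffsetDateTime::parse(text, &Rfc3339).is_ok()
        {
            "Date"
        } else {
            "Str"
        };
        println!("{text:?} -> {indexed_as}");
    }
}
```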
impl<'a> Value<'a> for &'a serde_json::Value {
type ArrayIter = std::slice::Iter<'a, serde_json::Value>;
type ObjectIter = JsonObjectIter<'a>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
match self {
serde_json::Value::Null => ReferenceValueLeaf::Null.into(),
@@ -35,7 +53,19 @@ impl<'a> Value<'a> for &'a serde_json::Value {
panic!("Unsupported serde_json number {number}");
}
}
serde_json::Value::String(val) => ReferenceValueLeaf::Str(val).into(),
serde_json::Value::String(text) => {
if can_be_rfc3339_date_time(text) {
match OffsetDateTime::parse(text, &Rfc3339) {
Ok(dt) => {
let dt_utc = dt.to_offset(time::UtcOffset::UTC);
ReferenceValueLeaf::Date(DateTime::from_utc(dt_utc)).into()
}
Err(_) => ReferenceValueLeaf::Str(text).into(),
}
} else {
ReferenceValueLeaf::Str(text).into()
}
}
serde_json::Value::Array(elements) => ReferenceValue::Array(elements.iter()),
serde_json::Value::Object(object) => {
ReferenceValue::Object(JsonObjectIter(object.iter()))
@@ -44,6 +74,126 @@ impl<'a> Value<'a> for &'a serde_json::Value {
}
}
impl<'a> Value<'a> for &'a String {
type ArrayIter = Empty<&'a String>;
type ObjectIter = Empty<(&'a str, &'a String)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::Str(self))
}
}
impl<'a> Value<'a> for &'a Facet {
type ArrayIter = Empty<&'a Facet>;
type ObjectIter = Empty<(&'a str, &'a Facet)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::Facet(self.encoded_str()))
}
}
impl<'a> Value<'a> for &'a u64 {
type ArrayIter = Empty<&'a u64>;
type ObjectIter = Empty<(&'a str, &'a u64)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::U64(**self))
}
}
impl<'a> Value<'a> for &'a i64 {
type ArrayIter = Empty<&'a i64>;
type ObjectIter = Empty<(&'a str, &'a i64)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::I64(**self))
}
}
impl<'a> Value<'a> for &'a f64 {
type ArrayIter = Empty<&'a f64>;
type ObjectIter = Empty<(&'a str, &'a f64)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::F64(**self))
}
}
impl<'a> Value<'a> for &'a bool {
type ArrayIter = Empty<&'a bool>;
type ObjectIter = Empty<(&'a str, &'a bool)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::Bool(**self))
}
}
impl<'a> Value<'a> for &'a str {
type ArrayIter = Empty<&'a str>;
type ObjectIter = Empty<(&'a str, &'a str)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::Str(self))
}
}
impl<'a> Value<'a> for &'a &'a str {
type ArrayIter = Empty<&'a &'a str>;
type ObjectIter = Empty<(&'a str, &'a &'a str)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::Str(self))
}
}
impl<'a> Value<'a> for &'a [u8] {
type ArrayIter = Empty<&'a [u8]>;
type ObjectIter = Empty<(&'a str, &'a [u8])>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::Bytes(self))
}
}
impl<'a> Value<'a> for &'a &'a [u8] {
type ArrayIter = Empty<&'a &'a [u8]>;
type ObjectIter = Empty<(&'a str, &'a &'a [u8])>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::Bytes(self))
}
}
impl<'a> Value<'a> for &'a Vec<u8> {
type ArrayIter = Empty<&'a Vec<u8>>;
type ObjectIter = Empty<(&'a str, &'a Vec<u8>)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::Bytes(self))
}
}
impl<'a> Value<'a> for &'a DateTime {
type ArrayIter = Empty<&'a DateTime>;
type ObjectIter = Empty<(&'a str, &'a DateTime)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::Date(**self))
}
}
impl<'a> Value<'a> for &'a Ipv6Addr {
type ArrayIter = Empty<&'a Ipv6Addr>;
type ObjectIter = Empty<(&'a str, &'a Ipv6Addr)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::IpAddr(**self))
}
}
impl<'a> Value<'a> for &'a PreTokenizedString {
type ArrayIter = Empty<&'a PreTokenizedString>;
type ObjectIter = Empty<(&'a str, &'a PreTokenizedString)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::PreTokStr(Box::new((*self).clone())))
}
}
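These impls are what let plain Rust references be passed anywhere a `V: Value<'a>` is expected, e.g. `Document::add_field_value`. A minimal sketch of what `as_value` yields for two of them, assuming `Value`, `ReferenceValue` and `ReferenceValueLeaf` are reachable under `tantivy::schema::document` as the re-exports in this diff suggest:

```rust
use tantivy::schema::document::{ReferenceValue, ReferenceValueLeaf, Value};

fn main() {
    // &str implements Value, so as_value() yields a borrowed Str leaf.
    let text: &str = "hello";
    match text.as_value() {
        ReferenceValue::Leaf(ReferenceValueLeaf::Str(s)) => assert_eq!(s, "hello"),
        _ => panic!("expected a Str leaf"),
    }

    // &u64 implements Value as well, yielding a copied U64 leaf.
    let num: u64 = 42;
    match (&num).as_value() {
        ReferenceValue::Leaf(ReferenceValueLeaf::U64(v)) => assert_eq!(v, 42),
        _ => panic!("expected a U64 leaf"),
    }
}
```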
impl ValueDeserialize for serde_json::Value {
fn deserialize<'de, D>(deserializer: D) -> Result<Self, DeserializeError>
where D: ValueDeserializer<'de> {

View File

@@ -172,7 +172,9 @@ pub use self::de::{
ArrayAccess, DeserializeError, DocumentDeserialize, DocumentDeserializer, ObjectAccess,
ValueDeserialize, ValueDeserializer, ValueType, ValueVisitor,
};
pub use self::default_document::{DocParsingError, TantivyDocument};
pub use self::default_document::{
CompactDocArrayIter, CompactDocObjectIter, CompactDocValue, DocParsingError, TantivyDocument,
};
pub use self::owned_value::OwnedValue;
pub(crate) use self::se::BinaryDocumentSerializer;
pub use self::value::{ReferenceValue, ReferenceValueLeaf, Value};
@@ -233,7 +235,7 @@ pub trait Document: Send + Sync + 'static {
let field_name = schema.get_field_name(field);
let values: Vec<OwnedValue> = field_values
.into_iter()
.map(|val| val.as_value().into())
.map(|val| OwnedValue::from(val.as_value()))
.collect();
field_map.insert(field_name.to_string(), values);
}

View File

@@ -8,6 +8,7 @@ use serde::de::{MapAccess, SeqAccess};
use time::format_description::well_known::Rfc3339;
use time::OffsetDateTime;
use super::existing_type_impls::can_be_rfc3339_date_time;
use super::ReferenceValueLeaf;
use crate::schema::document::{
ArrayAccess, DeserializeError, ObjectAccess, ReferenceValue, Value, ValueDeserialize,
@@ -375,16 +376,6 @@ impl From<BTreeMap<String, OwnedValue>> for OwnedValue {
}
}
fn can_be_rfc3339_date_time(text: &str) -> bool {
if let Some(&first_byte) = text.as_bytes().first() {
if first_byte.is_ascii_digit() {
return true;
}
}
false
}
impl From<serde_json::Value> for OwnedValue {
fn from(value: serde_json::Value) -> Self {
match value {
@@ -472,6 +463,7 @@ mod tests {
let mut doc = TantivyDocument::default();
doc.add_bytes(bytes_field, "".as_bytes());
let json_string = doc.to_json(&schema);
assert_eq!(json_string, r#"{"my_bytes":[""]}"#);
}

View File

@@ -25,6 +25,7 @@ where W: Write
/// Attempts to serialize a given document and write the output
/// to the writer.
#[inline]
pub(crate) fn serialize_doc<D>(&mut self, doc: &D) -> io::Result<()>
where D: Document {
let stored_field_values = || {
@@ -57,9 +58,8 @@ where W: Write
return Err(io::Error::new(
io::ErrorKind::Other,
format!(
"Unexpected number of entries written to serializer, expected {} entries, got \
{} entries",
num_field_values, actual_length,
"Unexpected number of entries written to serializer, expected \
{num_field_values} entries, got {actual_length} entries",
),
));
}
@@ -679,6 +679,7 @@ mod tests {
);
}
#[inline]
fn serialize_doc<D: Document>(doc: &D, schema: &Schema) -> Vec<u8> {
let mut writer = Vec::new();

View File

@@ -17,15 +17,6 @@ pub trait Value<'a>: Send + Sync + Debug {
/// Returns the field value represented by an enum which borrows it's data.
fn as_value(&self) -> ReferenceValue<'a, Self>;
#[inline]
/// Returns if the value is `null` or not.
fn is_null(&self) -> bool {
matches!(
self.as_value(),
ReferenceValue::Leaf(ReferenceValueLeaf::Null)
)
}
#[inline]
/// If the Value is a leaf, returns the associated leaf. Returns None otherwise.
fn as_leaf(&self) -> Option<ReferenceValueLeaf<'a>> {
@@ -117,18 +108,6 @@ pub trait Value<'a>: Send + Sync + Debug {
None
}
}
#[inline]
/// Returns true if the Value is an array.
fn is_array(&self) -> bool {
matches!(self.as_value(), ReferenceValue::Object(_))
}
#[inline]
/// Returns true if the Value is an object.
fn is_object(&self) -> bool {
matches!(self.as_value(), ReferenceValue::Object(_))
}
}
/// An enum representing a leaf value for tantivy to index.
@@ -159,6 +138,69 @@ pub enum ReferenceValueLeaf<'a> {
PreTokStr(Box<PreTokenizedString>),
}
impl From<u64> for ReferenceValueLeaf<'_> {
#[inline]
fn from(value: u64) -> Self {
ReferenceValueLeaf::U64(value)
}
}
impl From<i64> for ReferenceValueLeaf<'_> {
#[inline]
fn from(value: i64) -> Self {
ReferenceValueLeaf::I64(value)
}
}
impl From<f64> for ReferenceValueLeaf<'_> {
#[inline]
fn from(value: f64) -> Self {
ReferenceValueLeaf::F64(value)
}
}
impl From<bool> for ReferenceValueLeaf<'_> {
#[inline]
fn from(value: bool) -> Self {
ReferenceValueLeaf::Bool(value)
}
}
impl<'a> From<&'a str> for ReferenceValueLeaf<'a> {
#[inline]
fn from(value: &'a str) -> Self {
ReferenceValueLeaf::Str(value)
}
}
impl<'a> From<&'a [u8]> for ReferenceValueLeaf<'a> {
#[inline]
fn from(value: &'a [u8]) -> Self {
ReferenceValueLeaf::Bytes(value)
}
}
impl From<DateTime> for ReferenceValueLeaf<'_> {
#[inline]
fn from(value: DateTime) -> Self {
ReferenceValueLeaf::Date(value)
}
}
impl From<Ipv6Addr> for ReferenceValueLeaf<'_> {
#[inline]
fn from(value: Ipv6Addr) -> Self {
ReferenceValueLeaf::IpAddr(value)
}
}
impl From<PreTokenizedString> for ReferenceValueLeaf<'_> {
#[inline]
fn from(val: PreTokenizedString) -> Self {
ReferenceValueLeaf::PreTokStr(Box::new(val))
}
}
impl<'a, T: Value<'a> + ?Sized> From<ReferenceValueLeaf<'a>> for ReferenceValue<'a, T> {
#[inline]
fn from(value: ReferenceValueLeaf<'a>) -> Self {

View File

@@ -568,21 +568,21 @@ mod tests {
let schema = schema_builder.build();
let doc = TantivyDocument::parse_json(&schema, r#"{"id": 100}"#).unwrap();
assert_eq!(
&OwnedValue::Str("100".to_string()),
doc.get_first(text_field).unwrap()
OwnedValue::Str("100".to_string()),
doc.get_first(text_field).unwrap().into()
);
let doc = TantivyDocument::parse_json(&schema, r#"{"id": true}"#).unwrap();
assert_eq!(
&OwnedValue::Str("true".to_string()),
doc.get_first(text_field).unwrap()
OwnedValue::Str("true".to_string()),
doc.get_first(text_field).unwrap().into()
);
// Not sure if this null coercion is the best approach
let doc = TantivyDocument::parse_json(&schema, r#"{"id": null}"#).unwrap();
assert_eq!(
&OwnedValue::Str("null".to_string()),
doc.get_first(text_field).unwrap()
OwnedValue::Str("null".to_string()),
doc.get_first(text_field).unwrap().into()
);
}
@@ -595,9 +595,18 @@ mod tests {
let schema = schema_builder.build();
let doc_json = r#"{"i64": "100", "u64": "100", "f64": "100"}"#;
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
assert_eq!(&OwnedValue::I64(100), doc.get_first(i64_field).unwrap());
assert_eq!(&OwnedValue::U64(100), doc.get_first(u64_field).unwrap());
assert_eq!(&OwnedValue::F64(100.0), doc.get_first(f64_field).unwrap());
assert_eq!(
OwnedValue::I64(100),
doc.get_first(i64_field).unwrap().into()
);
assert_eq!(
OwnedValue::U64(100),
doc.get_first(u64_field).unwrap().into()
);
assert_eq!(
OwnedValue::F64(100.0),
doc.get_first(f64_field).unwrap().into()
);
}
#[test]
@@ -607,11 +616,17 @@ mod tests {
let schema = schema_builder.build();
let doc_json = r#"{"bool": "true"}"#;
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
assert_eq!(&OwnedValue::Bool(true), doc.get_first(bool_field).unwrap());
assert_eq!(
OwnedValue::Bool(true),
doc.get_first(bool_field).unwrap().into()
);
let doc_json = r#"{"bool": "false"}"#;
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
assert_eq!(&OwnedValue::Bool(false), doc.get_first(bool_field).unwrap());
assert_eq!(
OwnedValue::Bool(false),
doc.get_first(bool_field).unwrap().into()
);
}
#[test]
@@ -644,7 +659,7 @@ mod tests {
let schema = schema_builder.build();
let doc_json = r#"{"date": "2019-10-12T07:20:50.52+02:00"}"#;
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
let date = doc.get_first(date_field).unwrap();
let date = OwnedValue::from(doc.get_first(date_field).unwrap());
// Time zone is converted to UTC
assert_eq!("Date(2019-10-12T05:20:50.52Z)", format!("{date:?}"));
}

View File

@@ -1,46 +0,0 @@
use crate::schema::{Field, OwnedValue};
/// `FieldValue` holds together a `Field` and its `Value`.
#[allow(missing_docs)]
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct FieldValue {
pub field: Field,
pub value: OwnedValue,
}
impl FieldValue {
/// Constructor
pub fn new(field: Field, value: OwnedValue) -> FieldValue {
FieldValue { field, value }
}
/// Field accessor
pub fn field(&self) -> Field {
self.field
}
/// Value accessor
pub fn value(&self) -> &OwnedValue {
&self.value
}
}
impl From<FieldValue> for OwnedValue {
fn from(field_value: FieldValue) -> Self {
field_value.value
}
}
/// A helper wrapper for creating standard iterators
/// out of the fields iterator trait.
pub struct FieldValueIter<'a>(pub(crate) std::slice::Iter<'a, FieldValue>);
impl<'a> Iterator for FieldValueIter<'a> {
type Item = (Field, &'a OwnedValue);
fn next(&mut self) -> Option<Self::Item> {
self.0
.next()
.map(|field_value| (field_value.field, &field_value.value))
}
}

View File

@@ -114,7 +114,6 @@ pub(crate) mod term;
mod field_entry;
mod field_type;
mod field_value;
mod bytes_options;
mod date_time_options;
@@ -138,7 +137,6 @@ pub use self::facet_options::FacetOptions;
pub use self::field::Field;
pub use self::field_entry::FieldEntry;
pub use self::field_type::{FieldType, Type};
pub use self::field_value::FieldValue;
pub use self::flags::{COERCE, FAST, INDEXED, STORED};
pub use self::index_record_option::IndexRecordOption;
pub use self::ip_options::{IntoIpv6Addr, IpAddrOptions};

View File

@@ -645,15 +645,15 @@ mod tests {
let doc =
TantivyDocument::convert_named_doc(&schema, NamedFieldDocument(named_doc_map)).unwrap();
assert_eq!(
doc.get_all(title).collect::<Vec<_>>(),
doc.get_all(title).map(OwnedValue::from).collect::<Vec<_>>(),
vec![
&OwnedValue::from("title1".to_string()),
&OwnedValue::from("title2".to_string())
OwnedValue::from("title1".to_string()),
OwnedValue::from("title2".to_string())
]
);
assert_eq!(
doc.get_all(val).collect::<Vec<_>>(),
vec![&OwnedValue::from(14u64), &OwnedValue::from(-1i64)]
doc.get_all(val).map(OwnedValue::from).collect::<Vec<_>>(),
vec![OwnedValue::from(14u64), OwnedValue::from(-1i64)]
);
}
@@ -682,7 +682,7 @@ mod tests {
let schema = schema_builder.build();
{
let doc = TantivyDocument::parse_json(&schema, "{}").unwrap();
assert!(doc.field_values().is_empty());
assert!(doc.field_values().next().is_none());
}
{
let doc = TantivyDocument::parse_json(

View File

@@ -59,9 +59,8 @@ pub mod tests {
use super::*;
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::fastfield::AliveBitSet;
use crate::schema::document::Value;
use crate::schema::{
self, Schema, TantivyDocument, TextFieldIndexing, TextOptions, STORED, TEXT,
self, Schema, TantivyDocument, TextFieldIndexing, TextOptions, Value, STORED, TEXT,
};
use crate::{Index, IndexWriter, Term};
@@ -92,8 +91,8 @@ pub mod tests {
StoreWriter::new(writer, compressor, blocksize, separate_thread).unwrap();
for i in 0..num_docs {
let mut doc = TantivyDocument::default();
doc.add_field_value(field_body, LOREM.to_string());
doc.add_field_value(field_title, format!("Doc {i}"));
doc.add_text(field_body, LOREM);
doc.add_text(field_title, format!("Doc {i}"));
store_writer.store(&doc, &schema).unwrap();
}
store_writer.close().unwrap();
@@ -119,10 +118,11 @@ pub mod tests {
let store = StoreReader::open(store_file, 10)?;
for i in 0..NUM_DOCS as u32 {
assert_eq!(
*store
store
.get::<TantivyDocument>(i)?
.get_first(field_title)
.unwrap()
.as_value()
.as_str()
.unwrap(),
format!("Doc {i}")
@@ -131,7 +131,13 @@ pub mod tests {
for doc in store.iter::<TantivyDocument>(Some(&alive_bitset)) {
let doc = doc?;
let title_content = doc.get_first(field_title).unwrap().as_str().unwrap();
let title_content = doc
.get_first(field_title)
.unwrap()
.as_value()
.as_str()
.unwrap()
.to_string();
if !title_content.starts_with("Doc ") {
panic!("unexpected title_content {title_content}");
}

View File

@@ -403,8 +403,7 @@ mod tests {
use super::*;
use crate::directory::RamDirectory;
use crate::schema::document::Value;
use crate::schema::{Field, TantivyDocument};
use crate::schema::{Field, TantivyDocument, Value};
use crate::store::tests::write_lorem_ipsum_store;
use crate::store::Compressor;
use crate::Directory;
@@ -412,7 +411,7 @@ mod tests {
const BLOCK_SIZE: usize = 16_384;
fn get_text_field<'a>(doc: &'a TantivyDocument, field: &'a Field) -> Option<&'a str> {
doc.get_first(*field).and_then(|f| f.as_str())
doc.get_first(*field).and_then(|f| f.as_value().as_str())
}
#[test]

View File

@@ -93,7 +93,7 @@ fn open_fst_index(fst_file: FileSlice) -> io::Result<tantivy_fst::Map<OwnedBytes
let fst = Fst::new(bytes).map_err(|err| {
io::Error::new(
io::ErrorKind::InvalidData,
format!("Fst data is corrupted: {:?}", err),
format!("Fst data is corrupted: {err:?}"),
)
})?;
Ok(tantivy_fst::Map::from(fst))

View File

@@ -95,7 +95,7 @@ fn test_term_dictionary_simple() -> crate::Result<()> {
#[test]
fn test_term_dictionary_stream() -> crate::Result<()> {
let ids: Vec<_> = (0u32..10_000u32)
.map(|i| (format!("doc{:0>6}", i), i))
.map(|i| (format!("doc{i:0>6}"), i))
.collect();
let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
@@ -156,7 +156,7 @@ fn test_stream_high_range_prefix_suffix() -> crate::Result<()> {
#[test]
fn test_stream_range() -> crate::Result<()> {
let ids: Vec<_> = (0u32..10_000u32)
.map(|i| (format!("doc{:0>6}", i), i))
.map(|i| (format!("doc{i:0>6}"), i))
.collect();
let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();

View File

@@ -96,7 +96,7 @@ mod tests {
{
let mut add_token = |token: &Token| {
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap();
tokens.push(format!("{}", facet));
tokens.push(format!("{facet}"));
};
FacetTokenizer::default()
.token_stream(facet.encoded_str())
@@ -116,7 +116,7 @@ mod tests {
{
let mut add_token = |token: &Token| {
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); // ok test
tokens.push(format!("{}", facet));
tokens.push(format!("{facet}"));
};
FacetTokenizer::default()
.token_stream(facet.encoded_str()) // ok test