skip estimate phase for merge multivalue index

precompute stats for merge multivalue index + disable Line encoding for multivalue index. That combination allows to skip the first estimation pass. This gives up to 2x on merge performance on multivalue indices. This change may decrease compression as Line is very good compressible for documents, which have a fixed amount of values in each doc. The line codec should be replaced. ``` merge_multi_and_multi Avg: 22.7880ms (-47.15%) Median: 22.5469ms (-47.38%) [22.3691ms .. 25.8392ms] merge_dense_and_dense Avg: 14.4398ms (+2.18%) Median: 14.2465ms (+0.74%) [14.1620ms .. 16.1270ms] merge_sparse_and_sparse Avg: 10.6559ms (+1.10%) Median: 10.6318ms (+0.91%) [10.5527ms .. 11.2848ms] merge_sparse_and_dense Avg: 12.4886ms (+1.52%) Median: 12.4044ms (+0.84%) [12.3261ms .. 13.9439ms] merge_multi_and_dense Avg: 25.6686ms (-45.56%) Median: 25.4851ms (-45.84%) [25.1618ms .. 27.6226ms] merge_multi_and_sparse Avg: 24.3278ms (-47.00%) Median: 24.1917ms (-47.34%) [23.7159ms .. 27.0513ms] ```
fix compiler warning, cleanup (#2393 )
2026-02-25 09:10:37 +00:00 · 2024-06-11 20:22:00 +08:00 · 2024-06-11 16:03:50 +08:00 · 2024-06-11 16:02:57 +08:00
24 changed files with 470 additions and 345 deletions
--- a/columnar/benches/bench_merge.rs
+++ b/columnar/benches/bench_merge.rs
@@ -22,7 +22,7 @@ impl Display for Card {
    }
 }

-const NUM_DOCS: u32 = 100_000;
+const NUM_DOCS: u32 = 1_000_000;

 fn generate_columnar(card: Card, num_docs: u32) -> ColumnarReader {
    use tantivy_columnar::ColumnarWriter;
@@ -88,12 +88,8 @@ fn main() {
                let columnar_readers = columnar_readers.iter().collect::<Vec<_>>();
                let merge_row_order = StackMergeOrder::stack(&columnar_readers[..]);

-                let _ = black_box(merge_columnar(
-                    &columnar_readers,
-                    &[],
-                    merge_row_order.into(),
-                    &mut out,
-                ));
+                merge_columnar(&columnar_readers, &[], merge_row_order.into(), &mut out).unwrap();
+                black_box(out);
            },
        );
    }
--- a/columnar/src/column_index/merge/mod.rs
+++ b/columnar/src/column_index/merge/mod.rs
@@ -73,14 +73,18 @@ fn detect_cardinality(
 pub fn merge_column_index<'a>(
    columns: &'a [ColumnIndex],
    merge_row_order: &'a MergeRowOrder,
+    num_values: u32,
 ) -> SerializableColumnIndex<'a> {
    // For simplification, we do not try to detect whether the cardinality could be
    // downgraded thanks to deletes.
    let cardinality_after_merge = detect_cardinality(columns, merge_row_order);
    match merge_row_order {
-        MergeRowOrder::Stack(stack_merge_order) => {
-            merge_column_index_stacked(columns, cardinality_after_merge, stack_merge_order)
-        }
+        MergeRowOrder::Stack(stack_merge_order) => merge_column_index_stacked(
+            columns,
+            cardinality_after_merge,
+            stack_merge_order,
+            num_values,
+        ),
        MergeRowOrder::Shuffled(complex_merge_order) => {
            merge_column_index_shuffled(columns, cardinality_after_merge, complex_merge_order)
        }
@@ -167,8 +171,12 @@ mod tests {
            ],
        )
        .into();
-        let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
-        let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
+        let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order, 3);
+        let SerializableColumnIndex::Multivalued {
+            indices: start_index_iterable,
+            ..
+        } = merged_column_index
+        else {
            panic!("Excpected a multivalued index")
        };
        let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
@@ -200,8 +208,12 @@ mod tests {
            ],
        )
        .into();
-        let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
-        let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
+        let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order, 6);
+        let SerializableColumnIndex::Multivalued {
+            indices: start_index_iterable,
+            ..
+        } = merged_column_index
+        else {
            panic!("Excpected a multivalued index")
        };
        let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
--- a/columnar/src/column_index/merge/shuffled.rs
+++ b/columnar/src/column_index/merge/shuffled.rs
@@ -22,7 +22,10 @@ pub fn merge_column_index_shuffled<'a>(
        Cardinality::Multivalued => {
            let multivalue_start_index =
                merge_column_index_shuffled_multivalued(column_indexes, shuffle_merge_order);
-            SerializableColumnIndex::Multivalued(multivalue_start_index)
+            SerializableColumnIndex::Multivalued {
+                indices: multivalue_start_index,
+                stats: None,
+            }
        }
    }
 }
--- a/columnar/src/column_index/merge/stacked.rs
+++ b/columnar/src/column_index/merge/stacked.rs
@@ -1,6 +1,8 @@
 use std::iter;
+use std::num::NonZeroU64;

 use crate::column_index::{SerializableColumnIndex, Set};
+use crate::column_values::ColumnStats;
 use crate::iterable::Iterable;
 use crate::{Cardinality, ColumnIndex, RowId, StackMergeOrder};

@@ -12,6 +14,7 @@ pub fn merge_column_index_stacked<'a>(
    columns: &'a [ColumnIndex],
    cardinality_after_merge: Cardinality,
    stack_merge_order: &'a StackMergeOrder,
+    num_values: u32,
 ) -> SerializableColumnIndex<'a> {
    match cardinality_after_merge {
        Cardinality::Full => SerializableColumnIndex::Full,
@@ -27,7 +30,17 @@ pub fn merge_column_index_stacked<'a>(
                columns,
                stack_merge_order,
            };
-            SerializableColumnIndex::Multivalued(Box::new(stacked_multivalued_index))
+            SerializableColumnIndex::Multivalued {
+                indices: Box::new(stacked_multivalued_index),
+                stats: Some(ColumnStats {
+                    gcd: NonZeroU64::new(1).unwrap(),
+                    // The values in the multivalue index are the positions of the values
+                    min_value: 0,
+                    max_value: num_values as u64,
+                    // This is num docs, but it starts at 0 so we need +1
+                    num_rows: stack_merge_order.num_rows() + 1,
+                }),
+            }
        }
    }
 }
--- a/columnar/src/column_index/multivalued_index.rs
+++ b/columnar/src/column_index/multivalued_index.rs
@@ -6,20 +6,29 @@ use std::sync::Arc;
 use common::OwnedBytes;

 use crate::column_values::{
-    load_u64_based_column_values, serialize_u64_based_column_values, CodecType, ColumnValues,
+    load_u64_based_column_values, serialize_u64_based_column_values,
+    serialize_u64_with_codec_and_stats, CodecType, ColumnStats, ColumnValues,
 };
 use crate::iterable::Iterable;
 use crate::{DocId, RowId};

 pub fn serialize_multivalued_index(
    multivalued_index: &dyn Iterable<RowId>,
+    stats: Option<ColumnStats>,
    output: &mut impl Write,
 ) -> io::Result<()> {
-    serialize_u64_based_column_values(
-        multivalued_index,
-        &[CodecType::Bitpacked, CodecType::Linear],
-        output,
-    )?;
+    if let Some(stats) = stats {
+        // TODO: Add something with higher compression that doesn't require a full scan upfront
+        let estimator = CodecType::Bitpacked.estimator();
+        assert!(!estimator.requires_full_scan());
+        serialize_u64_with_codec_and_stats(multivalued_index, estimator, stats, output)?;
+    } else {
+        serialize_u64_based_column_values(
+            multivalued_index,
+            &[CodecType::Bitpacked, CodecType::Linear],
+            output,
+        )?;
+    }
    Ok(())
 }

@@ -52,7 +61,7 @@ impl From<Arc<dyn ColumnValues<RowId>>> for MultiValueIndex {
 impl MultiValueIndex {
    pub fn for_test(start_offsets: &[RowId]) -> MultiValueIndex {
        let mut buffer = Vec::new();
-        serialize_multivalued_index(&start_offsets, &mut buffer).unwrap();
+        serialize_multivalued_index(&start_offsets, None, &mut buffer).unwrap();
        let bytes = OwnedBytes::new(buffer);
        open_multivalued_index(bytes).unwrap()
    }
--- a/columnar/src/column_index/serialize.rs
+++ b/columnar/src/column_index/serialize.rs
@@ -6,6 +6,7 @@ use common::{CountingWriter, OwnedBytes};
 use crate::column_index::multivalued_index::serialize_multivalued_index;
 use crate::column_index::optional_index::serialize_optional_index;
 use crate::column_index::ColumnIndex;
+use crate::column_values::ColumnStats;
 use crate::iterable::Iterable;
 use crate::{Cardinality, RowId};

@@ -15,9 +16,12 @@ pub enum SerializableColumnIndex<'a> {
        non_null_row_ids: Box<dyn Iterable<RowId> + 'a>,
        num_rows: RowId,
    },
-    // TODO remove the Arc<dyn> apart from serialization this is not
-    // dynamic at all.
-    Multivalued(Box<dyn Iterable<RowId> + 'a>),
+    Multivalued {
+        /// Iterator emitting the indices for the index
+        indices: Box<dyn Iterable<RowId> + 'a>,
+        /// In the merge case we can precompute the column stats
+        stats: Option<ColumnStats>,
+    },
 }

 impl<'a> SerializableColumnIndex<'a> {
@@ -25,7 +29,7 @@ impl<'a> SerializableColumnIndex<'a> {
        match self {
            SerializableColumnIndex::Full => Cardinality::Full,
            SerializableColumnIndex::Optional { .. } => Cardinality::Optional,
-            SerializableColumnIndex::Multivalued(_) => Cardinality::Multivalued,
+            SerializableColumnIndex::Multivalued { .. } => Cardinality::Multivalued,
        }
    }
 }
@@ -44,9 +48,10 @@ pub fn serialize_column_index(
            non_null_row_ids,
            num_rows,
        } => serialize_optional_index(non_null_row_ids.as_ref(), num_rows, &mut output)?,
-        SerializableColumnIndex::Multivalued(multivalued_index) => {
-            serialize_multivalued_index(&*multivalued_index, &mut output)?
-        }
+        SerializableColumnIndex::Multivalued {
+            indices: multivalued_index,
+            stats,
+        } => serialize_multivalued_index(&*multivalued_index, stats, &mut output)?,
    }
    let column_index_num_bytes = output.written_bytes() as u32;
    Ok(column_index_num_bytes)
--- a/columnar/src/column_values/mod.rs
+++ b/columnar/src/column_values/mod.rs
@@ -32,7 +32,8 @@ pub use u128_based::{
 };
 pub use u64_based::{
    load_u64_based_column_values, serialize_and_load_u64_based_column_values,
-    serialize_u64_based_column_values, CodecType, ALL_U64_CODEC_TYPES,
+    serialize_u64_based_column_values, serialize_u64_with_codec_and_stats, CodecType,
+    ALL_U64_CODEC_TYPES,
 };
 pub use vec_column::VecColumn;

--- a/columnar/src/column_values/u64_based/bitpacked.rs
+++ b/columnar/src/column_values/u64_based/bitpacked.rs
@@ -128,6 +128,9 @@ impl ColumnCodecEstimator for BitpackedCodecEstimator {
        bit_packer.close(wrt)?;
        Ok(())
    }
+    fn codec_type(&self) -> super::CodecType {
+        super::CodecType::Bitpacked
+    }
 }

 pub struct BitpackedCodec;
--- a/columnar/src/column_values/u64_based/blockwise_linear.rs
+++ b/columnar/src/column_values/u64_based/blockwise_linear.rs
@@ -163,6 +163,10 @@ impl ColumnCodecEstimator for BlockwiseLinearEstimator {

        Ok(())
    }
+
+    fn codec_type(&self) -> super::CodecType {
+        super::CodecType::BlockwiseLinear
+    }
 }

 pub struct BlockwiseLinearCodec;
--- a/columnar/src/column_values/u64_based/linear.rs
+++ b/columnar/src/column_values/u64_based/linear.rs
@@ -153,6 +153,12 @@ impl ColumnCodecEstimator for LinearCodecEstimator {
            self.collect_before_line_estimation(value);
        }
    }
+    fn requires_full_scan(&self) -> bool {
+        true
+    }
+    fn codec_type(&self) -> super::CodecType {
+        super::CodecType::Linear
+    }
 }

 impl LinearCodecEstimator {
--- a/columnar/src/column_values/u64_based/mod.rs
+++ b/columnar/src/column_values/u64_based/mod.rs
@@ -37,7 +37,11 @@ pub trait ColumnCodecEstimator<T = u64>: 'static {
    /// This method will be called for each element of the column during
    /// `estimation`.
    fn collect(&mut self, value: u64);
-    /// Finalizes the first pass phase.
+    /// Returns true if the estimator needs a full pass over the column before serialization
+    fn requires_full_scan(&self) -> bool {
+        false
+    }
+    fn codec_type(&self) -> CodecType;
    fn finalize(&mut self) {}
    /// Returns an accurate estimation of the number of bytes that will
    /// be used to represent this column.
@@ -150,34 +154,45 @@ pub fn serialize_u64_based_column_values<T: MonotonicallyMappableToU64>(
    wrt: &mut dyn Write,
 ) -> io::Result<()> {
    let mut stats_collector = StatsCollector::default();
-    let mut estimators: Vec<(CodecType, Box<dyn ColumnCodecEstimator>)> =
-        Vec::with_capacity(codec_types.len());
+    let mut estimators: Vec<Box<dyn ColumnCodecEstimator>> = Vec::with_capacity(codec_types.len());
    for &codec_type in codec_types {
-        estimators.push((codec_type, codec_type.estimator()));
+        estimators.push(codec_type.estimator());
    }
    for val in vals.boxed_iter() {
        let val_u64 = val.to_u64();
        stats_collector.collect(val_u64);
-        for (_, estimator) in &mut estimators {
+        for estimator in &mut estimators {
            estimator.collect(val_u64);
        }
    }
-    for (_, estimator) in &mut estimators {
+    for estimator in &mut estimators {
        estimator.finalize();
    }
    let stats = stats_collector.stats();
-    let (_, best_codec, best_codec_estimator) = estimators
+    let (_, best_codec) = estimators
        .into_iter()
-        .flat_map(|(codec_type, estimator)| {
+        .flat_map(|estimator| {
            let num_bytes = estimator.estimate(&stats)?;
-            Some((num_bytes, codec_type, estimator))
+            Some((num_bytes, estimator))
        })
-        .min_by_key(|(num_bytes, _, _)| *num_bytes)
+        .min_by_key(|(num_bytes, _)| *num_bytes)
        .ok_or_else(|| {
            io::Error::new(io::ErrorKind::InvalidData, "No available applicable codec.")
        })?;
-    best_codec.to_code().serialize(wrt)?;
-    best_codec_estimator.serialize(
+    serialize_u64_with_codec_and_stats(vals, best_codec, stats, wrt)?;
+    Ok(())
+}
+
+/// Serializes a given column of u64-mapped values.
+/// The codec estimator needs to be collected fully for the Line codec before calling this.
+pub fn serialize_u64_with_codec_and_stats<T: MonotonicallyMappableToU64>(
+    vals: &dyn Iterable<T>,
+    codec: Box<dyn ColumnCodecEstimator>,
+    stats: ColumnStats,
+    wrt: &mut dyn Write,
+) -> io::Result<()> {
+    codec.codec_type().to_code().serialize(wrt)?;
+    codec.serialize(
        &stats,
        &mut vals.boxed_iter().map(MonotonicallyMappableToU64::to_u64),
        wrt,
--- a/columnar/src/columnar/merge/mod.rs
+++ b/columnar/src/columnar/merge/mod.rs
@@ -3,7 +3,7 @@ mod merge_mapping;
 mod term_merger;

 use std::collections::{BTreeMap, HashSet};
-use std::io;
+use std::io::{self};
 use std::net::Ipv6Addr;
 use std::sync::Arc;

@@ -156,8 +156,15 @@ fn merge_column(
                    column_values.push(None);
                }
            }
-            let merged_column_index =
-                crate::column_index::merge_column_index(&column_indexes[..], merge_row_order);
+            let num_values: u32 = column_values
+                .iter()
+                .map(|vals| vals.as_ref().map(|idx| idx.num_vals()).unwrap_or(0))
+                .sum();
+            let merged_column_index = crate::column_index::merge_column_index(
+                &column_indexes[..],
+                merge_row_order,
+                num_values,
+            );
            let merge_column_values = MergedColumnValues {
                column_indexes: &column_indexes[..],
                column_values: &column_values[..],
@@ -183,8 +190,15 @@ fn merge_column(
                }
            }

-            let merged_column_index =
-                crate::column_index::merge_column_index(&column_indexes[..], merge_row_order);
+            let num_values: u32 = column_values
+                .iter()
+                .map(|vals| vals.as_ref().map(|idx| idx.num_vals()).unwrap_or(0))
+                .sum();
+            let merged_column_index = crate::column_index::merge_column_index(
+                &column_indexes[..],
+                merge_row_order,
+                num_values,
+            );
            let merge_column_values = MergedColumnValues {
                column_indexes: &column_indexes[..],
                column_values: &column_values,
@@ -214,8 +228,19 @@ fn merge_column(
                    }
                }
            }
-            let merged_column_index =
-                crate::column_index::merge_column_index(&column_indexes[..], merge_row_order);
+            let num_values: u32 = bytes_columns
+                .iter()
+                .map(|vals| {
+                    vals.as_ref()
+                        .map(|idx| idx.term_ord_column.values.num_vals())
+                        .unwrap_or(0)
+                })
+                .sum();
+            let merged_column_index = crate::column_index::merge_column_index(
+                &column_indexes[..],
+                merge_row_order,
+                num_values,
+            );
            merge_bytes_or_str_column(merged_column_index, &bytes_columns, merge_row_order, wrt)?;
        }
    }
--- a/columnar/src/columnar/writer/mod.rs
+++ b/columnar/src/columnar/writer/mod.rs
@@ -644,7 +644,10 @@ fn send_to_serialize_column_mappable_to_u128<
            let multivalued_index_builder = value_index_builders.borrow_multivalued_index_builder();
            consume_operation_iterator(op_iterator, multivalued_index_builder, values);
            let multivalued_index = multivalued_index_builder.finish(num_rows);
-            SerializableColumnIndex::Multivalued(Box::new(multivalued_index))
+            SerializableColumnIndex::Multivalued {
+                indices: Box::new(multivalued_index),
+                stats: Default::default(), // TODO: implement stats for u128
+            }
        }
    };
    crate::column::serialize_column_mappable_to_u128(
@@ -699,7 +702,10 @@ fn send_to_serialize_column_mappable_to_u64(
            if sort_values_within_row {
                sort_values_within_row_in_place(multivalued_index, values);
            }
-            SerializableColumnIndex::Multivalued(Box::new(multivalued_index))
+            SerializableColumnIndex::Multivalued {
+                indices: Box::new(multivalued_index),
+                stats: None,
+            }
        }
    };
    crate::column::serialize_column_mappable_to_u64(
--- a/columnar/src/tests.rs
+++ b/columnar/src/tests.rs
@@ -738,35 +738,22 @@ proptest! {
    #![proptest_config(ProptestConfig::with_cases(1000))]
    #[test]
    fn test_columnar_merge_proptest(columnar_docs in proptest::collection::vec(columnar_docs_strategy(), 2..=3)) {
-        let columnar_readers: Vec<ColumnarReader> = columnar_docs.iter()
-            .map(|docs| build_columnar(&docs[..]))
-            .collect::<Vec<_>>();
-        let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
-        let mut output: Vec<u8> = Vec::new();
-        let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]).into();
-        crate::merge_columnar(&columnar_readers_arr[..], &[], stack_merge_order, &mut output).unwrap();
-        let merged_columnar = ColumnarReader::open(output).unwrap();
-        let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> = columnar_docs.iter().flatten().cloned().collect();
-        let expected_merged_columnar = build_columnar(&concat_rows[..]);
-        assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
+        test_columnar_docs(columnar_docs);
    }
 }

-#[test]
-fn test_columnar_merging_empty_columnar() {
-    let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> =
-        vec![vec![], vec![vec![("c1", ColumnValue::Str("a"))]]];
+fn test_columnar_docs(columnar_docs: Vec<Vec<Vec<(&'static str, ColumnValue)>>>) {
    let columnar_readers: Vec<ColumnarReader> = columnar_docs
        .iter()
        .map(|docs| build_columnar(&docs[..]))
        .collect::<Vec<_>>();
    let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
    let mut output: Vec<u8> = Vec::new();
-    let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]);
+    let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]).into();
    crate::merge_columnar(
        &columnar_readers_arr[..],
        &[],
-        crate::MergeRowOrder::Stack(stack_merge_order),
+        stack_merge_order,
        &mut output,
    )
    .unwrap();
@@ -777,6 +764,24 @@ fn test_columnar_merging_empty_columnar() {
    assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
 }

+#[test]
+fn test_columnar_merging_empty_columnar() {
+    let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> =
+        vec![vec![], vec![vec![("c1", ColumnValue::Str("a"))]]];
+    test_columnar_docs(columnar_docs);
+}
+#[test]
+fn test_columnar_merging_simple() {
+    let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> = vec![
+        vec![],
+        vec![vec![
+            ("c1", ColumnValue::Numerical(0u64.into())),
+            ("c1", ColumnValue::Numerical(0u64.into())),
+        ]],
+    ];
+    test_columnar_docs(columnar_docs);
+}
+
 #[test]
 fn test_columnar_merging_number_columns() {
    let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> = vec![
@@ -793,25 +798,7 @@ fn test_columnar_merging_number_columns() {
            vec![("c2", ColumnValue::Numerical(u64::MAX.into()))],
        ],
    ];
-    let columnar_readers: Vec<ColumnarReader> = columnar_docs
-        .iter()
-        .map(|docs| build_columnar(&docs[..]))
-        .collect::<Vec<_>>();
-    let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
-    let mut output: Vec<u8> = Vec::new();
-    let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]);
-    crate::merge_columnar(
-        &columnar_readers_arr[..],
-        &[],
-        crate::MergeRowOrder::Stack(stack_merge_order),
-        &mut output,
-    )
-    .unwrap();
-    let merged_columnar = ColumnarReader::open(output).unwrap();
-    let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> =
-        columnar_docs.iter().flatten().cloned().collect();
-    let expected_merged_columnar = build_columnar(&concat_rows[..]);
-    assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
+    test_columnar_docs(columnar_docs);
 }

 // TODO add non trivial remap and merge
--- a/common/Cargo.toml
+++ b/common/Cargo.toml
@@ -22,3 +22,6 @@ serde = { version = "1.0.136", features = ["derive"] }
 [dev-dependencies]
 proptest = "1.0.0"
 rand = "0.8.4"
+
+[features]
+unstable = [] # useful for benches.
--- a/src/indexer/index_writer.rs
+++ b/src/indexer/index_writer.rs
@@ -808,7 +808,7 @@ mod tests {
    use proptest::prop_oneof;

    use super::super::operation::UserOperation;
-    use crate::collector::TopDocs;
+    use crate::collector::{Count, TopDocs};
    use crate::directory::error::LockError;
    use crate::error::*;
    use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
@@ -1572,20 +1572,74 @@ mod tests {
        Ok(())
    }

-    #[derive(Debug, Clone, Copy)]
+    #[derive(Debug, Clone)]
    enum IndexingOp {
-        AddDoc { id: u64 },
-        DeleteDoc { id: u64 },
-        DeleteDocQuery { id: u64 },
+        AddMultipleDoc {
+            id: u64,
+            num_docs: u64,
+            value: IndexValue,
+        },
+        AddDoc {
+            id: u64,
+            value: IndexValue,
+        },
+        DeleteDoc {
+            id: u64,
+        },
+        DeleteDocQuery {
+            id: u64,
+        },
        Commit,
        Merge,
    }
+    impl IndexingOp {
+        fn add(id: u64) -> Self {
+            IndexingOp::AddDoc {
+                id,
+                value: IndexValue::F64(id as f64),
+            }
+        }
+    }
+
+    use serde::Serialize;
+    #[derive(Debug, Clone, Serialize)]
+    #[serde(untagged)]
+    enum IndexValue {
+        Str(String),
+        F64(f64),
+        U64(u64),
+        I64(i64),
+    }
+    impl Default for IndexValue {
+        fn default() -> Self {
+            IndexValue::F64(0.0)
+        }
+    }
+
+    fn value_strategy() -> impl Strategy<Value = IndexValue> {
+        prop_oneof![
+            any::<f64>().prop_map(IndexValue::F64),
+            any::<u64>().prop_map(IndexValue::U64),
+            any::<i64>().prop_map(IndexValue::I64),
+            any::<String>().prop_map(IndexValue::Str),
+        ]
+    }

    fn balanced_operation_strategy() -> impl Strategy<Value = IndexingOp> {
        prop_oneof![
            (0u64..20u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
            (0u64..20u64).prop_map(|id| IndexingOp::DeleteDocQuery { id }),
-            (0u64..20u64).prop_map(|id| IndexingOp::AddDoc { id }),
+            (0u64..20u64, value_strategy())
+                .prop_map(move |(id, value)| IndexingOp::AddDoc { id, value }),
+            ((0u64..20u64), (1u64..100), value_strategy()).prop_map(
+                move |(id, num_docs, value)| {
+                    IndexingOp::AddMultipleDoc {
+                        id,
+                        num_docs,
+                        value,
+                    }
+                }
+            ),
            (0u64..1u64).prop_map(|_| IndexingOp::Commit),
            (0u64..1u64).prop_map(|_| IndexingOp::Merge),
        ]
@@ -1595,7 +1649,17 @@ mod tests {
        prop_oneof![
            5 => (0u64..100u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
            5 => (0u64..100u64).prop_map(|id| IndexingOp::DeleteDocQuery { id }),
-            50 => (0u64..100u64).prop_map(|id| IndexingOp::AddDoc { id }),
+            50 => (0u64..100u64, value_strategy())
+                .prop_map(move |(id, value)| IndexingOp::AddDoc { id, value }),
+            50 => (0u64..100u64, (1u64..100), value_strategy()).prop_map(
+                move |(id, num_docs, value)| {
+                    IndexingOp::AddMultipleDoc {
+                        id,
+                        num_docs,
+                        value,
+                    }
+                }
+            ),
            2 => (0u64..1u64).prop_map(|_| IndexingOp::Commit),
            1 => (0u64..1u64).prop_map(|_| IndexingOp::Merge),
        ]
@@ -1604,19 +1668,27 @@ mod tests {
    fn expected_ids(ops: &[IndexingOp]) -> (HashMap<u64, u64>, HashSet<u64>) {
        let mut existing_ids = HashMap::new();
        let mut deleted_ids = HashSet::new();
-        for &op in ops {
+        for op in ops {
            match op {
-                IndexingOp::AddDoc { id } => {
-                    *existing_ids.entry(id).or_insert(0) += 1;
-                    deleted_ids.remove(&id);
+                IndexingOp::AddDoc { id, value: _ } => {
+                    *existing_ids.entry(*id).or_insert(0) += 1;
+                    deleted_ids.remove(id);
+                }
+                IndexingOp::AddMultipleDoc {
+                    id,
+                    num_docs,
+                    value: _,
+                } => {
+                    *existing_ids.entry(*id).or_insert(0) += num_docs;
+                    deleted_ids.remove(id);
                }
                IndexingOp::DeleteDoc { id } => {
                    existing_ids.remove(&id);
-                    deleted_ids.insert(id);
+                    deleted_ids.insert(*id);
                }
                IndexingOp::DeleteDocQuery { id } => {
                    existing_ids.remove(&id);
-                    deleted_ids.insert(id);
+                    deleted_ids.insert(*id);
                }
                _ => {}
            }
@@ -1626,16 +1698,19 @@ mod tests {

    fn get_id_list(ops: &[IndexingOp]) -> Vec<u64> {
        let mut id_list = Vec::new();
-        for &op in ops {
+        for op in ops {
            match op {
-                IndexingOp::AddDoc { id } => {
-                    id_list.push(id);
+                IndexingOp::AddDoc { id, value: _ } => {
+                    id_list.push(*id);
+                }
+                IndexingOp::AddMultipleDoc { id, .. } => {
+                    id_list.push(*id);
                }
                IndexingOp::DeleteDoc { id } => {
-                    id_list.retain(|el| *el != id);
+                    id_list.retain(|el| el != id);
                }
                IndexingOp::DeleteDocQuery { id } => {
-                    id_list.retain(|el| *el != id);
+                    id_list.retain(|el| el != id);
                }
                _ => {}
            }
@@ -1716,42 +1791,59 @@ mod tests {

        let ip_from_id = |id| Ipv6Addr::from_u128(id as u128);

-        for &op in ops {
-            match op {
-                IndexingOp::AddDoc { id } => {
-                    let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
-                    let ip = ip_from_id(id);
-
-                    if !id_is_full_doc(id) {
-                        // every 3rd doc has no ip field
-                        index_writer.add_document(doc!(
-                            id_field=>id,
-                        ))?;
-                    } else {
-                        let json = json!({"date1": format!("2022-{id}-01T00:00:01Z"), "date2": format!("{id}-05-01T00:00:01Z"), "id": id, "ip": ip.to_string()});
-                        index_writer.add_document(doc!(id_field=>id,
-                                json_field=>json,
-                                bytes_field => id.to_le_bytes().as_slice(),
-                                id_opt_field => id,
-                                ip_field => ip,
-                                ips_field => ip,
-                                ips_field => ip,
-                                multi_numbers=> id,
-                                multi_numbers => id,
-                                bool_field => (id % 2u64) != 0,
-                                i64_field => id as i64,
-                                f64_field => id as f64,
-                                date_field => DateTime::from_timestamp_secs(id as i64),
-                                multi_bools => (id % 2u64) != 0,
-                                multi_bools => (id % 2u64) == 0,
-                                text_field => id.to_string(),
-                                facet_field => facet,
-                                large_text_field => LOREM,
-                                multi_text_fields => multi_text_field_text1,
-                                multi_text_fields => multi_text_field_text2,
-                                multi_text_fields => multi_text_field_text3,
-                        ))?;
-                    }
+        let add_docs = |index_writer: &mut IndexWriter,
+                        id: u64,
+                        value: IndexValue,
+                        num: u64|
+         -> crate::Result<()> {
+            let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
+            let ip = ip_from_id(id);
+            let doc = if !id_is_full_doc(id) {
+                // every 3rd doc has no ip field
+                doc!(
+                    id_field=>id,
+                )
+            } else {
+                let json = json!({"date1": format!("2022-{id}-01T00:00:01Z"), "date2": format!("{id}-05-01T00:00:01Z"), "id": id, "ip": ip.to_string(), "val": value});
+                doc!(id_field=>id,
+                        json_field=>json,
+                        bytes_field => id.to_le_bytes().as_slice(),
+                        id_opt_field => id,
+                        ip_field => ip,
+                        ips_field => ip,
+                        ips_field => ip,
+                        multi_numbers=> id,
+                        multi_numbers => id,
+                        bool_field => (id % 2u64) != 0,
+                        i64_field => id as i64,
+                        f64_field => id as f64,
+                        date_field => DateTime::from_timestamp_secs(id as i64),
+                        multi_bools => (id % 2u64) != 0,
+                        multi_bools => (id % 2u64) == 0,
+                        text_field => id.to_string(),
+                        facet_field => facet,
+                        large_text_field => LOREM,
+                        multi_text_fields => multi_text_field_text1,
+                        multi_text_fields => multi_text_field_text2,
+                        multi_text_fields => multi_text_field_text3,
+                )
+            };
+            for _ in 0..num {
+                index_writer.add_document(doc.clone())?;
+            }
+            Ok(())
+        };
+        for op in ops {
+            match op.clone() {
+                IndexingOp::AddMultipleDoc {
+                    id,
+                    num_docs,
+                    value,
+                } => {
+                    add_docs(&mut index_writer, id, value, num_docs)?;
+                }
+                IndexingOp::AddDoc { id, value } => {
+                    add_docs(&mut index_writer, id, value, 1)?;
                }
                IndexingOp::DeleteDoc { id } => {
                    index_writer.delete_term(Term::from_field_u64(id_field, id));
@@ -2032,18 +2124,22 @@ mod tests {

            top_docs.iter().map(|el| el.1).collect::<Vec<_>>()
        };
+        let count_search = |term: &str, field| {
+            let query = QueryParser::for_index(&index, vec![field])
+                .parse_query(term)
+                .unwrap();
+            searcher.search(&query, &Count).unwrap()
+        };

-        let do_search2 = |term: Term| {
+        let count_search2 = |term: Term| {
            let query = TermQuery::new(term, IndexRecordOption::Basic);
-            let top_docs: Vec<(f32, DocAddress)> =
-                searcher.search(&query, &TopDocs::with_limit(1000)).unwrap();
-
-            top_docs.iter().map(|el| el.1).collect::<Vec<_>>()
+            searcher.search(&query, &Count).unwrap()
        };

        for (id, count) in &expected_ids_and_num_occurrences {
+            // skip expensive queries
            let (existing_id, count) = (*id, *count);
-            let get_num_hits = |field| do_search(&existing_id.to_string(), field).len() as u64;
+            let get_num_hits = |field| count_search(&existing_id.to_string(), field) as u64;
            assert_eq!(get_num_hits(id_field), count);
            if !id_is_full_doc(existing_id) {
                continue;
@@ -2053,29 +2149,31 @@ mod tests {
            assert_eq!(get_num_hits(f64_field), count);

            // Test multi text
-            assert_eq!(
-                do_search("\"test1 test2\"", multi_text_fields).len(),
-                num_docs_with_values
-            );
-            assert_eq!(
-                do_search("\"test2 test3\"", multi_text_fields).len(),
-                num_docs_with_values
-            );
+            if num_docs_with_values < 1000 {
+                assert_eq!(
+                    do_search("\"test1 test2\"", multi_text_fields).len(),
+                    num_docs_with_values
+                );
+                assert_eq!(
+                    do_search("\"test2 test3\"", multi_text_fields).len(),
+                    num_docs_with_values
+                );
+            }

            // Test bytes
            let term = Term::from_field_bytes(bytes_field, existing_id.to_le_bytes().as_slice());
-            assert_eq!(do_search2(term).len() as u64, count);
+            assert_eq!(count_search2(term) as u64, count);

            // Test date
            let term = Term::from_field_date(
                date_field,
                DateTime::from_timestamp_secs(existing_id as i64),
            );
-            assert_eq!(do_search2(term).len() as u64, count);
+            assert_eq!(count_search2(term) as u64, count);
        }
        for deleted_id in deleted_ids {
            let assert_field = |field| {
-                assert_eq!(do_search(&deleted_id.to_string(), field).len() as u64, 0);
+                assert_eq!(count_search(&deleted_id.to_string(), field) as u64, 0);
            };
            assert_field(text_field);
            assert_field(f64_field);
@@ -2084,12 +2182,12 @@ mod tests {

            // Test bytes
            let term = Term::from_field_bytes(bytes_field, deleted_id.to_le_bytes().as_slice());
-            assert_eq!(do_search2(term).len() as u64, 0);
+            assert_eq!(count_search2(term), 0);

            // Test date
            let term =
                Term::from_field_date(date_field, DateTime::from_timestamp_secs(deleted_id as i64));
-            assert_eq!(do_search2(term).len() as u64, 0);
+            assert_eq!(count_search2(term), 0);
        }
        // search ip address
        //
@@ -2098,13 +2196,13 @@ mod tests {
            if !id_is_full_doc(existing_id) {
                continue;
            }
-            let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
+            let do_search_ip_field = |term: &str| count_search(term, ip_field) as u64;
            let ip_addr = Ipv6Addr::from_u128(existing_id as u128);
            // Test incoming ip as ipv6
            assert_eq!(do_search_ip_field(&format!("\"{ip_addr}\"")), count);

            let term = Term::from_field_ip_addr(ip_field, ip_addr);
-            assert_eq!(do_search2(term).len() as u64, count);
+            assert_eq!(count_search2(term) as u64, count);

            // Test incoming ip as ipv4
            if let Some(ip_addr) = ip_addr.to_ipv4_mapped() {
@@ -2121,7 +2219,7 @@ mod tests {
        if !sample.is_empty() {
            let (left_sample, right_sample) = sample.split_at(sample.len() / 2);

-            let expected_count = |sample: &[(&u64, &u64)]| {
+            let calc_expected_count = |sample: &[(&u64, &u64)]| {
                sample
                    .iter()
                    .filter(|(id, _)| id_is_full_doc(**id))
@@ -2137,18 +2235,17 @@ mod tests {
            }

            // Query first half
-            if !left_sample.is_empty() {
-                let expected_count = expected_count(left_sample);
-
+            let expected_count = calc_expected_count(left_sample);
+            if !left_sample.is_empty() && expected_count < 1000 {
                let start_range = *left_sample[0].0;
                let end_range = *left_sample.last().unwrap().0;
                let query = gen_query_inclusive("id_opt", start_range, end_range);
-                assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count);
+                assert_eq!(count_search(&query, id_opt_field) as u64, expected_count);

                // Range query on ip field
                let ip1 = ip_from_id(start_range);
                let ip2 = ip_from_id(end_range);
-                let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
+                let do_search_ip_field = |term: &str| count_search(term, ip_field) as u64;
                let query = gen_query_inclusive("ip", ip1, ip2);
                assert_eq!(do_search_ip_field(&query), expected_count);
                let query = gen_query_inclusive("ip", "*", ip2);
@@ -2160,19 +2257,19 @@ mod tests {
                assert_eq!(do_search_ip_field(&query), expected_count);
            }
            // Query second half
-            if !right_sample.is_empty() {
-                let expected_count = expected_count(right_sample);
+            let expected_count = calc_expected_count(right_sample);
+            if !right_sample.is_empty() && expected_count < 1000 {
                let start_range = *right_sample[0].0;
                let end_range = *right_sample.last().unwrap().0;
                // Range query on id opt field
                let query =
                    gen_query_inclusive("id_opt", start_range.to_string(), end_range.to_string());
-                assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count);
+                assert_eq!(count_search(&query, id_opt_field) as u64, expected_count);

                // Range query on ip field
                let ip1 = ip_from_id(start_range);
                let ip2 = ip_from_id(end_range);
-                let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
+                let do_search_ip_field = |term: &str| count_search(term, ip_field) as u64;
                let query = gen_query_inclusive("ip", ip1, ip2);
                assert_eq!(do_search_ip_field(&query), expected_count);
                let query = gen_query_inclusive("ip", ip1, "*");
@@ -2197,7 +2294,7 @@ mod tests {
            };
            let ip = ip_from_id(existing_id);

-            let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
+            let do_search_ip_field = |term: &str| count_search(term, ip_field) as u64;
            // Range query on single value field
            let query = gen_query_inclusive("ip", ip, ip);
            assert_eq!(do_search_ip_field(&query), count);
@@ -2257,7 +2354,7 @@ mod tests {

    #[test]
    fn test_fast_field_range() {
-        let ops: Vec<_> = (0..1000).map(|id| IndexingOp::AddDoc { id }).collect();
+        let ops: Vec<_> = (0..1000).map(|id| IndexingOp::add(id)).collect();
        assert!(test_operation_strategy(&ops, false, true).is_ok());
    }

@@ -2265,8 +2362,8 @@ mod tests {
    fn test_sort_index_on_opt_field_regression() {
        assert!(test_operation_strategy(
            &[
-                IndexingOp::AddDoc { id: 81 },
-                IndexingOp::AddDoc { id: 70 },
+                IndexingOp::add(81),
+                IndexingOp::add(70),
                IndexingOp::DeleteDoc { id: 70 }
            ],
            true,
@@ -2275,14 +2372,45 @@ mod tests {
        .is_ok());
    }

+    #[test]
+    fn test_simple_multiple_doc() {
+        assert!(test_operation_strategy(
+            &[
+                IndexingOp::AddMultipleDoc {
+                    id: 7,
+                    num_docs: 800,
+                    value: IndexValue::U64(0),
+                },
+                IndexingOp::AddMultipleDoc {
+                    id: 92,
+                    num_docs: 800,
+                    value: IndexValue::U64(0),
+                },
+                IndexingOp::AddMultipleDoc {
+                    id: 30,
+                    num_docs: 800,
+                    value: IndexValue::U64(0),
+                },
+                IndexingOp::AddMultipleDoc {
+                    id: 33,
+                    num_docs: 800,
+                    value: IndexValue::U64(0),
+                },
+            ],
+            true,
+            false
+        )
+        .is_ok());
+    }
+
    #[test]
    fn test_ip_range_query_multivalue_bug() {
        assert!(test_operation_strategy(
            &[
-                IndexingOp::AddDoc { id: 2 },
+                IndexingOp::add(2),
                IndexingOp::Commit,
-                IndexingOp::AddDoc { id: 1 },
-                IndexingOp::AddDoc { id: 1 },
+                IndexingOp::add(1),
+                IndexingOp::add(1),
                IndexingOp::Commit,
                IndexingOp::Merge
            ],
@@ -2296,11 +2424,11 @@ mod tests {
    fn test_ff_num_ips_regression() {
        assert!(test_operation_strategy(
            &[
-                IndexingOp::AddDoc { id: 13 },
-                IndexingOp::AddDoc { id: 1 },
+                IndexingOp::add(13),
+                IndexingOp::add(1),
                IndexingOp::Commit,
                IndexingOp::DeleteDocQuery { id: 13 },
-                IndexingOp::AddDoc { id: 1 },
+                IndexingOp::add(1),
                IndexingOp::Commit,
            ],
            false,
@@ -2312,7 +2440,7 @@ mod tests {
    #[test]
    fn test_minimal_sort_force_end_merge() {
        assert!(test_operation_strategy(
-            &[IndexingOp::AddDoc { id: 23 }, IndexingOp::AddDoc { id: 13 },],
+            &[IndexingOp::add(23), IndexingOp::add(13),],
            false,
            false
        )
@@ -2373,8 +2501,8 @@ mod tests {
    fn test_minimal_sort_force_end_merge_with_delete() {
        assert!(test_operation_strategy(
            &[
-                IndexingOp::AddDoc { id: 23 },
-                IndexingOp::AddDoc { id: 13 },
+                IndexingOp::add(23),
+                IndexingOp::add(13),
                IndexingOp::DeleteDoc { id: 13 }
            ],
            true,
@@ -2387,8 +2515,8 @@ mod tests {
    fn test_minimal_no_sort_no_force_end_merge() {
        assert!(test_operation_strategy(
            &[
-                IndexingOp::AddDoc { id: 23 },
-                IndexingOp::AddDoc { id: 13 },
+                IndexingOp::add(23),
+                IndexingOp::add(13),
                IndexingOp::DeleteDoc { id: 13 }
            ],
            false,
@@ -2399,7 +2527,7 @@ mod tests {

    #[test]
    fn test_minimal_sort_merge() {
-        assert!(test_operation_strategy(&[IndexingOp::AddDoc { id: 3 },], true, true).is_ok());
+        assert!(test_operation_strategy(&[IndexingOp::add(3),], true, true).is_ok());
    }

    use proptest::prelude::*;
@@ -2495,14 +2623,14 @@ mod tests {
    fn test_delete_bug_reproduction_ip_addr() {
        use IndexingOp::*;
        let ops = &[
-            AddDoc { id: 1 },
-            AddDoc { id: 2 },
+            IndexingOp::add(1),
+            IndexingOp::add(2),
            Commit,
-            AddDoc { id: 3 },
+            IndexingOp::add(3),
            DeleteDoc { id: 1 },
            Commit,
            Merge,
-            AddDoc { id: 4 },
+            IndexingOp::add(4),
            Commit,
        ];
        test_operation_strategy(&ops[..], false, true).unwrap();
@@ -2511,7 +2639,13 @@ mod tests {
    #[test]
    fn test_merge_regression_1() {
        use IndexingOp::*;
-        let ops = &[AddDoc { id: 15 }, Commit, AddDoc { id: 9 }, Commit, Merge];
+        let ops = &[
+            IndexingOp::add(15),
+            Commit,
+            IndexingOp::add(9),
+            Commit,
+            Merge,
+        ];
        test_operation_strategy(&ops[..], false, true).unwrap();
    }

@@ -2519,9 +2653,9 @@ mod tests {
    fn test_range_query_bug_1() {
        use IndexingOp::*;
        let ops = &[
-            AddDoc { id: 9 },
-            AddDoc { id: 0 },
-            AddDoc { id: 13 },
+            IndexingOp::add(9),
+            IndexingOp::add(0),
+            IndexingOp::add(13),
            Commit,
        ];
        test_operation_strategy(&ops[..], false, true).unwrap();
@@ -2529,12 +2663,11 @@ mod tests {

    #[test]
    fn test_range_query_bug_2() {
-        use IndexingOp::*;
        let ops = &[
-            AddDoc { id: 3 },
-            AddDoc { id: 6 },
-            AddDoc { id: 9 },
-            AddDoc { id: 10 },
+            IndexingOp::add(3),
+            IndexingOp::add(6),
+            IndexingOp::add(9),
+            IndexingOp::add(10),
        ];
        test_operation_strategy(&ops[..], false, false).unwrap();
    }
@@ -2556,7 +2689,7 @@ mod tests {
        assert!(test_operation_strategy(
            &[
                IndexingOp::DeleteDoc { id: 0 },
-                IndexingOp::AddDoc { id: 6 },
+                IndexingOp::add(6),
                IndexingOp::DeleteDocQuery { id: 11 },
                IndexingOp::Commit,
                IndexingOp::Merge,
@@ -2573,10 +2706,13 @@ mod tests {
    fn test_bug_1617_2() {
        assert!(test_operation_strategy(
            &[
-                IndexingOp::AddDoc { id: 13 },
+                IndexingOp::AddDoc {
+                    id: 13,
+                    value: Default::default()
+                },
                IndexingOp::DeleteDoc { id: 13 },
                IndexingOp::Commit,
-                IndexingOp::AddDoc { id: 30 },
+                IndexingOp::add(30),
                IndexingOp::Commit,
                IndexingOp::Merge,
            ],
--- a/src/indexer/segment_writer.rs
+++ b/src/indexer/segment_writer.rs
@@ -202,9 +202,8 @@ impl SegmentWriter {
            match field_entry.field_type() {
                FieldType::Facet(_) => {
                    let mut facet_tokenizer = FacetTokenizer::default(); // this can be global
-                    for value_access in values {
-                        // Used to help with linting and type checking.
-                        let value = value_access as D::Value<'_>;
+                    for value in values {
+                        let value = value.as_value();

                        let facet_str = value.as_facet().ok_or_else(make_schema_error)?;
                        let mut facet_tokenizer = facet_tokenizer.token_stream(facet_str);
@@ -220,15 +219,14 @@ impl SegmentWriter {
                }
                FieldType::Str(_) => {
                    let mut indexing_position = IndexingPosition::default();
-                    for value_access in values {
-                        // Used to help with linting and type checking.
-                        let value = value_access as D::Value<'_>;
+                    for value in values {
+                        let value = value.as_value();

                        let mut token_stream = if let Some(text) = value.as_str() {
                            let text_analyzer =
                                &mut self.per_field_text_analyzers[field.field_id() as usize];
                            text_analyzer.token_stream(text)
-                        } else if let Some(tok_str) = value.as_pre_tokenized_text() {
+                        } else if let Some(tok_str) = value.into_pre_tokenized_text() {
                            BoxTokenStream::new(PreTokenizedStream::from(*tok_str.clone()))
                        } else {
                            continue;
@@ -250,9 +248,8 @@ impl SegmentWriter {
                }
                FieldType::U64(_) => {
                    let mut num_vals = 0;
-                    for value_access in values {
-                        // Used to help with linting and type checking.
-                        let value = value_access as D::Value<'_>;
+                    for value in values {
+                        let value = value.as_value();

                        num_vals += 1;
                        let u64_val = value.as_u64().ok_or_else(make_schema_error)?;
@@ -265,10 +262,8 @@ impl SegmentWriter {
                }
                FieldType::Date(_) => {
                    let mut num_vals = 0;
-                    for value_access in values {
-                        // Used to help with linting and type checking.
-                        let value_access = value_access as D::Value<'_>;
-                        let value = value_access.as_value();
+                    for value in values {
+                        let value = value.as_value();

                        num_vals += 1;
                        let date_val = value.as_datetime().ok_or_else(make_schema_error)?;
@@ -282,9 +277,8 @@ impl SegmentWriter {
                }
                FieldType::I64(_) => {
                    let mut num_vals = 0;
-                    for value_access in values {
-                        // Used to help with linting and type checking.
-                        let value = value_access as D::Value<'_>;
+                    for value in values {
+                        let value = value.as_value();

                        num_vals += 1;
                        let i64_val = value.as_i64().ok_or_else(make_schema_error)?;
@@ -297,10 +291,8 @@ impl SegmentWriter {
                }
                FieldType::F64(_) => {
                    let mut num_vals = 0;
-                    for value_access in values {
-                        // Used to help with linting and type checking.
-                        let value = value_access as D::Value<'_>;
-
+                    for value in values {
+                        let value = value.as_value();
                        num_vals += 1;
                        let f64_val = value.as_f64().ok_or_else(make_schema_error)?;
                        term_buffer.set_f64(f64_val);
@@ -312,10 +304,8 @@ impl SegmentWriter {
                }
                FieldType::Bool(_) => {
                    let mut num_vals = 0;
-                    for value_access in values {
-                        // Used to help with linting and type checking.
-                        let value = value_access as D::Value<'_>;
-
+                    for value in values {
+                        let value = value.as_value();
                        num_vals += 1;
                        let bool_val = value.as_bool().ok_or_else(make_schema_error)?;
                        term_buffer.set_bool(bool_val);
@@ -327,10 +317,8 @@ impl SegmentWriter {
                }
                FieldType::Bytes(_) => {
                    let mut num_vals = 0;
-                    for value_access in values {
-                        // Used to help with linting and type checking.
-                        let value = value_access as D::Value<'_>;
-
+                    for value in values {
+                        let value = value.as_value();
                        num_vals += 1;
                        let bytes = value.as_bytes().ok_or_else(make_schema_error)?;
                        term_buffer.set_bytes(bytes);
@@ -364,9 +352,8 @@ impl SegmentWriter {
                }
                FieldType::IpAddr(_) => {
                    let mut num_vals = 0;
-                    for value_access in values {
-                        // Used to help with linting and type checking.
-                        let value = value_access as D::Value<'_>;
+                    for value in values {
+                        let value = value.as_value();

                        num_vals += 1;
                        let ip_addr = value.as_ip_addr().ok_or_else(make_schema_error)?;
--- a/src/query/phrase_prefix_query/phrase_prefix_scorer.rs
+++ b/src/query/phrase_prefix_query/phrase_prefix_scorer.rs
@@ -2,7 +2,7 @@ use crate::docset::{DocSet, TERMINATED};
 use crate::fieldnorm::FieldNormReader;
 use crate::postings::Postings;
 use crate::query::bm25::Bm25Weight;
-use crate::query::phrase_query::{intersection_count, intersection_exists, PhraseScorer};
+use crate::query::phrase_query::{intersection_count, PhraseScorer};
 use crate::query::Scorer;
 use crate::{DocId, Score};

@@ -92,17 +92,14 @@ impl<TPostings: Postings> Scorer for PhraseKind<TPostings> {
    }
 }

-pub struct PhrasePrefixScorer<TPostings: Postings, const SCORING_ENABLED: bool> {
+pub struct PhrasePrefixScorer<TPostings: Postings> {
    phrase_scorer: PhraseKind<TPostings>,
    suffixes: Vec<TPostings>,
    suffix_offset: u32,
    phrase_count: u32,
-    suffix_position_buffer: Vec<u32>,
 }

-impl<TPostings: Postings, const SCORING_ENABLED: bool>
-    PhrasePrefixScorer<TPostings, SCORING_ENABLED>
-{
+impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
    // If similarity_weight is None, then scoring is disabled.
    pub fn new(
        mut term_postings: Vec<(usize, TPostings)>,
@@ -110,7 +107,7 @@ impl<TPostings: Postings, const SCORING_ENABLED: bool>
        fieldnorm_reader: FieldNormReader,
        suffixes: Vec<TPostings>,
        suffix_pos: usize,
-    ) -> PhrasePrefixScorer<TPostings, SCORING_ENABLED> {
+    ) -> PhrasePrefixScorer<TPostings> {
        // correct indices so we can merge with our suffix term the PhraseScorer doesn't know about
        let max_offset = term_postings
            .iter()
@@ -143,7 +140,6 @@ impl<TPostings: Postings, const SCORING_ENABLED: bool>
            suffixes,
            suffix_offset: (max_offset - suffix_pos) as u32,
            phrase_count: 0,
-            suffix_position_buffer: Vec::with_capacity(100),
        };
        if phrase_prefix_scorer.doc() != TERMINATED && !phrase_prefix_scorer.matches_prefix() {
            phrase_prefix_scorer.advance();
@@ -157,6 +153,7 @@ impl<TPostings: Postings, const SCORING_ENABLED: bool>

    fn matches_prefix(&mut self) -> bool {
        let mut count = 0;
+        let mut positions = Vec::new();
        let current_doc = self.doc();
        let pos_matching = self.phrase_scorer.get_intersection();
        for suffix in &mut self.suffixes {
@@ -165,27 +162,16 @@ impl<TPostings: Postings, const SCORING_ENABLED: bool>
            }
            let doc = suffix.seek(current_doc);
            if doc == current_doc {
-                suffix.positions_with_offset(self.suffix_offset, &mut self.suffix_position_buffer);
-                if SCORING_ENABLED {
-                    count += intersection_count(pos_matching, &self.suffix_position_buffer);
-                } else {
-                    if intersection_exists(pos_matching, &self.suffix_position_buffer) {
-                        return true;
-                    }
-                }
+                suffix.positions_with_offset(self.suffix_offset, &mut positions);
+                count += intersection_count(pos_matching, &positions);
            }
        }
-        if !SCORING_ENABLED {
-            return false;
-        }
        self.phrase_count = count as u32;
        count != 0
    }
 }

-impl<TPostings: Postings, const SCORING_ENABLED: bool> DocSet
-    for PhrasePrefixScorer<TPostings, SCORING_ENABLED>
-{
+impl<TPostings: Postings> DocSet for PhrasePrefixScorer<TPostings> {
    fn advance(&mut self) -> DocId {
        loop {
            let doc = self.phrase_scorer.advance();
@@ -212,15 +198,9 @@ impl<TPostings: Postings, const SCORING_ENABLED: bool> DocSet
    }
 }

-impl<TPostings: Postings, const SCORING_ENABLED: bool> Scorer
-    for PhrasePrefixScorer<TPostings, SCORING_ENABLED>
-{
+impl<TPostings: Postings> Scorer for PhrasePrefixScorer<TPostings> {
    fn score(&mut self) -> Score {
-        if SCORING_ENABLED {
-            self.phrase_scorer.score()
-        } else {
-            1.0f32
-        }
        // TODO modify score??
+        self.phrase_scorer.score()
    }
 }
--- a/src/query/phrase_prefix_query/phrase_prefix_weight.rs
+++ b/src/query/phrase_prefix_query/phrase_prefix_weight.rs
@@ -42,11 +42,11 @@ impl PhrasePrefixWeight {
        Ok(FieldNormReader::constant(reader.max_doc(), 1))
    }

-    pub(crate) fn phrase_prefix_scorer<const SCORING_ENABLED: bool>(
+    pub(crate) fn phrase_scorer(
        &self,
        reader: &SegmentReader,
        boost: Score,
-    ) -> crate::Result<Option<PhrasePrefixScorer<SegmentPostings, SCORING_ENABLED>>> {
+    ) -> crate::Result<Option<PhrasePrefixScorer<SegmentPostings>>> {
        let similarity_weight_opt = self
            .similarity_weight_opt
            .as_ref()
@@ -128,20 +128,15 @@ impl PhrasePrefixWeight {

 impl Weight for PhrasePrefixWeight {
    fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
-        if self.similarity_weight_opt.is_some() {
-            if let Some(scorer) = self.phrase_prefix_scorer::<true>(reader, boost)? {
-                return Ok(Box::new(scorer));
-            }
+        if let Some(scorer) = self.phrase_scorer(reader, boost)? {
+            Ok(Box::new(scorer))
        } else {
-            if let Some(scorer) = self.phrase_prefix_scorer::<false>(reader, boost)? {
-                return Ok(Box::new(scorer));
-            }
+            Ok(Box::new(EmptyScorer))
        }
-        Ok(Box::new(EmptyScorer))
    }

    fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
-        let scorer_opt = self.phrase_prefix_scorer::<true>(reader, 1.0)?;
+        let scorer_opt = self.phrase_scorer(reader, 1.0)?;
        if scorer_opt.is_none() {
            return Err(does_not_match(doc));
        }
@@ -205,7 +200,7 @@ mod tests {
            .unwrap()
            .unwrap();
        let mut phrase_scorer = phrase_weight
-            .phrase_prefix_scorer::<true>(searcher.segment_reader(0u32), 1.0)?
+            .phrase_scorer(searcher.segment_reader(0u32), 1.0)?
            .unwrap();
        assert_eq!(phrase_scorer.doc(), 1);
        assert_eq!(phrase_scorer.phrase_count(), 2);
@@ -216,38 +211,6 @@ mod tests {
        Ok(())
    }

-    #[test]
-    pub fn test_phrase_no_count() -> crate::Result<()> {
-        let index = create_index(&[
-            "aa bb dd cc",
-            "aa aa bb c dd aa bb cc aa bb dc",
-            " aa bb cd",
-        ])?;
-        let schema = index.schema();
-        let text_field = schema.get_field("text").unwrap();
-        let searcher = index.reader()?.searcher();
-        let phrase_query = PhrasePrefixQuery::new(vec![
-            Term::from_field_text(text_field, "aa"),
-            Term::from_field_text(text_field, "bb"),
-            Term::from_field_text(text_field, "c"),
-        ]);
-        let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
-        let phrase_weight = phrase_query
-            .phrase_prefix_query_weight(enable_scoring)
-            .unwrap()
-            .unwrap();
-        let mut phrase_scorer = phrase_weight
-            .phrase_prefix_scorer::<false>(searcher.segment_reader(0u32), 1.0)?
-            .unwrap();
-        assert_eq!(phrase_scorer.doc(), 1);
-        assert_eq!(phrase_scorer.phrase_count(), 0);
-        assert_eq!(phrase_scorer.advance(), 2);
-        assert_eq!(phrase_scorer.doc(), 2);
-        assert_eq!(phrase_scorer.phrase_count(), 0);
-        assert_eq!(phrase_scorer.advance(), TERMINATED);
-        Ok(())
-    }
-
    #[test]
    pub fn test_phrase_count_mid() -> crate::Result<()> {
        let index = create_index(&["aa dd cc", "aa aa bb c dd aa bb cc aa dc", " aa bb cd"])?;
@@ -264,7 +227,7 @@ mod tests {
            .unwrap()
            .unwrap();
        let mut phrase_scorer = phrase_weight
-            .phrase_prefix_scorer::<true>(searcher.segment_reader(0u32), 1.0)?
+            .phrase_scorer(searcher.segment_reader(0u32), 1.0)?
            .unwrap();
        assert_eq!(phrase_scorer.doc(), 1);
        assert_eq!(phrase_scorer.phrase_count(), 2);
--- a/src/query/phrase_query/mod.rs
+++ b/src/query/phrase_query/mod.rs
@@ -3,8 +3,8 @@ mod phrase_scorer;
 mod phrase_weight;

 pub use self::phrase_query::PhraseQuery;
+pub(crate) use self::phrase_scorer::intersection_count;
 pub use self::phrase_scorer::PhraseScorer;
-pub(crate) use self::phrase_scorer::{intersection_count, intersection_exists};
 pub use self::phrase_weight::PhraseWeight;

 #[cfg(test)]
--- a/src/query/phrase_query/phrase_scorer.rs
+++ b/src/query/phrase_query/phrase_scorer.rs
@@ -58,7 +58,7 @@ pub struct PhraseScorer<TPostings: Postings> {
 }

 /// Returns true if and only if the two sorted arrays contain a common element
-pub(crate) fn intersection_exists(left: &[u32], right: &[u32]) -> bool {
+fn intersection_exists(left: &[u32], right: &[u32]) -> bool {
    let mut left_index = 0;
    let mut right_index = 0;
    while left_index < left.len() && right_index < right.len() {
--- a/src/query/query_parser/logical_ast.rs
+++ b/src/query/query_parser/logical_ast.rs
@@ -2,7 +2,7 @@ use std::fmt;
 use std::ops::Bound;

 use crate::query::Occur;
-use crate::schema::{Field, Term, Type};
+use crate::schema::{Term, Type};
 use crate::Score;

 #[derive(Clone)]
@@ -20,8 +20,6 @@ pub enum LogicalLiteral {
        upper: Bound<Term>,
    },
    Set {
-        field: Field,
-        value_type: Type,
        elements: Vec<Term>,
    },
    All,
--- a/src/query/query_parser/query_parser.rs
+++ b/src/query/query_parser/query_parser.rs
@@ -832,17 +832,11 @@ impl QueryParser {
                let (field, json_path) = try_tuple!(self
                    .split_full_path(&full_path)
                    .ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone())));
-                let field_entry = self.schema.get_field_entry(field);
-                let value_type = field_entry.field_type().value_type();
                let (elements, errors) = elements
                    .into_iter()
                    .map(|element| self.compute_boundary_term(field, json_path, &element))
                    .partition_result();
-                let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Set {
-                    elements,
-                    field,
-                    value_type,
-                }));
+                let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Set { elements }));
                (Some(logical_ast), errors)
            }
            UserInputLeaf::Exists { .. } => (
--- a/src/schema/document/value.rs
+++ b/src/schema/document/value.rs
@@ -17,15 +17,6 @@ pub trait Value<'a>: Send + Sync + Debug {
    /// Returns the field value represented by an enum which borrows it's data.
    fn as_value(&self) -> ReferenceValue<'a, Self>;

-    #[inline]
-    /// Returns if the value is `null` or not.
-    fn is_null(&self) -> bool {
-        matches!(
-            self.as_value(),
-            ReferenceValue::Leaf(ReferenceValueLeaf::Null)
-        )
-    }
-
    #[inline]
    /// If the Value is a leaf, returns the associated leaf. Returns None otherwise.
    fn as_leaf(&self) -> Option<ReferenceValueLeaf<'a>> {
@@ -117,18 +108,6 @@ pub trait Value<'a>: Send + Sync + Debug {
            None
        }
    }
-
-    #[inline]
-    /// Returns true if the Value is an array.
-    fn is_array(&self) -> bool {
-        matches!(self.as_value(), ReferenceValue::Object(_))
-    }
-
-    #[inline]
-    /// Returns true if the Value is an object.
-    fn is_object(&self) -> bool {
-        matches!(self.as_value(), ReferenceValue::Object(_))
-    }
 }

 /// A enum representing a leaf value for tantivy to index.