Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2025-12-28 04:52:55 +00:00)

Compare commits: columnread...fix_estima (11 commits)

| SHA1 |
|---|
| ba3215b469 |
| dac7da780e |
| 20c87903b2 |
| f9c3947803 |
| e9a384bb15 |
| d231671fe2 |
| fa3d786a2f |
| 75aafeeb9b |
| 6f066c7f65 |
| 22e56aaee3 |
| d641979127 |
@@ -259,11 +259,7 @@ impl BitSet {
         // we do not check saturated els.
         let higher = el / 64u32;
         let lower = el % 64u32;
-        self.len += if self.tinysets[higher as usize].insert_mut(lower) {
-            1
-        } else {
-            0
-        };
+        self.len += u64::from(self.tinysets[higher as usize].insert_mut(lower));
     }

     /// Inserts an element in the `BitSet`
@@ -272,11 +268,7 @@ impl BitSet {
         // we do not check saturated els.
         let higher = el / 64u32;
         let lower = el % 64u32;
-        self.len -= if self.tinysets[higher as usize].remove_mut(lower) {
-            1
-        } else {
-            0
-        };
+        self.len -= u64::from(self.tinysets[higher as usize].remove_mut(lower));
     }

     /// Returns true iff the elements is in the `BitSet`.
@@ -161,8 +161,7 @@ impl FixedSize for u8 {

 impl BinarySerializable for bool {
     fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
-        let val = if *self { 1 } else { 0 };
-        writer.write_u8(val)
+        writer.write_u8(u8::from(*self))
     }
     fn deserialize<R: Read>(reader: &mut R) -> io::Result<bool> {
         let val = reader.read_u8()?;
@@ -3,6 +3,7 @@ use std::io::{self, Write};
 use ownedbytes::OwnedBytes;
 use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};

+use crate::column::EstimateColumn;
 use crate::serialize::NormalizedHeader;
 use crate::{Column, FastFieldCodec, FastFieldCodecType};

@@ -75,7 +76,7 @@ impl FastFieldCodec for BitpackedCodec {
         Ok(())
     }

-    fn estimate(column: &impl Column) -> Option<f32> {
+    fn estimate(column: &EstimateColumn) -> Option<f32> {
         let num_bits = compute_num_bits(column.max_value());
         let num_bits_uncompressed = 64;
         Some(num_bits as f32 / num_bits_uncompressed as f32)
@@ -5,6 +5,7 @@ use common::{BinarySerializable, CountingWriter, DeserializeFrom};
 use ownedbytes::OwnedBytes;
 use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};

+use crate::column::EstimateColumn;
 use crate::line::Line;
 use crate::serialize::NormalizedHeader;
 use crate::{Column, FastFieldCodec, FastFieldCodecType, VecColumn};
@@ -71,7 +72,7 @@ impl FastFieldCodec for BlockwiseLinearCodec {
     }

     // Estimate first_chunk and extrapolate
-    fn estimate(column: &impl crate::Column) -> Option<f32> {
+    fn estimate(column: &EstimateColumn) -> Option<f32> {
         if column.num_vals() < 10 * CHUNK_SIZE as u64 {
             return None;
         }
@@ -100,7 +101,7 @@ impl FastFieldCodec for BlockwiseLinearCodec {
         Some(num_bits as f32 / num_bits_uncompressed as f32)
     }

-    fn serialize(column: &dyn crate::Column, wrt: &mut impl io::Write) -> io::Result<()> {
+    fn serialize(column: &dyn Column, wrt: &mut impl io::Write) -> io::Result<()> {
         // The BitpackedReader assumes a normalized vector.
         assert_eq!(column.min_value(), 0);
         let mut buffer = Vec::with_capacity(CHUNK_SIZE);
@@ -137,6 +137,57 @@ where V: AsRef<[T]> + ?Sized
     }
 }

+// Creates a view over a Column with a limited number of vals. Stats like min max are unchanged
+pub struct EstimateColumn<'a> {
+    column: &'a dyn Column,
+    num_vals: u64,
+}
+impl<'a> EstimateColumn<'a> {
+    pub(crate) fn new(column: &'a dyn Column) -> Self {
+        let limit_num_vals = column.num_vals().min(100_000);
+        Self {
+            column,
+            num_vals: limit_num_vals,
+        }
+    }
+}
+
+impl<'a> Column for EstimateColumn<'a> {
+    fn get_val(&self, idx: u64) -> u64 {
+        (*self.column).get_val(idx)
+    }
+
+    fn min_value(&self) -> u64 {
+        (*self.column).min_value()
+    }
+
+    fn max_value(&self) -> u64 {
+        (*self.column).max_value()
+    }
+
+    fn num_vals(&self) -> u64 {
+        self.num_vals
+    }
+
+    fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
+        Box::new((*self.column).iter().take(self.num_vals as usize))
+    }
+
+    fn get_range(&self, start: u64, output: &mut [u64]) {
+        (*self.column).get_range(start, output)
+    }
+}
+
+impl<'a> From<&'a dyn Column> for EstimateColumn<'a> {
+    fn from(column: &'a dyn Column) -> Self {
+        let limit_num_vals = column.num_vals().min(100_000);
+        Self {
+            column,
+            num_vals: limit_num_vals,
+        }
+    }
+}
+
 struct MonotonicMappingColumn<C, T, Input> {
     from_column: C,
     monotonic_mapping: T,
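Note (not part of the diff): the remaining hunks switch every codec's `estimate` to this capped view. A minimal sketch of the intended call pattern, assuming it lives inside the fastfield-codecs crate's test module where `VecColumn`, `BitpackedCodec`, `FastFieldCodec` and `Column` are in scope; the concrete numbers are illustrative only.

```rust
#[test]
fn estimate_runs_on_a_capped_view() {
    // EstimateColumn::new caps num_vals()/iter() at 100_000 values,
    // while min_value()/max_value() still come from the full column.
    let data: Vec<u64> = (0..1_000_000u64).collect();
    let column = VecColumn::from(data.as_slice());
    let capped = EstimateColumn::new(&column);
    assert_eq!(capped.num_vals(), 100_000);
    assert_eq!(capped.max_value(), column.max_value());
    // Codecs now estimate on the capped view, so the cost of estimation
    // stays bounded even for very large columns.
    let ratio = BitpackedCodec::estimate(&capped).unwrap();
    assert!(ratio > 0.0 && ratio <= 1.0);
}
```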
@@ -11,6 +11,7 @@ use std::io;
 use std::io::Write;
 use std::sync::Arc;

+use column::EstimateColumn;
 use common::BinarySerializable;
 use compact_space::CompactSpaceDecompressor;
 use ownedbytes::OwnedBytes;
@@ -123,7 +124,7 @@ trait FastFieldCodec: 'static {
     ///
     /// The column iterator should be preferred over using column `get_val` method for
     /// performance reasons.
-    fn serialize(column: &dyn Column<u64>, write: &mut impl Write) -> io::Result<()>;
+    fn serialize(column: &dyn Column, write: &mut impl Write) -> io::Result<()>;

     /// Returns an estimate of the compression ratio.
     /// If the codec is not applicable, returns `None`.
@@ -132,7 +133,7 @@ trait FastFieldCodec: 'static {
     ///
     /// It could make sense to also return a value representing
     /// computational complexity.
-    fn estimate(column: &impl Column) -> Option<f32>;
+    fn estimate(column: &EstimateColumn) -> Option<f32>;
 }

 pub const ALL_CODEC_TYPES: [FastFieldCodecType; 3] = [
@@ -149,6 +150,7 @@ mod tests {

     use crate::bitpacked::BitpackedCodec;
     use crate::blockwise_linear::BlockwiseLinearCodec;
+    use crate::column::EstimateColumn;
     use crate::linear::LinearCodec;
     use crate::serialize::Header;

@@ -159,7 +161,9 @@ mod tests {
         let col = &VecColumn::from(data);
         let header = Header::compute_header(col, &[Codec::CODEC_TYPE])?;
         let normalized_col = header.normalize_column(col);
-        let estimation = Codec::estimate(&normalized_col)?;
+
+        let limited_column = EstimateColumn::new(&normalized_col);
+        let estimation = Codec::estimate(&limited_column)?;

         let mut out = Vec::new();
         let col = VecColumn::from(data);
@@ -280,14 +284,16 @@ mod tests {
         let data = (10..=20000_u64).collect::<Vec<_>>();
         let data: VecColumn = data.as_slice().into();

-        let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
+        let linear_interpol_estimation =
+            LinearCodec::estimate(&EstimateColumn::new(&data)).unwrap();
         assert_le!(linear_interpol_estimation, 0.01);

-        let multi_linear_interpol_estimation = BlockwiseLinearCodec::estimate(&data).unwrap();
+        let multi_linear_interpol_estimation =
+            BlockwiseLinearCodec::estimate(&EstimateColumn::new(&data)).unwrap();
         assert_le!(multi_linear_interpol_estimation, 0.2);
         assert_lt!(linear_interpol_estimation, multi_linear_interpol_estimation);

-        let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
+        let bitpacked_estimation = BitpackedCodec::estimate(&EstimateColumn::new(&data)).unwrap();
         assert_lt!(linear_interpol_estimation, bitpacked_estimation);
     }
     #[test]
@@ -295,18 +301,20 @@ mod tests {
         let data: &[u64] = &[200, 10, 10, 10, 10, 1000, 20];

         let data: VecColumn = data.into();
-        let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
+        let linear_interpol_estimation =
+            LinearCodec::estimate(&EstimateColumn::new(&data)).unwrap();
         assert_le!(linear_interpol_estimation, 0.34);

-        let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
+        let bitpacked_estimation = BitpackedCodec::estimate(&EstimateColumn::new(&data)).unwrap();
         assert_lt!(bitpacked_estimation, linear_interpol_estimation);
     }

     #[test]
     fn estimation_prefer_bitpacked() {
         let data = VecColumn::from(&[10, 10, 10, 10]);
-        let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
-        let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
+        let linear_interpol_estimation =
+            LinearCodec::estimate(&EstimateColumn::new(&data)).unwrap();
+        let bitpacked_estimation = BitpackedCodec::estimate(&EstimateColumn::new(&data)).unwrap();
         assert_lt!(bitpacked_estimation, linear_interpol_estimation);
     }

@@ -318,10 +326,11 @@ mod tests {

         // in this case the linear interpolation can't in fact not be worse than bitpacking,
         // but the estimator adds some threshold, which leads to estimated worse behavior
-        let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
+        let linear_interpol_estimation =
+            LinearCodec::estimate(&EstimateColumn::new(&data)).unwrap();
         assert_le!(linear_interpol_estimation, 0.35);

-        let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
+        let bitpacked_estimation = BitpackedCodec::estimate(&EstimateColumn::new(&data)).unwrap();
         assert_le!(bitpacked_estimation, 0.32);
         assert_le!(bitpacked_estimation, linear_interpol_estimation);
     }
@@ -67,13 +67,11 @@ impl Line {
         self.intercept.wrapping_add(linear_part)
     }

-    // Same as train, but the intercept is only estimated from provided sample positions
-    pub fn estimate(ys: &dyn Column, sample_positions: &[u64]) -> Self {
-        Self::train_from(ys, sample_positions.iter().cloned())
-    }
-
     // Intercept is only computed from provided positions
-    fn train_from(ys: &dyn Column, positions: impl Iterator<Item = u64>) -> Self {
+    pub fn train_from(
+        ys: &dyn Column,
+        positions_and_values: impl Iterator<Item = (u64, u64)>,
+    ) -> Self {
         let num_vals = if let Some(num_vals) = NonZeroU64::new(ys.num_vals() - 1) {
             num_vals
         } else {
@@ -114,11 +112,8 @@ impl Line {
             intercept: 0,
         };
         let heuristic_shift = y0.wrapping_sub(MID_POINT);
-        line.intercept = positions
-            .map(|pos| {
-                let y = ys.get_val(pos);
-                y.wrapping_sub(line.eval(pos))
-            })
+        line.intercept = positions_and_values
+            .map(|(pos, y)| y.wrapping_sub(line.eval(pos)))
             .min_by_key(|&val| val.wrapping_sub(heuristic_shift))
             .unwrap_or(0u64); //< Never happens.
         line
@@ -135,7 +130,10 @@ impl Line {
     /// This function is only invariable by translation if all of the
     /// `ys` are packaged into half of the space. (See heuristic below)
     pub fn train(ys: &dyn Column) -> Self {
-        Self::train_from(ys, 0..ys.num_vals())
+        Self::train_from(
+            ys,
+            ys.iter().enumerate().map(|(pos, val)| (pos as u64, val)),
+        )
     }
 }

@@ -4,6 +4,7 @@ use common::BinarySerializable;
 use ownedbytes::OwnedBytes;
 use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};

+use crate::column::EstimateColumn;
 use crate::line::Line;
 use crate::serialize::NormalizedHeader;
 use crate::{Column, FastFieldCodec, FastFieldCodecType};
@@ -121,23 +122,23 @@ impl FastFieldCodec for LinearCodec {
     /// where the local maxima for the deviation of the calculated value are and
     /// the offset to shift all values to >=0 is also unknown.
     #[allow(clippy::question_mark)]
-    fn estimate(column: &impl Column) -> Option<f32> {
+    fn estimate(column: &EstimateColumn) -> Option<f32> {
         if column.num_vals() < 3 {
             return None; // disable compressor for this case
         }

         // let's sample at 0%, 5%, 10% .. 95%, 100%
         let num_vals = column.num_vals() as f32 / 100.0;
-        let sample_positions = (0..20)
+        let sample_positions_and_values = (0..20)
             .map(|pos| (num_vals * pos as f32 * 5.0) as u64)
+            .map(|pos| (pos, column.get_val(pos)))
             .collect::<Vec<_>>();

-        let line = Line::estimate(column, &sample_positions);
+        let line = { Line::train_from(column, sample_positions_and_values.iter().cloned()) };

-        let estimated_bit_width = sample_positions
+        let estimated_bit_width = sample_positions_and_values
             .into_iter()
-            .map(|pos| {
-                let actual_value = column.get_val(pos);
+            .map(|(pos, actual_value)| {
                 let interpolated_val = line.eval(pos as u64);
                 actual_value.wrapping_sub(interpolated_val)
             })
@@ -36,11 +36,7 @@ impl MonotonicallyMappableToU64 for i64 {
 impl MonotonicallyMappableToU64 for bool {
     #[inline(always)]
     fn to_u64(self) -> u64 {
-        if self {
-            1
-        } else {
-            0
-        }
+        u64::from(self)
     }

     #[inline(always)]
@@ -28,6 +28,7 @@ use ownedbytes::OwnedBytes;

 use crate::bitpacked::BitpackedCodec;
 use crate::blockwise_linear::BlockwiseLinearCodec;
+use crate::column::EstimateColumn;
 use crate::compact_space::CompactSpaceCompressor;
 use crate::linear::LinearCodec;
 use crate::{
@@ -125,23 +126,6 @@ impl BinarySerializable for Header {
     }
 }

-pub fn estimate<T: MonotonicallyMappableToU64>(
-    typed_column: impl Column<T>,
-    codec_type: FastFieldCodecType,
-) -> Option<f32> {
-    let column = monotonic_map_column(typed_column, T::to_u64);
-    let min_value = column.min_value();
-    let gcd = crate::gcd::find_gcd(column.iter().map(|val| val - min_value))
-        .filter(|gcd| gcd.get() > 1u64);
-    let divider = DividerU64::divide_by(gcd.map(|gcd| gcd.get()).unwrap_or(1u64));
-    let normalized_column = monotonic_map_column(&column, |val| divider.divide(val - min_value));
-    match codec_type {
-        FastFieldCodecType::Bitpacked => BitpackedCodec::estimate(&normalized_column),
-        FastFieldCodecType::Linear => LinearCodec::estimate(&normalized_column),
-        FastFieldCodecType::BlockwiseLinear => BlockwiseLinearCodec::estimate(&normalized_column),
-    }
-}
-
 pub fn serialize_u128(
     typed_column: impl Column<u128>,
     output: &mut impl io::Write,
@@ -177,10 +161,29 @@ pub fn serialize<T: MonotonicallyMappableToU64>(
     Ok(())
 }

+pub fn estimate<T: MonotonicallyMappableToU64>(
+    typed_column: impl Column<T>,
+    codec_type: FastFieldCodecType,
+) -> Option<f32> {
+    let column = monotonic_map_column(typed_column, T::to_u64);
+    let min_value = column.min_value();
+    let gcd = crate::gcd::find_gcd(column.iter().map(|val| val - min_value))
+        .filter(|gcd| gcd.get() > 1u64);
+    let divider = DividerU64::divide_by(gcd.map(|gcd| gcd.get()).unwrap_or(1u64));
+    let normalized_column = monotonic_map_column(&column, |val| divider.divide(val - min_value));
+    let estimate_column = EstimateColumn::new(&normalized_column);
+    match codec_type {
+        FastFieldCodecType::Bitpacked => BitpackedCodec::estimate(&estimate_column),
+        FastFieldCodecType::Linear => LinearCodec::estimate(&estimate_column),
+        FastFieldCodecType::BlockwiseLinear => BlockwiseLinearCodec::estimate(&estimate_column),
+    }
+}
+
 fn detect_codec(
     column: impl Column<u64>,
     codecs: &[FastFieldCodecType],
 ) -> Option<FastFieldCodecType> {
+    let column: EstimateColumn = EstimateColumn::new(&column);
     let mut estimations = Vec::new();
     for &codec in codecs {
         let estimation_opt = match codec {
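As a quick orientation (a hedged sketch, not something from this changeset): after the move, the crate-level `estimate` helper normalizes the column, wraps it in `EstimateColumn`, and only then asks the codec for a ratio, so at most 100_000 values are visited. Assuming it runs inside the same crate with the `VecColumn` test helper in scope:

```rust
// Illustrative call only; names and numbers are assumptions, not part of the diff.
let vals: Vec<u64> = (0..500_000u64).map(|i| 1_000 + 8 * i).collect();
let column = VecColumn::from(vals.as_slice());
// The helper subtracts the minimum, divides out the common GCD (8 here),
// caps the sampled values via EstimateColumn, then dispatches to the codec.
let ratio = estimate(column, FastFieldCodecType::Bitpacked);
assert!(ratio.unwrap() <= 1.0);
```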
@@ -425,7 +425,7 @@ impl SegmentHistogramCollector {
         let bucket = &mut self.buckets[bucket_pos];
         bucket.doc_count += 1;
         if let Some(sub_aggregation) = self.sub_aggregations.as_mut() {
-            (&mut sub_aggregation[bucket_pos]).collect(doc, bucket_with_accessor)?;
+            sub_aggregation[bucket_pos].collect(doc, bucket_with_accessor)?;
         }
         Ok(())
     }
@@ -57,7 +57,7 @@ impl SegmentId {
     /// Picking the first 8 chars is ok to identify
     /// segments in a display message (e.g. a5c4dfcb).
     pub fn short_uuid_string(&self) -> String {
-        (&self.0.as_simple().to_string()[..8]).to_string()
+        self.0.as_simple().to_string()[..8].to_string()
     }

     /// Returns a segment uuid string.
@@ -472,6 +472,8 @@ mod tests {
     // There are more tests in directory/mod.rs
     // The following tests are specific to the MmapDirectory

+    use std::time::Duration;
+
     use common::HasLen;

     use super::*;
@@ -610,7 +612,14 @@ mod tests {
                 mmap_directory.get_cache_info().mmapped.len()
             );
         }
-        assert!(mmap_directory.get_cache_info().mmapped.is_empty());
-        Ok(())
+        // This test failed on CI. The last Mmap is dropped from the merging thread so there might
+        // be a race condition indeed.
+        for _ in 0..10 {
+            if mmap_directory.get_cache_info().mmapped.is_empty() {
+                return Ok(());
+            }
+            std::thread::sleep(Duration::from_millis(200));
+        }
+        panic!("The cache still contains information. One of the Mmap has not been dropped.");
     }
 }
@@ -136,6 +136,20 @@ impl RamDirectory {
         Self::default()
     }

+    /// Deep clones the directory.
+    ///
+    /// Ulterior writes on one of the copy
+    /// will not affect the other copy.
+    pub fn deep_clone(&self) -> RamDirectory {
+        let inner_clone = InnerDirectory {
+            fs: self.fs.read().unwrap().fs.clone(),
+            watch_router: Default::default(),
+        };
+        RamDirectory {
+            fs: Arc::new(RwLock::new(inner_clone)),
+        }
+    }
+
     /// Returns the sum of the size of the different files
     /// in the [`RamDirectory`].
     pub fn total_mem_usage(&self) -> usize {
@@ -256,4 +270,23 @@ mod tests {
         assert_eq!(directory_copy.atomic_read(path_atomic).unwrap(), msg_atomic);
         assert_eq!(directory_copy.atomic_read(path_seq).unwrap(), msg_seq);
     }
+
+    #[test]
+    fn test_ram_directory_deep_clone() {
+        let dir = RamDirectory::default();
+        let test = Path::new("test");
+        let test2 = Path::new("test2");
+        dir.atomic_write(test, b"firstwrite").unwrap();
+        let dir_clone = dir.deep_clone();
+        assert_eq!(
+            dir_clone.atomic_read(test).unwrap(),
+            dir.atomic_read(test).unwrap()
+        );
+        dir.atomic_write(test, b"original").unwrap();
+        dir_clone.atomic_write(test, b"clone").unwrap();
+        dir_clone.atomic_write(test2, b"clone2").unwrap();
+        assert_eq!(dir.atomic_read(test).unwrap(), b"original");
+        assert_eq!(&dir_clone.atomic_read(test).unwrap(), b"clone");
+        assert_eq!(&dir_clone.atomic_read(test2).unwrap(), b"clone2");
+    }
 }
@@ -402,6 +402,74 @@ mod bench {
     use crate::schema::{Cardinality, NumericOptions, Schema};
     use crate::Document;

+    fn bench_multi_value_ff_merge_opt(
+        num_docs: usize,
+        segments_every_n_docs: usize,
+        merge_policy: impl crate::indexer::MergePolicy + 'static,
+    ) {
+        let mut builder = crate::schema::SchemaBuilder::new();
+
+        let fast_multi =
+            crate::schema::NumericOptions::default().set_fast(Cardinality::MultiValues);
+        let multi_field = builder.add_f64_field("f64s", fast_multi);
+
+        let index = crate::Index::create_in_ram(builder.build());
+
+        let mut writer = index.writer_for_tests().unwrap();
+        writer.set_merge_policy(Box::new(merge_policy));
+
+        for i in 0..num_docs {
+            let mut doc = crate::Document::new();
+            doc.add_f64(multi_field, 0.24);
+            doc.add_f64(multi_field, 0.27);
+            doc.add_f64(multi_field, 0.37);
+            if i % 3 == 0 {
+                doc.add_f64(multi_field, 0.44);
+            }
+
+            writer.add_document(doc).unwrap();
+            if i % segments_every_n_docs == 0 {
+                writer.commit().unwrap();
+            }
+        }
+
+        {
+            writer.wait_merging_threads().unwrap();
+            let mut writer = index.writer_for_tests().unwrap();
+            let segment_ids = index.searchable_segment_ids().unwrap();
+            writer.merge(&segment_ids).wait().unwrap();
+        }
+
+        // If a merging thread fails, we should end up with more
+        // than one segment here
+        assert_eq!(1, index.searchable_segments().unwrap().len());
+    }
+
+    #[bench]
+    fn bench_multi_value_ff_merge_many_segments(b: &mut Bencher) {
+        let num_docs = 100_000;
+        b.iter(|| {
+            bench_multi_value_ff_merge_opt(num_docs, 1_000, crate::indexer::NoMergePolicy);
+        });
+    }
+
+    #[bench]
+    fn bench_multi_value_ff_merge_many_segments_log_merge(b: &mut Bencher) {
+        let num_docs = 100_000;
+        b.iter(|| {
+            let merge_policy = crate::indexer::LogMergePolicy::default();
+            bench_multi_value_ff_merge_opt(num_docs, 1_000, merge_policy);
+        });
+    }
+
+    #[bench]
+    fn bench_multi_value_ff_merge_few_segments(b: &mut Bencher) {
+        let num_docs = 100_000;
+        b.iter(|| {
+            bench_multi_value_ff_merge_opt(num_docs, 33_000, crate::indexer::NoMergePolicy);
+        });
+    }
+
     fn multi_values(num_docs: usize, vals_per_doc: usize) -> Vec<Vec<u64>> {
         let mut vals = vec![];
         for _i in 0..num_docs {
@@ -246,18 +246,27 @@ impl DeleteCursor {
 mod tests {

     use super::{DeleteOperation, DeleteQueue};
-    use crate::schema::{Field, Term};
+    use crate::query::{Explanation, Scorer, Weight};
+    use crate::{DocId, Score, SegmentReader};
+
+    struct DummyWeight;
+    impl Weight for DummyWeight {
+        fn scorer(&self, _reader: &SegmentReader, _boost: Score) -> crate::Result<Box<dyn Scorer>> {
+            Err(crate::TantivyError::InternalError("dummy impl".to_owned()))
+        }
+
+        fn explain(&self, _reader: &SegmentReader, _doc: DocId) -> crate::Result<Explanation> {
+            Err(crate::TantivyError::InternalError("dummy impl".to_owned()))
+        }
+    }

     #[test]
     fn test_deletequeue() {
         let delete_queue = DeleteQueue::new();

-        let make_op = |i: usize| {
-            let field = Field::from_field_id(1u32);
-            DeleteOperation {
-                opstamp: i as u64,
-                term: Term::from_field_u64(field, i as u64),
-            }
+        let make_op = |i: usize| DeleteOperation {
+            opstamp: i as u64,
+            target: Box::new(DummyWeight),
         };

         delete_queue.push(make_op(1));
@@ -11,7 +11,6 @@ use super::segment_updater::SegmentUpdater;
 use super::{AddBatch, AddBatchReceiver, AddBatchSender, PreparedCommit};
 use crate::core::{Index, Segment, SegmentComponent, SegmentId, SegmentMeta, SegmentReader};
 use crate::directory::{DirectoryLock, GarbageCollectionResult, TerminatingWrite};
-use crate::docset::{DocSet, TERMINATED};
 use crate::error::TantivyError;
 use crate::fastfield::write_alive_bitset;
 use crate::indexer::delete_queue::{DeleteCursor, DeleteQueue};
@@ -20,8 +19,9 @@ use crate::indexer::index_writer_status::IndexWriterStatus;
 use crate::indexer::operation::DeleteOperation;
 use crate::indexer::stamper::Stamper;
 use crate::indexer::{MergePolicy, SegmentEntry, SegmentWriter};
+use crate::query::{Query, TermQuery};
 use crate::schema::{Document, IndexRecordOption, Term};
-use crate::{FutureResult, Opstamp};
+use crate::{FutureResult, IndexReader, Opstamp};

 // Size of the margin for the `memory_arena`. A segment is closed when the remaining memory
 // in the `memory_arena` goes below MARGIN_IN_BYTES.
@@ -57,6 +57,7 @@ pub struct IndexWriter {
     _directory_lock: Option<DirectoryLock>,

     index: Index,
+    index_reader: IndexReader,

     memory_arena_in_bytes_per_thread: usize,

@@ -92,19 +93,14 @@ fn compute_deleted_bitset(

         // A delete operation should only affect
         // document that were inserted before it.
-        let inverted_index = segment_reader.inverted_index(delete_op.term.field())?;
-        if let Some(mut docset) =
-            inverted_index.read_postings(&delete_op.term, IndexRecordOption::Basic)?
-        {
-            let mut doc_matching_deleted_term = docset.doc();
-            while doc_matching_deleted_term != TERMINATED {
-                if doc_opstamps.is_deleted(doc_matching_deleted_term, delete_op.opstamp) {
-                    alive_bitset.remove(doc_matching_deleted_term);
+        delete_op
+            .target
+            .for_each(segment_reader, &mut |doc_matching_delete_query, _| {
+                if doc_opstamps.is_deleted(doc_matching_delete_query, delete_op.opstamp) {
+                    alive_bitset.remove(doc_matching_delete_query);
                     might_have_changed = true;
                 }
-                doc_matching_deleted_term = docset.advance();
-            }
-        }
+            })?;
         delete_cursor.advance();
     }
     Ok(might_have_changed)
@@ -302,6 +298,7 @@ impl IndexWriter {

             memory_arena_in_bytes_per_thread,
             index: index.clone(),
+            index_reader: index.reader()?,

             index_writer_status: IndexWriterStatus::from(document_receiver),
             operation_sender: document_sender,
@@ -666,10 +663,33 @@ impl IndexWriter {
     /// Like adds, the deletion itself will be visible
     /// only after calling `commit()`.
     pub fn delete_term(&self, term: Term) -> Opstamp {
+        let query = TermQuery::new(term, IndexRecordOption::Basic);
+        // For backward compatibility, if Term is invalid for the index, do nothing but return an
+        // Opstamp
+        self.delete_query(Box::new(query))
+            .unwrap_or_else(|_| self.stamper.stamp())
+    }
+
+    /// Delete all documents matching a given query.
+    /// Returns an `Err` if the query can't be executed.
+    ///
+    /// Delete operation only affects documents that
+    /// were added in previous commits, and documents
+    /// that were added previously in the same commit.
+    ///
+    /// Like adds, the deletion itself will be visible
+    /// only after calling `commit()`.
+    #[doc(hidden)]
+    pub fn delete_query(&self, query: Box<dyn Query>) -> crate::Result<Opstamp> {
+        let weight = query.weight(&self.index_reader.searcher(), false)?;
+
         let opstamp = self.stamper.stamp();
-        let delete_operation = DeleteOperation { opstamp, term };
+        let delete_operation = DeleteOperation {
+            opstamp,
+            target: weight,
+        };
         self.delete_queue.push(delete_operation);
-        opstamp
+        Ok(opstamp)
     }

     /// Returns the opstamp of the last successful commit.
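For context (a hedged usage sketch, not part of the diff): with this change, arbitrary queries can drive deletions and `delete_term` becomes a thin wrapper over `delete_query`. Assuming an existing `index_writer` and an indexed text field `title_field`:

```rust
// Hypothetical illustration of the new delete-by-query entry point.
let term = Term::from_field_text(title_field, "obsolete");
let query = TermQuery::new(term, IndexRecordOption::Basic);
// Any Box<dyn Query> is accepted; the returned Opstamp orders the delete
// relative to other operations.
index_writer.delete_query(Box::new(query))?;
// As with delete_term, the deletion becomes visible only after commit().
index_writer.commit()?;
```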
@@ -738,10 +758,17 @@ impl IndexWriter {
         let (batch_opstamp, stamps) = self.get_batch_opstamps(count);

         let mut adds = AddBatch::default();

         for (user_op, opstamp) in user_operations_it.zip(stamps) {
             match user_op {
                 UserOperation::Delete(term) => {
-                    let delete_operation = DeleteOperation { opstamp, term };
+                    let query = TermQuery::new(term, IndexRecordOption::Basic);
+                    let weight = query.weight(&self.index_reader.searcher(), false)?;
+
+                    let delete_operation = DeleteOperation {
+                        opstamp,
+                        target: weight,
+                    };
                     self.delete_queue.push(delete_operation);
                 }
                 UserOperation::Add(document) => {
@@ -786,7 +813,7 @@ mod tests {
     use crate::directory::error::LockError;
     use crate::error::*;
     use crate::indexer::NoMergePolicy;
-    use crate::query::{QueryParser, TermQuery};
+    use crate::query::{BooleanQuery, Occur, Query, QueryParser, TermQuery};
     use crate::schema::{
         self, Cardinality, Facet, FacetOptions, IndexRecordOption, NumericOptions,
         TextFieldIndexing, TextOptions, FAST, INDEXED, STORED, STRING, TEXT,
@@ -1418,10 +1445,72 @@ mod tests {
         Ok(())
     }

+    #[test]
+    fn test_delete_query_with_sort_by_field() -> crate::Result<()> {
+        let mut schema_builder = schema::Schema::builder();
+        let id_field =
+            schema_builder.add_u64_field("id", schema::INDEXED | schema::STORED | schema::FAST);
+        let schema = schema_builder.build();
+
+        let settings = IndexSettings {
+            sort_by_field: Some(IndexSortByField {
+                field: "id".to_string(),
+                order: Order::Desc,
+            }),
+            ..Default::default()
+        };
+
+        let index = Index::builder()
+            .schema(schema)
+            .settings(settings)
+            .create_in_ram()?;
+        let index_reader = index.reader()?;
+        let mut index_writer = index.writer_for_tests()?;
+
+        // create and delete docs in same commit
+        for id in 0u64..5u64 {
+            index_writer.add_document(doc!(id_field => id))?;
+        }
+        for id in 1u64..4u64 {
+            let term = Term::from_field_u64(id_field, id);
+            let not_term = Term::from_field_u64(id_field, 2);
+            let term = Box::new(TermQuery::new(term, Default::default()));
+            let not_term = Box::new(TermQuery::new(not_term, Default::default()));
+
+            let query: BooleanQuery = vec![
+                (Occur::Must, term as Box<dyn Query>),
+                (Occur::MustNot, not_term as Box<dyn Query>),
+            ]
+            .into();
+
+            index_writer.delete_query(Box::new(query))?;
+        }
+        for id in 5u64..10u64 {
+            index_writer.add_document(doc!(id_field => id))?;
+        }
+        index_writer.commit()?;
+        index_reader.reload()?;
+
+        let searcher = index_reader.searcher();
+        assert_eq!(searcher.segment_readers().len(), 1);
+
+        let segment_reader = searcher.segment_reader(0);
+        assert_eq!(segment_reader.num_docs(), 8);
+        assert_eq!(segment_reader.max_doc(), 10);
+        let fast_field_reader = segment_reader.fast_fields().u64(id_field)?;
+        let in_order_alive_ids: Vec<u64> = segment_reader
+            .doc_ids_alive()
+            .map(|doc| fast_field_reader.get_val(doc as u64))
+            .collect();
+        assert_eq!(&in_order_alive_ids[..], &[9, 8, 7, 6, 5, 4, 2, 0]);
+        Ok(())
+    }
+
     #[derive(Debug, Clone, Copy)]
     enum IndexingOp {
         AddDoc { id: u64 },
         DeleteDoc { id: u64 },
+        DeleteDocQuery { id: u64 },
         Commit,
         Merge,
     }
@@ -1429,6 +1518,7 @@ mod tests {
     fn balanced_operation_strategy() -> impl Strategy<Value = IndexingOp> {
         prop_oneof![
             (0u64..20u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
+            (0u64..20u64).prop_map(|id| IndexingOp::DeleteDocQuery { id }),
             (0u64..20u64).prop_map(|id| IndexingOp::AddDoc { id }),
             (0u64..1u64).prop_map(|_| IndexingOp::Commit),
             (0u64..1u64).prop_map(|_| IndexingOp::Merge),
@@ -1437,7 +1527,8 @@ mod tests {

     fn adding_operation_strategy() -> impl Strategy<Value = IndexingOp> {
         prop_oneof![
-            10 => (0u64..100u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
+            5 => (0u64..100u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
+            5 => (0u64..100u64).prop_map(|id| IndexingOp::DeleteDocQuery { id }),
             50 => (0u64..100u64).prop_map(|id| IndexingOp::AddDoc { id }),
             2 => (0u64..1u64).prop_map(|_| IndexingOp::Commit),
             1 => (0u64..1u64).prop_map(|_| IndexingOp::Merge),
@@ -1457,6 +1548,10 @@ mod tests {
                 existing_ids.remove(&id);
                 deleted_ids.insert(id);
             }
+            IndexingOp::DeleteDocQuery { id } => {
+                existing_ids.remove(&id);
+                deleted_ids.insert(id);
+            }
             _ => {}
         }
     }
@@ -1539,6 +1634,11 @@ mod tests {
                 IndexingOp::DeleteDoc { id } => {
                     index_writer.delete_term(Term::from_field_u64(id_field, id));
                 }
+                IndexingOp::DeleteDocQuery { id } => {
+                    let term = Term::from_field_u64(id_field, id);
+                    let query = TermQuery::new(term, Default::default());
+                    index_writer.delete_query(Box::new(query))?;
+                }
                 IndexingOp::Commit => {
                     index_writer.commit()?;
                 }
@@ -1,20 +1,11 @@
+use crate::query::Weight;
 use crate::schema::{Document, Term};
 use crate::Opstamp;

 /// Timestamped Delete operation.
-#[derive(Clone, Eq, PartialEq, Debug)]
 pub struct DeleteOperation {
     pub opstamp: Opstamp,
-    pub term: Term,
-}
-
-impl Default for DeleteOperation {
-    fn default() -> Self {
-        DeleteOperation {
-            opstamp: 0u64,
-            term: Term::new(),
-        }
-    }
-}
+    pub target: Box<dyn Weight>,
+}

 /// Timestamped Add operation.