fix: restore clone bound on doc id iterator

Co-authored-by: Cursor <cursoragent@cursor.com>
docs: clarify manual doc id mapping serialization
2026-06-29 13:50:43 +00:00 · 2026-06-29 11:31:38 +02:00 · 2026-06-29 11:15:53 +02:00 · 2026-06-29 11:13:35 +02:00 · 2026-06-29 10:52:41 +02:00 · 2026-06-26 17:56:54 +02:00
16 changed files with 301 additions and 1508 deletions
--- a/benches/agg_bench.rs
+++ b/benches/agg_bench.rs
@@ -81,11 +81,6 @@ fn bench_agg(mut group: InputGroup<Index>) {
    register!(group, composite_histogram);
    register!(group, composite_histogram_calendar);

-    // multi_terms aggregation benchmarks
-    register!(group, multi_terms_status_with_zipf_1000);
-    register!(group, multi_terms_zipf_1000_with_status);
-    register!(group, multi_terms_status_with_zipf_1000_sub_agg);
-
    register!(group, cardinality_agg);
    register!(group, cardinality_agg_high_card);
    register!(group, cardinality_agg_low_card);
@@ -573,58 +568,6 @@ fn composite_histogram_calendar(index: &Index) {
    execute_agg(index, agg_req);
 }

-/// multi_terms equivalent of terms_status_with_terms_zipf_1000_sub_agg:
-/// flat GroupBy(status, zipf_1000) vs nested terms(status) -> terms(zipf_1000)
-fn multi_terms_status_with_zipf_1000(index: &Index) {
-    let agg_req = json!({
-        "mt": {
-            "multi_terms": {
-                "terms": [
-                    {"field": "text_few_terms_status"},
-                    {"field": "text_1000_terms_zipf"}
-                ],
-                "size": 10
-            }
-        }
-    });
-    execute_agg(index, agg_req);
-}
-
-/// multi_terms equivalent of terms_zipf_1000_with_terms_status_sub_agg:
-/// flat GroupBy(zipf_1000, status) vs nested terms(zipf_1000) -> terms(status)
-fn multi_terms_zipf_1000_with_status(index: &Index) {
-    let agg_req = json!({
-        "mt": {
-            "multi_terms": {
-                "terms": [
-                    {"field": "text_1000_terms_zipf"},
-                    {"field": "text_few_terms_status"}
-                ],
-                "size": 100
-            }
-        }
-    });
-    execute_agg(index, agg_req);
-}
-
-/// multi_terms on the same field pair as the nested benchmarks, with an avg sub-aggregation
-fn multi_terms_status_with_zipf_1000_sub_agg(index: &Index) {
-    let agg_req = json!({
-        "mt": {
-            "multi_terms": {
-                "terms": [
-                    {"field": "text_few_terms_status"},
-                    {"field": "text_1000_terms_zipf"}
-                ]
-            },
-            "aggs": {
-                "average_f64": { "avg": { "field": "score_f64" } }
-            }
-        }
-    });
-    execute_agg(index, agg_req);
-}
-
 fn execute_agg(index: &Index, agg_req: serde_json::Value) {
    let agg_req: Aggregations = serde_json::from_value(agg_req).unwrap();
    let collector = get_collector(agg_req);
--- a/columnar/benches/common.rs
+++ b/columnar/benches/common.rs
@@ -54,6 +54,6 @@ pub fn generate_columnar_with_name(card: Card, num_docs: u32, column_name: &str)
    }

    let mut wrt: Vec<u8> = Vec::new();
-    columnar_writer.serialize(num_docs, &mut wrt).unwrap();
+    columnar_writer.serialize(num_docs, None, &mut wrt).unwrap();
    ColumnarReader::open(wrt).unwrap()
 }
--- a/common/src/bitset.rs
+++ b/common/src/bitset.rs
@@ -281,12 +281,16 @@ impl BitSet {
    }

    /// Inserts an element in the `BitSet`
+    ///
+    /// Returns true if the set changed.
    #[inline]
-    pub fn insert(&mut self, el: u32) {
+    pub fn insert(&mut self, el: u32) -> bool {
        // we do not check saturated els.
        let higher = el / 64u32;
        let lower = el % 64u32;
-        self.len += u64::from(self.tinysets[higher as usize].insert_mut(lower));
+        let changed = self.tinysets[higher as usize].insert_mut(lower);
+        self.len += u64::from(changed);
+        changed
    }

    /// Inserts an element in the `BitSet`
--- a/src/aggregation/agg_data.rs
+++ b/src/aggregation/agg_data.rs
@@ -13,9 +13,8 @@ use crate::aggregation::bucket::{
    build_segment_filter_collector, build_segment_histogram_collector,
    build_segment_range_collector, CompositeAggReqData, CompositeAggregation,
    CompositeSourceAccessors, FilterAggReqData, HistogramAggReqData, HistogramBounds,
-    IncludeExcludeParam, MissingTermAggReqData, MultiTermsAggReqData, MultiTermsAggregation,
-    MultiTermsFieldAccessors, RangeAggReqData, SegmentMultiTermsCollector, TermMissingAgg,
-    TermsAggReqData, TermsAggregation, TermsAggregationInternal,
+    IncludeExcludeParam, MissingTermAggReqData, RangeAggReqData, TermMissingAgg, TermsAggReqData,
+    TermsAggregation, TermsAggregationInternal,
 };
 use crate::aggregation::metric::{
    build_segment_stats_collector, AverageAggregation, CardinalityAggReqData,
@@ -77,10 +76,6 @@ impl AggregationsSegmentCtx {
        self.per_request.composite_req_data.push(data);
        self.per_request.composite_req_data.len() - 1
    }
-    pub(crate) fn push_multi_terms_req_data(&mut self, data: MultiTermsAggReqData) -> usize {
-        self.per_request.multi_terms_req_data.push(data);
-        self.per_request.multi_terms_req_data.len() - 1
-    }

    #[inline]
    pub(crate) fn get_term_req_data(&self, idx: usize) -> &TermsAggReqData {
@@ -130,8 +125,6 @@ pub struct PerRequestAggSegCtx {
    pub missing_term_req_data: Vec<MissingTermAggReqData>,
    /// CompositeAggReqData contains the request data for a composite aggregation.
    pub composite_req_data: Vec<CompositeAggReqData>,
-    /// MultiTermsAggReqData contains the request data for a multi_terms aggregation.
-    pub multi_terms_req_data: Vec<MultiTermsAggReqData>,

    /// Request tree used to build collectors.
    pub agg_tree: Vec<AggRefNode>,
@@ -184,11 +177,6 @@ impl PerRequestAggSegCtx {
                .iter()
                .map(|t| t.get_memory_consumption())
                .sum::<usize>()
-            + self
-                .multi_terms_req_data
-                .iter()
-                .map(|t| t.get_memory_consumption())
-                .sum::<usize>()
            + self.agg_tree.len() * std::mem::size_of::<AggRefNode>()
    }

@@ -206,7 +194,6 @@ impl PerRequestAggSegCtx {
            AggKind::Range => self.range_req_data[idx].name.as_str(),
            AggKind::Filter => self.filter_req_data[idx].name.as_str(),
            AggKind::Composite => self.composite_req_data[idx].name.as_str(),
-            AggKind::MultiTerms => self.multi_terms_req_data[idx].name.as_str(),
        }
    }

@@ -360,9 +347,6 @@ pub(crate) fn build_segment_agg_collector(
                req, node,
            )?,
        )),
-        AggKind::MultiTerms => Ok(Box::new(SegmentMultiTermsCollector::from_req_and_validate(
-            req, node,
-        )?)),
    }
 }

@@ -394,7 +378,6 @@ pub enum AggKind {
    Range,
    Filter,
    Composite,
-    MultiTerms,
 }

 impl AggKind {
@@ -411,7 +394,6 @@ impl AggKind {
            AggKind::Range => "Range",
            AggKind::Filter => "Filter",
            AggKind::Composite => "Composite",
-            AggKind::MultiTerms => "MultiTerms",
        }
    }
 }
@@ -667,14 +649,6 @@ fn build_nodes(
            &req.sub_aggregation,
            composite_req,
        )?]),
-        AggregationVariants::MultiTerms(multi_terms_req) => Ok(vec![build_multi_terms_node(
-            agg_name,
-            reader,
-            segment_ordinal,
-            data,
-            &req.sub_aggregation,
-            multi_terms_req,
-        )?]),
        AggregationVariants::Filter(filter_req) => {
            // Build the query and evaluator upfront
            let schema = reader.schema();
@@ -733,111 +707,6 @@ fn build_composite_node(
    })
 }

-fn build_multi_terms_node(
-    agg_name: &str,
-    reader: &SegmentReader,
-    segment_ordinal: SegmentOrdinal,
-    data: &mut AggregationsSegmentCtx,
-    sub_aggs: &Aggregations,
-    req: &MultiTermsAggregation,
-) -> crate::Result<AggRefNode> {
-    use crate::aggregation::bucket::KeyElem;
-
-    if req.terms.is_empty() {
-        return Err(crate::TantivyError::InvalidArgument(
-            "multi_terms aggregation requires at least one field".to_string(),
-        ));
-    }
-
-    let mut fields = Vec::with_capacity(req.terms.len());
-
-    for field_def in &req.terms {
-        let field_name = &field_def.field;
-        let str_dict_column = reader.fast_fields().str(field_name)?;
-
-        // Collect all columns for this field (handles JSON multi-type fields).
-        let columns = get_term_agg_accessors(reader, field_name, &field_def.missing)?;
-
-        // Precompute the missing KeyElem (or None -> drop combo).
-        let missing_key_elem = if let Some(missing) = &field_def.missing {
-            match missing {
-                Key::Str(missing_str) => {
-                    match columns.iter().position(|(_, ct)| *ct == ColumnType::Str) {
-                        Some(idx) => {
-                            match str_dict_column
-                                .as_ref()
-                                .unwrap()
-                                .dictionary()
-                                .term_ord(missing_str.as_bytes())?
-                            {
-                                Some(ord) => Some(KeyElem::new(idx as u32, ord)),
-                                None => Some(KeyElem::synthetic_missing()),
-                            }
-                        }
-                        None => Some(KeyElem::synthetic_missing()),
-                    }
-                }
-                _ => {
-                    // Non-string missing: find the column whose type best matches the
-                    // missing key. Prefer an exact-type match; fall back to any numeric
-                    // column so cross-type coercions (e.g. Key::F64 on an I64 column)
-                    // still work.
-                    let preferred_type = match missing {
-                        Key::F64(_) => ColumnType::F64,
-                        Key::I64(_) => ColumnType::I64,
-                        Key::U64(_) => ColumnType::U64,
-                        Key::Str(_) => unreachable!("handled by Key::Str arm"),
-                    };
-                    let idx = columns
-                        .iter()
-                        .position(|(_, ct)| *ct == preferred_type)
-                        .or_else(|| {
-                            columns
-                                .iter()
-                                .position(|(_, ct)| ct.numerical_type().is_some())
-                        });
-                    match idx {
-                        Some(idx) => {
-                            let (col, col_type) = &columns[idx];
-                            get_missing_val_as_u64_lenient(
-                                *col_type,
-                                col.max_value(),
-                                missing,
-                                field_name,
-                            )?
-                            .map(|sentinel| KeyElem::new(idx as u32, sentinel))
-                        }
-                        None => Some(KeyElem::synthetic_missing()),
-                    }
-                }
-            }
-        } else {
-            None
-        };
-
-        fields.push(MultiTermsFieldAccessors {
-            columns,
-            str_dict_column,
-            missing: field_def.missing.clone(),
-            missing_key_elem,
-            field: field_name.clone(),
-        });
-    }
-
-    let idx = data.push_multi_terms_req_data(MultiTermsAggReqData {
-        name: agg_name.to_string(),
-        req: req.clone(),
-        fields,
-        sub_aggregations: sub_aggs.clone(),
-    });
-    let children = build_children(sub_aggs, reader, segment_ordinal, data)?;
-    Ok(AggRefNode {
-        kind: AggKind::MultiTerms,
-        idx_in_req_data: idx,
-        children,
-    })
-}
-
 fn build_children(
    aggs: &Aggregations,
    reader: &SegmentReader,
@@ -1062,7 +931,9 @@ fn build_allowed_term_ids_for_str(
        // add matches
        allowed = Some(BitSet::with_max_value(allowed_capacity));
        let allowed = allowed.as_mut().unwrap();
-        for_each_matching_term_ord(str_col, include, |ord| allowed.insert(ord))?;
+        for_each_matching_term_ord(str_col, include, |ord| {
+            let _ = allowed.insert(ord);
+        })?;
    };

    if let Some(exclude) = exclude {
--- a/src/aggregation/agg_req.rs
+++ b/src/aggregation/agg_req.rs
@@ -33,7 +33,7 @@ use serde::{Deserialize, Serialize};

 use super::bucket::{
    CompositeAggregation, DateHistogramAggregationReq, FilterAggregation, HistogramAggregation,
-    MultiTermsAggregation, RangeAggregation, TermsAggregation,
+    RangeAggregation, TermsAggregation,
 };
 use super::metric::{
    AverageAggregation, CardinalityAggregationReq, CountAggregation, ExtendedStatsAggregation,
@@ -202,9 +202,6 @@ pub enum AggregationVariants {
    /// Multi-dimensional, paginable bucket aggregation.
    #[serde(rename = "composite")]
    Composite(CompositeAggregation),
-    /// Bucket aggregation over unique combinations of values across multiple term fields.
-    #[serde(rename = "multi_terms")]
-    MultiTerms(MultiTermsAggregation),

    // Metric aggregation types
    /// Computes the average of the extracted values.
@@ -256,9 +253,6 @@ impl AggregationVariants {
                .iter()
                .map(|source| source.field())
                .collect(),
-            AggregationVariants::MultiTerms(mt) => {
-                mt.terms.iter().map(|t| t.field.as_str()).collect()
-            }
            AggregationVariants::Average(avg) => vec![avg.field_name()],
            AggregationVariants::Count(count) => vec![count.field_name()],
            AggregationVariants::Max(max) => vec![max.field_name()],
@@ -299,12 +293,6 @@ impl AggregationVariants {
            _ => None,
        }
    }
-    pub(crate) fn as_multi_terms(&self) -> Option<&MultiTermsAggregation> {
-        match &self {
-            AggregationVariants::MultiTerms(mt) => Some(mt),
-            _ => None,
-        }
-    }
    pub(crate) fn as_percentile(&self) -> Option<&PercentilesAggregationReq> {
        match &self {
            AggregationVariants::Percentiles(percentile_req) => Some(percentile_req),
--- a/src/aggregation/agg_result.rs
+++ b/src/aggregation/agg_result.rs
@@ -152,25 +152,12 @@ pub enum BucketResult {
        ///
        /// See [`TermsAggregation`](super::bucket::TermsAggregation)
        buckets: Vec<BucketEntry>,
-        /// The number of documents that didn't make it into to TOP N due to shard_size or size
+        /// The number of documents that didn’t make it into to TOP N due to shard_size or size
        sum_other_doc_count: u64,
        #[serde(skip_serializing_if = "Option::is_none")]
        /// The upper bound error for the doc count of each term.
        doc_count_error_upper_bound: Option<u64>,
    },
-    /// This is the multi_terms result -- placed AFTER Terms so that a zero-bucket result
-    /// deserializes as Terms (the more common case). Non-empty MultiTerms still deserializes
-    /// correctly because its array `key` fails Terms' scalar `key` check first. The only known
-    /// ambiguity is an empty MultiTerms result decoding as Terms (deserialization only).
-    MultiTerms {
-        /// The buckets (one per unique combination of field values).
-        buckets: Vec<MultiTermsBucketEntry>,
-        /// The number of documents that didn't make it into the TOP N.
-        sum_other_doc_count: u64,
-        #[serde(skip_serializing_if = "Option::is_none")]
-        /// The upper bound error for the doc count of each term combination.
-        doc_count_error_upper_bound: Option<u64>,
-    },
    /// This is the filter result - a single bucket with sub-aggregations
    Filter(FilterBucketResult),
    /// This is the composite result
@@ -192,11 +179,6 @@ impl BucketResult {
            BucketResult::Histogram { buckets } => {
                buckets.iter().map(|bucket| bucket.get_bucket_count()).sum()
            }
-            BucketResult::MultiTerms {
-                buckets,
-                sum_other_doc_count: _,
-                doc_count_error_upper_bound: _,
-            } => buckets.iter().map(|bucket| bucket.get_bucket_count()).sum(),
            BucketResult::Terms {
                buckets,
                sum_other_doc_count: _,
@@ -290,35 +272,6 @@ impl GetDocCount for BucketEntry {
    }
 }

-/// Bucket entry for a [`multi_terms`](super::bucket::MultiTermsAggregation) aggregation.
-///
-/// The key is a vector of values (one per declared field), and `key_as_string` is the pipe-joined
-/// representation.
-#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
-pub struct MultiTermsBucketEntry {
-    /// Pipe-joined string representation of all key elements, e.g. `"rock|Product A"`.
-    pub key_as_string: String,
-    /// The composite key: one [`Key`] per field in declaration order.
-    pub key: Vec<Key>,
-    /// Number of documents in this bucket.
-    pub doc_count: u64,
-    /// Sub-aggregation results.
-    #[serde(flatten)]
-    pub sub_aggregation: AggregationResults,
-}
-
-impl MultiTermsBucketEntry {
-    pub(crate) fn get_bucket_count(&self) -> u64 {
-        1 + self.sub_aggregation.get_bucket_count()
-    }
-}
-
-impl GetDocCount for MultiTermsBucketEntry {
-    fn doc_count(&self) -> u64 {
-        self.doc_count
-    }
-}
-
 /// This is the range entry for a bucket, which contains a key, count, and optionally
 /// sub-aggregations.
 ///
--- a/src/aggregation/bucket/mod.rs
+++ b/src/aggregation/bucket/mod.rs
@@ -25,7 +25,6 @@
 mod composite;
 mod filter;
 mod histogram;
-mod multi_terms;
 mod range;
 mod term_agg;
 mod term_missing_agg;
@@ -36,7 +35,6 @@ use std::fmt;
 pub use composite::*;
 pub use filter::*;
 pub use histogram::*;
-pub use multi_terms::*;
 pub use range::*;
 use serde::{de, Deserialize, Deserializer, Serialize, Serializer};
 pub use term_agg::*;
--- a/src/aggregation/bucket/multi_terms/mod.rs
+++ b/src/aggregation/bucket/multi_terms/mod.rs
--- a/src/aggregation/intermediate_agg_result.rs
+++ b/src/aggregation/intermediate_agg_result.rs
@@ -28,7 +28,7 @@ use super::{format_date, AggregationError, Key, SerializedKey};
 use crate::aggregation::agg_result::{
    AggregationResults, BucketEntries, BucketEntry, CompositeBucketEntry, FilterBucketResult,
 };
-use crate::aggregation::bucket::{IntermediateMultiTermsBucketResult, TermsAggregationInternal};
+use crate::aggregation::bucket::TermsAggregationInternal;
 use crate::aggregation::metric::CardinalityCollector;
 use crate::TantivyError;

@@ -82,7 +82,7 @@ impl From<IntermediateKey> for Key {
                }
            }
            IntermediateKey::F64(f) => Self::F64(f),
-            IntermediateKey::Bool(f) => Self::Str(f.to_string()),
+            IntermediateKey::Bool(f) => Self::U64(f as u64),
            IntermediateKey::U64(f) => Self::U64(f),
            IntermediateKey::I64(f) => Self::I64(f),
        }
@@ -286,11 +286,6 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
                buckets: IntermediateCompositeBucketResult::default(),
            })
        }
-        MultiTerms(_) => {
-            IntermediateAggregationResult::Bucket(IntermediateBucketResult::MultiTerms {
-                buckets: Default::default(),
-            })
-        }
    }
 }

@@ -504,11 +499,6 @@ pub enum IntermediateBucketResult {
        /// The composite buckets
        buckets: IntermediateCompositeBucketResult,
    },
-    /// Multi-terms aggregation
-    MultiTerms {
-        /// The multi-terms buckets
-        buckets: IntermediateMultiTermsBucketResult,
-    },
 }

 impl IntermediateBucketResult {
@@ -611,13 +601,6 @@ impl IntermediateBucketResult {
                    .expect("unexpected aggregation, expected composite aggregation");
                buckets.into_final_result(composite_req, req.sub_aggregation(), limits)
            }
-            IntermediateBucketResult::MultiTerms { buckets } => {
-                let multi_terms_req = req
-                    .agg
-                    .as_multi_terms()
-                    .expect("unexpected aggregation, expected multi_terms aggregation");
-                buckets.into_final_result(multi_terms_req, req.sub_aggregation(), limits)
-            }
        }
    }

@@ -694,14 +677,6 @@ impl IntermediateBucketResult {
            ) => {
                composite_left.merge_fruits(composite_right)?;
            }
-            (
-                IntermediateBucketResult::MultiTerms { buckets: mt_left },
-                IntermediateBucketResult::MultiTerms { buckets: mt_right },
-            ) => {
-                merge_maps(&mut mt_left.entries, mt_right.entries)?;
-                mt_left.sum_other_doc_count += mt_right.sum_other_doc_count;
-                mt_left.doc_count_error_upper_bound += mt_right.doc_count_error_upper_bound;
-            }
            (IntermediateBucketResult::Range(_), _) => {
                panic!("try merge on different types")
            }
@@ -717,9 +692,6 @@ impl IntermediateBucketResult {
            (IntermediateBucketResult::Composite { .. }, _) => {
                panic!("try merge on different types")
            }
-            (IntermediateBucketResult::MultiTerms { .. }, _) => {
-                panic!("try merge on different types")
-            }
        }
        Ok(())
    }
--- a/src/core/tests.rs
+++ b/src/core/tests.rs
@@ -1,14 +1,14 @@
 use crate::collector::Count;
 use crate::directory::{RamDirectory, WatchCallback};
 use crate::index::SegmentId;
-use crate::indexer::{LogMergePolicy, NoMergePolicy};
+use crate::indexer::{DocIdMapping, LogMergePolicy, NoMergePolicy};
 use crate::postings::Postings;
 use crate::query::TermQuery;
-use crate::schema::{Field, IndexRecordOption, Schema, INDEXED, STRING, TEXT};
+use crate::schema::{Field, IndexRecordOption, Schema, Value, INDEXED, STORED, STRING, TEXT};
 use crate::tokenizer::TokenizerManager;
 use crate::{
-    Directory, DocSet, Index, IndexBuilder, IndexReader, IndexSettings, IndexWriter, ReloadPolicy,
-    TantivyDocument, Term,
+    Directory, DocAddress, DocSet, Index, IndexBuilder, IndexReader, IndexSettings, IndexWriter,
+    ReloadPolicy, TantivyDocument, Term,
 };

 #[test]
@@ -300,6 +300,51 @@ fn test_single_segment_index_writer() -> crate::Result<()> {
    Ok(())
 }

+#[test]
+fn test_single_segment_index_writer_with_doc_id_mapping() -> crate::Result<()> {
+    let mut schema_builder = Schema::builder();
+    let text_field = schema_builder.add_text_field("text", TEXT | STORED);
+    let schema = schema_builder.build();
+    let directory = RamDirectory::default();
+    let settings = IndexSettings {
+        manual_doc_id_mapping: true,
+        ..Default::default()
+    };
+    let mut single_segment_index_writer = Index::builder()
+        .schema(schema)
+        .settings(settings)
+        .single_segment_index_writer(directory, 15_000_000)?;
+
+    single_segment_index_writer.add_document(doc!(text_field=>"alpha beta"))?;
+    single_segment_index_writer.add_document(doc!())?;
+    single_segment_index_writer.add_document(doc!(text_field=>"gamma"))?;
+
+    let mapping = DocIdMapping::new_permutation(vec![2, 1, 0])?;
+    let index = single_segment_index_writer.finalize_with_doc_id_mapping(&mapping)?;
+
+    let searcher = index.reader()?.searcher();
+    let segment_reader = searcher.segment_reader(0);
+    let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field)?;
+
+    assert_eq!(fieldnorm_reader.fieldnorm(0), 1);
+    assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
+    assert_eq!(fieldnorm_reader.fieldnorm(2), 2);
+
+    let doc_0 = searcher.doc::<TantivyDocument>(DocAddress::new(0, 0))?;
+    assert_eq!(
+        doc_0.get_first(text_field).and_then(|val| val.as_str()),
+        Some("gamma")
+    );
+    let doc_1 = searcher.doc::<TantivyDocument>(DocAddress::new(0, 1))?;
+    assert!(doc_1.get_first(text_field).is_none());
+    let doc_2 = searcher.doc::<TantivyDocument>(DocAddress::new(0, 2))?;
+    assert_eq!(
+        doc_2.get_first(text_field).and_then(|val| val.as_str()),
+        Some("alpha beta")
+    );
+    Ok(())
+}
+
 #[test]
 fn test_merging_segment_update_docfreq() {
    let mut schema_builder = Schema::builder();
--- a/src/index/index_meta.rs
+++ b/src/index/index_meta.rs
@@ -250,6 +250,11 @@ pub struct IndexSettings {
    /// provided in `IndexSortByField`
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sort_by_field: Option<IndexSortByField>,
+    /// If true, enables caller-provided doc id mappings at segment finalization time.
+    /// Always skip serializing this field since it's only used at segment finalization time.
+    #[doc(hidden)]
+    #[serde(skip)]
+    pub manual_doc_id_mapping: bool,
    /// The `Compressor` used to compress the doc store.
    #[serde(default)]
    pub docstore_compression: Compressor,
@@ -273,6 +278,7 @@ impl Default for IndexSettings {
    fn default() -> Self {
        Self {
            sort_by_field: None,
+            manual_doc_id_mapping: false,
            docstore_compression: Compressor::default(),
            docstore_blocksize: default_docstore_blocksize(),
            docstore_compress_dedicated_thread: true,
@@ -460,6 +466,7 @@ mod tests {
                    field: "text".to_string(),
                    order: Order::Asc,
                }),
+                manual_doc_id_mapping: false,
                docstore_compression: crate::store::Compressor::Zstd(ZstdCompressor {
                    compression_level: Some(4),
                }),
@@ -529,6 +536,7 @@ mod tests {
            index_settings,
            IndexSettings {
                sort_by_field: None,
+                manual_doc_id_mapping: false,
                docstore_compression: Compressor::default(),
                docstore_compress_dedicated_thread: true,
                docstore_blocksize: 16_384
@@ -547,6 +555,18 @@ mod tests {
                serde_json::from_value(index_settings_json).unwrap();
            assert_eq!(index_settings_deser, index_settings);
        }
+        {
+            index_settings.manual_doc_id_mapping = true;
+            let index_settings_json = serde_json::to_value(&index_settings).unwrap();
+            assert_eq!(
+                index_settings_json,
+                serde_json::json!({
+                    "docstore_compression": "lz4",
+                    "docstore_blocksize": 16384
+                })
+            );
+            index_settings.manual_doc_id_mapping = false;
+        }
        {
            index_settings.docstore_compress_dedicated_thread = false;
            let index_settings_json = serde_json::to_value(&index_settings).unwrap();
--- a/src/indexer/doc_id_mapping.rs
+++ b/src/indexer/doc_id_mapping.rs
@@ -1,7 +1,6 @@
 //! This module is used when sorting the index by a property, e.g.
 //! to get mappings from old doc_id to new doc_id and vice versa, after sorting
-
-use common::ReadOnlyBitSet;
+use common::{BitSet, ReadOnlyBitSet};

 use super::SegmentWriter;
 use crate::schema::{Field, Schema};
@@ -71,7 +70,33 @@ pub struct DocIdMapping {
 }

 impl DocIdMapping {
-    pub fn from_new_id_to_old_id(new_doc_id_to_old: Vec<DocId>) -> Self {
+    /// Creates a `DocIdMapping` from a mapping of new doc ids to old doc ids, with permutation validation.
+    /// The mapping is validated by checking that every old doc id appears exactly once in the mapping.
+    /// I.e., doc ids must be consecutive from `0` to `new_doc_id_to_old.len() - 1`, inclusive.
+    pub fn new_permutation(new_doc_id_to_old: Vec<DocId>) -> crate::Result<Self> {
+        // Check that the mapping is a permutation of the segment doc ids.
+        let max_doc = new_doc_id_to_old.len() as DocId;
+        let mut old_doc_id_to_new = vec![0; max_doc as usize];
+
+        let mut seen_doc_ids = BitSet::with_max_value(max_doc);
+        for (i, old_doc_id) in new_doc_id_to_old.iter().copied().enumerate() {
+            if old_doc_id >= max_doc || !seen_doc_ids.insert(old_doc_id) {
+                return Err(TantivyError::InvalidArgument(
+                    "Mapping must be a permutation of the segment doc ids".to_string(),
+                ));
+            }
+            old_doc_id_to_new[new_doc_id_to_old[i] as usize] = i as DocId;
+        }
+
+        let doc_id_mapping = DocIdMapping {
+            new_doc_id_to_old,
+            old_doc_id_to_new,
+        };
+        Ok(doc_id_mapping)
+    }
+
+    /// Creates a `DocIdMapping` from a mapping of new doc ids to old doc ids.
+    pub(crate) fn from_new_id_to_old_id(new_doc_id_to_old: Vec<DocId>) -> Self {
        let max_doc = new_doc_id_to_old.len();
        let old_max_doc = new_doc_id_to_old
            .iter()
@@ -89,35 +114,41 @@ impl DocIdMapping {
        }
    }

-    /// returns the new doc_id for the old doc_id
-    pub fn get_new_doc_id(&self, doc_id: DocId) -> DocId {
+    /// Returns the new doc_id for the old doc_id
+    pub(crate) fn get_new_doc_id(&self, doc_id: DocId) -> DocId {
        self.old_doc_id_to_new[doc_id as usize]
    }
-    /// returns the old doc_id for the new doc_id
-    pub fn get_old_doc_id(&self, doc_id: DocId) -> DocId {
-        self.new_doc_id_to_old[doc_id as usize]
-    }
-    /// iterate over old doc_ids in order of the new doc_ids
-    pub fn iter_old_doc_ids(&self) -> impl Iterator<Item = DocId> + Clone + '_ {
-        self.new_doc_id_to_old.iter().cloned()
+
+    /// Iiterate over old doc_ids in order of the new doc_ids
+    pub(crate) fn iter_old_doc_ids(&self) -> impl Iterator<Item = DocId> + Clone + '_ {
+        self.new_doc_id_to_old.iter().copied()
    }

-    pub fn old_to_new_ids(&self) -> &[DocId] {
+    /// Returns the new doc_ids in order of the old doc_ids
+    pub(crate) fn old_to_new_ids(&self) -> &[DocId] {
        &self.old_doc_id_to_new[..]
    }

    /// Remaps a given array to the new doc ids.
-    pub fn remap<T: Copy>(&self, els: &[T]) -> Vec<T> {
+    pub(crate) fn remap<T: Copy>(&self, els: &[T]) -> Vec<T> {
        self.new_doc_id_to_old
            .iter()
            .map(|old_doc| els[*old_doc as usize])
            .collect()
    }
-    pub fn num_new_doc_ids(&self) -> usize {
+
+    /// Returns the number of documents in the mapping.
+    pub(crate) fn len(&self) -> usize {
+        // new_doc_id_to_old and old_doc_id_to_new have the same length by construction.
        self.new_doc_id_to_old.len()
    }
-    pub fn num_old_doc_ids(&self) -> usize {
-        self.old_doc_id_to_new.len()
+}
+
+#[cfg(test)]
+impl DocIdMapping {
+    /// returns the old doc_id for the new doc_id
+    fn get_old_doc_id(&self, doc_id: DocId) -> DocId {
+        self.new_doc_id_to_old[doc_id as usize]
    }
 }

@@ -158,7 +189,7 @@ mod tests_indexsorting {
    use crate::indexer::doc_id_mapping::DocIdMapping;
    use crate::indexer::NoMergePolicy;
    use crate::query::QueryParser;
-    use crate::schema::*;
+    use crate::{schema::*, TantivyError};
    use crate::{DocAddress, Index, IndexBuilder, IndexSettings, IndexSortByField, Order};

    fn create_test_index(
@@ -550,6 +581,18 @@ mod tests_indexsorting {
        assert_eq!(doc_mapping.get_new_doc_id(5), 2);
    }

+    #[test]
+    fn test_doc_mapping_new_permutation_rejects_out_of_range() {
+        let result = DocIdMapping::new_permutation(vec![5, 0]);
+        assert!(matches!(result, Err(TantivyError::InvalidArgument(_)),));
+    }
+
+    #[test]
+    fn test_doc_mapping_new_permutation_rejects_duplicates() {
+        let result = DocIdMapping::new_permutation(vec![0, 1, 0]);
+        assert!(matches!(result, Err(TantivyError::InvalidArgument(_)),));
+    }
+
    #[test]
    fn test_doc_mapping_remap() {
        let doc_mapping = DocIdMapping::from_new_id_to_old_id(vec![2, 8, 3]);
--- a/src/indexer/mod.rs
+++ b/src/indexer/mod.rs
@@ -33,6 +33,7 @@ mod stamper;
 use crossbeam_channel as channel;
 use smallvec::SmallVec;

+pub use self::doc_id_mapping::DocIdMapping;
 pub use self::index_writer::{advance_deletes, IndexWriter, IndexWriterOptions};
 pub use self::log_merge_policy::LogMergePolicy;
 pub use self::merge_operation::MergeOperation;
--- a/src/indexer/segment_serializer.rs
+++ b/src/indexer/segment_serializer.rs
@@ -4,7 +4,7 @@ use crate::directory::WritePtr;
 use crate::fieldnorm::FieldNormsSerializer;
 use crate::index::{Segment, SegmentComponent};
 use crate::postings::InvertedIndexSerializer;
-use crate::store::StoreWriter;
+use crate::store::{Compressor, StoreWriter};

 /// Segment serializer is in charge of laying out on disk
 /// the data accumulated and sorted by the `SegmentWriter`.
@@ -25,17 +25,18 @@ impl SegmentSerializer {
        // If the segment is going to be sorted, we stream the docs first to a temporary file.
        // In the merge case this is not necessary because we can kmerge the already sorted
        // segments
-        let remapping_required = segment.index().settings().sort_by_field.is_some() && !is_in_merge;
        let settings = segment.index().settings().clone();
+        let remapping_required =
+            (settings.sort_by_field.is_some() || settings.manual_doc_id_mapping) && !is_in_merge;
        let store_writer = if remapping_required {
            let store_write = segment.open_write(SegmentComponent::TempStore)?;
            StoreWriter::new(
                store_write,
-                crate::store::Compressor::None,
+                Compressor::None,
                // We want fast random access on the docs, so we choose a small block size.
                // If this is zero, the skip index will contain too many checkpoints and
                // therefore will be relatively slow.
-                16000,
+                16_000,
                settings.docstore_compress_dedicated_thread,
            )?
        } else {
--- a/src/indexer/segment_writer.rs
+++ b/src/indexer/segment_writer.rs
@@ -136,10 +136,8 @@ impl SegmentWriter {

    /// Lay on disk the current content of the `SegmentWriter`
    ///
-    /// Finalize consumes the `SegmentWriter`, so that it cannot
-    /// be used afterwards.
-    pub fn finalize(mut self) -> crate::Result<Vec<u64>> {
-        self.fieldnorms_writer.fill_up_to_max_doc(self.max_doc);
+    /// Finalize consumes the `SegmentWriter`, so that it cannot be used afterwards.
+    pub fn finalize(self) -> crate::Result<Vec<u64>> {
        let mapping: Option<DocIdMapping> = self
            .segment_serializer
            .segment()
@@ -149,6 +147,41 @@ impl SegmentWriter {
            .clone()
            .map(|sort_by_field| get_doc_id_mapping_from_field(sort_by_field, &self))
            .transpose()?;
+        self.finalize_inner(mapping.as_ref())
+    }
+
+    /// Lay on disk the current content of the `SegmentWriter` using the provided doc id mapping.
+    ///
+    /// Finalize consumes the `SegmentWriter`, so that it cannot be used afterwards.
+    pub fn finalize_with_doc_id_mapping(self, mapping: &DocIdMapping) -> crate::Result<Vec<u64>> {
+        // Ensure the segment writer was created in remap mode so the docstore can be reordered.
+        if !self
+            .segment_serializer
+            .segment()
+            .index()
+            .settings()
+            .manual_doc_id_mapping
+        {
+            return Err(TantivyError::InvalidArgument(
+                "IndexSettings::manual_doc_id_mapping must be set to true".to_string(),
+            ));
+        }
+
+        // Check that the mapping eventually covers all documents in the segment.
+        if mapping.len() != self.max_doc as usize {
+            return Err(TantivyError::InvalidArgument(format!(
+                "Mapping must cover all documents in this segment. Expected {} documents, got {}",
+                self.max_doc,
+                mapping.len()
+            )));
+        }
+
+        self.finalize_inner(Some(mapping))
+    }
+
+    fn finalize_inner(mut self, mapping: Option<&DocIdMapping>) -> crate::Result<Vec<u64>> {
+        // Pad before remapping; the mapping indexes fieldnorms by old doc id.
+        self.fieldnorms_writer.fill_up_to_max_doc(self.max_doc);
        remap_and_write(
            self.schema,
            &self.per_field_postings_writers,
@@ -156,9 +189,9 @@ impl SegmentWriter {
            self.fast_field_writers,
            &self.fieldnorms_writer,
            self.segment_serializer,
-            mapping.as_ref(),
+            mapping,
        )?;
-        let doc_opstamps = remap_doc_opstamps(self.doc_opstamps, mapping.as_ref());
+        let doc_opstamps = remap_doc_opstamps(self.doc_opstamps, mapping);
        Ok(doc_opstamps)
    }

@@ -485,6 +518,7 @@ mod tests {
    use crate::collector::{Count, TopDocs};
    use crate::directory::RamDirectory;
    use crate::fastfield::FastValue;
+    use crate::indexer::doc_id_mapping::DocIdMapping;
    use crate::postings::{Postings, TermInfo};
    use crate::query::{PhraseQuery, QueryParser};
    use crate::schema::{
@@ -497,7 +531,7 @@ mod tests {
    use crate::tokenizer::{PreTokenizedString, Token};
    use crate::{
        DateTime, Directory, DocAddress, DocSet, Index, IndexWriter, SegmentReader,
-        TantivyDocument, Term, TERMINATED,
+        TantivyDocument, TantivyError, Term, TERMINATED,
    };

    #[test]
@@ -1136,4 +1170,87 @@ mod tests {
            "Schema error: 'Error getting tokenizer for field: title'"
        );
    }
+
+    /// Builds a `SegmentWriter` with a fast `u64` field and a text field that only some
+    /// documents populate, so the text field is missing fieldnorms on some docs.
+    ///
+    /// The `texts` slice provides, for each document, an optional text value. The order
+    /// number is always recorded in the `order` fast field so callers can recover the
+    /// original document via that value.
+    fn build_segment_writer_with_doc_id_mapping(
+        texts: &[Option<&str>],
+    ) -> (Index, crate::Segment, super::SegmentWriter) {
+        let mut schema_builder = Schema::builder();
+        schema_builder.add_u64_field("order", FAST | STORED);
+        schema_builder.add_text_field("text", TEXT);
+        let schema = schema_builder.build();
+        let mut index = Index::create_in_ram(schema);
+        index.settings_mut().manual_doc_id_mapping = true;
+        let segment = index.new_segment();
+        let order = index.schema().get_field("order").unwrap();
+        let text = index.schema().get_field("text").unwrap();
+        let mut segment_writer =
+            super::SegmentWriter::for_segment(15_000_000, segment.clone()).unwrap();
+        for (opstamp, text_opt) in texts.iter().enumerate() {
+            let mut doc = TantivyDocument::default();
+            doc.add_u64(order, opstamp as u64);
+            if let Some(text_value) = text_opt {
+                doc.add_text(text, *text_value);
+            }
+            segment_writer
+                .add_document(crate::indexer::AddOperation {
+                    opstamp: opstamp as u64,
+                    document: doc,
+                })
+                .unwrap();
+        }
+        (index, segment, segment_writer)
+    }
+
+    #[test]
+    fn test_finalize_with_doc_id_mapping_rejects_wrong_length() {
+        let (_index, _segment, segment_writer) =
+            build_segment_writer_with_doc_id_mapping(&[Some("a"), Some("b"), Some("c")]);
+        // Mapping only covers 2 of the 3 documents.
+        let mapping = DocIdMapping::new_permutation(vec![1, 0]).unwrap();
+        let err = segment_writer
+            .finalize_with_doc_id_mapping(&mapping)
+            .unwrap_err();
+        assert!(
+            matches!(err, TantivyError::InvalidArgument(_)),
+            "unexpected error: {err:?}"
+        );
+    }
+
+    #[test]
+    fn test_finalize_with_doc_id_mapping_remaps_missing_fieldnorms() -> crate::Result<()> {
+        // doc 0: "alpha beta"  (2 tokens)
+        // doc 1: <no text>     (missing fieldnorm -> 0)
+        // doc 2: "gamma"       (1 token)
+        // doc 3: <no text>     (missing fieldnorm -> 0)
+        let (index, segment, segment_writer) = build_segment_writer_with_doc_id_mapping(&[
+            Some("alpha beta"),
+            None,
+            Some("gamma"),
+            None,
+        ]);
+        let max_doc = segment_writer.max_doc();
+
+        // Reverse the documents. New doc id i maps to old doc id (3 - i).
+        let mapping = DocIdMapping::new_permutation(vec![3, 2, 1, 0])?;
+        segment_writer.finalize_with_doc_id_mapping(&mapping)?;
+
+        let segment = segment.with_max_doc(max_doc);
+        let segment_reader = SegmentReader::open(&segment)?;
+        let text = index.schema().get_field("text").unwrap();
+        let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text)?;
+
+        // After remapping, fieldnorms follow the reversed order:
+        // new 0 <- old 3 (0), new 1 <- old 2 (1), new 2 <- old 1 (0), new 3 <- old 0 (2)
+        assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
+        assert_eq!(fieldnorm_reader.fieldnorm(1), 1);
+        assert_eq!(fieldnorm_reader.fieldnorm(2), 0);
+        assert_eq!(fieldnorm_reader.fieldnorm(3), 2);
+        Ok(())
+    }
 }
--- a/src/indexer/single_segment_index_writer.rs
+++ b/src/indexer/single_segment_index_writer.rs
@@ -2,7 +2,7 @@ use std::marker::PhantomData;

 use crate::indexer::operation::AddOperation;
 use crate::indexer::segment_updater::save_metas;
-use crate::indexer::SegmentWriter;
+use crate::indexer::{DocIdMapping, SegmentWriter};
 use crate::schema::document::Document;
 use crate::{Directory, Index, IndexMeta, Opstamp, Segment, TantivyDocument};

@@ -38,9 +38,29 @@ impl<D: Document> SingleSegmentIndexWriter<D> {
    }

    pub fn finalize(self) -> crate::Result<Index> {
-        let max_doc = self.segment_writer.max_doc();
-        self.segment_writer.finalize()?;
-        let segment: Segment = self.segment.with_max_doc(max_doc);
+        let Self {
+            segment,
+            segment_writer,
+            ..
+        } = self;
+        let max_doc = segment_writer.max_doc();
+        segment_writer.finalize()?;
+        Self::finalize_inner(segment, max_doc)
+    }
+
+    pub fn finalize_with_doc_id_mapping(self, mapping: &DocIdMapping) -> crate::Result<Index> {
+        let Self {
+            segment,
+            segment_writer,
+            ..
+        } = self;
+        let max_doc = segment_writer.max_doc();
+        segment_writer.finalize_with_doc_id_mapping(mapping)?;
+        Self::finalize_inner(segment, max_doc)
+    }
+
+    fn finalize_inner(segment: Segment, max_doc: u32) -> crate::Result<Index> {
+        let segment: Segment = segment.with_max_doc(max_doc);
        let index = segment.index();
        let index_meta = IndexMeta {
            index_settings: index.settings().clone(),
@@ -51,6 +71,6 @@ impl<D: Document> SingleSegmentIndexWriter<D> {
        };
        save_metas(&index_meta, index.directory())?;
        index.directory().sync_directory()?;
-        Ok(segment.index().clone())
+        Ok(index.clone())
    }
 }
Author	SHA1	Message	Date
Luca Cominardi	3c6eb92c7e	fix: restore clone bound on doc id iterator Co-authored-by: Cursor <cursoragent@cursor.com>	2026-06-29 11:31:38 +02:00
Luca Cominardi	9f73d3fe54	docs: clarify manual doc id mapping serialization Co-authored-by: Cursor <cursoragent@cursor.com>	2026-06-29 11:15:53 +02:00
Luca Cominardi	50272240c6	test: move doc id mapping validation coverage Co-authored-by: Cursor <cursoragent@cursor.com>	2026-06-29 11:13:35 +02:00
Luca Cominardi	a6fd070e3d	fix: validate manual doc id mappings at construction Co-authored-by: Cursor <cursoragent@cursor.com>	2026-06-29 10:52:41 +02:00
Luca Cominardi	06c046bdc9	fix: enable manual doc id mapping in single segment test The regression test calls finalize_with_doc_id_mapping, so the index must opt into the temporary docstore path before constructing the segment writer. Co-authored-by: Cursor <cursoragent@cursor.com>	2026-06-26 17:56:54 +02:00
Luca Cominardi	63f65dcd71	fix: add missing manual_doc_id_mapping field in zstd test IndexSettings initializer Struct literal was missing the new field introduced in the manual doc id mapping refactor, causing a compile error under --all-features. Co-authored-by: Cursor <cursoragent@cursor.com>	2026-06-26 17:47:33 +02:00
Luca Cominardi	8a4c5b9013	refactor: gate manual doc id mapping via settings Co-authored-by: Cursor <cursoragent@cursor.com>	2026-06-26 17:37:59 +02:00
Luca Cominardi	f7355e60cd	feat: add single segment doc id mapping finalization Co-authored-by: Cursor <cursoragent@cursor.com>	2026-06-26 16:47:14 +02:00
Luca Cominardi	90603d2396	refactor custom doc id mapping finalization Co-authored-by: Cursor <cursoragent@cursor.com>	2026-06-26 16:24:19 +02:00
Luca Cominardi	910861a3e9	feat: add custom doc id mapping finalization Co-authored-by: Cursor <cursoragent@cursor.com>	2026-06-26 14:45:57 +02:00