Compare commits

..

6 Commits

Author SHA1 Message Date
Luca Cominardi
06c046bdc9 fix: enable manual doc id mapping in single segment test
The regression test calls finalize_with_doc_id_mapping, so the index must opt into the temporary docstore path before constructing the segment writer.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-26 17:56:54 +02:00
Luca Cominardi
63f65dcd71 fix: add missing manual_doc_id_mapping field in zstd test IndexSettings initializer
Struct literal was missing the new field introduced in the manual doc id
mapping refactor, causing a compile error under --all-features.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-26 17:47:33 +02:00
Luca Cominardi
8a4c5b9013 refactor: gate manual doc id mapping via settings
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-26 17:37:59 +02:00
Luca Cominardi
f7355e60cd feat: add single segment doc id mapping finalization
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-26 16:47:14 +02:00
Luca Cominardi
90603d2396 refactor custom doc id mapping finalization
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-26 16:24:19 +02:00
Luca Cominardi
910861a3e9 feat: add custom doc id mapping finalization
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-26 14:45:57 +02:00
10 changed files with 296 additions and 36 deletions

View File

@@ -44,6 +44,6 @@ jobs:
# Upload the results to GitHub's code scanning dashboard.
- name: 'Upload to code-scanning'
uses: github/codeql-action/upload-sarif@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2
uses: github/codeql-action/upload-sarif@87557b9c84dde89fdd9b10e88954ac2f4248e463 # v4.36.1
with:
sarif_file: results.sarif

View File

@@ -281,12 +281,16 @@ impl BitSet {
}
/// Inserts an element in the `BitSet`
///
/// Returns true if the set changed.
#[inline]
pub fn insert(&mut self, el: u32) {
pub fn insert(&mut self, el: u32) -> bool {
// we do not check saturated els.
let higher = el / 64u32;
let lower = el % 64u32;
self.len += u64::from(self.tinysets[higher as usize].insert_mut(lower));
let changed = self.tinysets[higher as usize].insert_mut(lower);
self.len += u64::from(changed);
changed
}
/// Inserts an element in the `BitSet`

View File

@@ -931,7 +931,9 @@ fn build_allowed_term_ids_for_str(
// add matches
allowed = Some(BitSet::with_max_value(allowed_capacity));
let allowed = allowed.as_mut().unwrap();
for_each_matching_term_ord(str_col, include, |ord| allowed.insert(ord))?;
for_each_matching_term_ord(str_col, include, |ord| {
let _ = allowed.insert(ord);
})?;
};
if let Some(exclude) = exclude {

View File

@@ -1,14 +1,14 @@
use crate::collector::Count;
use crate::directory::{RamDirectory, WatchCallback};
use crate::index::SegmentId;
use crate::indexer::{LogMergePolicy, NoMergePolicy};
use crate::indexer::{DocIdMapping, LogMergePolicy, NoMergePolicy};
use crate::postings::Postings;
use crate::query::TermQuery;
use crate::schema::{Field, IndexRecordOption, Schema, INDEXED, STRING, TEXT};
use crate::schema::{Field, IndexRecordOption, Schema, Value, INDEXED, STORED, STRING, TEXT};
use crate::tokenizer::TokenizerManager;
use crate::{
Directory, DocSet, Index, IndexBuilder, IndexReader, IndexSettings, IndexWriter, ReloadPolicy,
TantivyDocument, Term,
Directory, DocAddress, DocSet, Index, IndexBuilder, IndexReader, IndexSettings, IndexWriter,
ReloadPolicy, TantivyDocument, Term,
};
#[test]
@@ -300,6 +300,49 @@ fn test_single_segment_index_writer() -> crate::Result<()> {
Ok(())
}
#[test]
fn test_single_segment_index_writer_with_doc_id_mapping() -> crate::Result<()> {
    // A single stored text field lets the test inspect both the fieldnorms and the doc store.
    let mut schema_builder = Schema::builder();
    let text_field = schema_builder.add_text_field("text", TEXT | STORED);
    let schema = schema_builder.build();

    // The index has to opt into manual doc id mapping before the writer is created.
    let settings = IndexSettings {
        manual_doc_id_mapping: true,
        ..IndexSettings::default()
    };
    let mut writer = Index::builder()
        .schema(schema)
        .settings(settings)
        .single_segment_index_writer(RamDirectory::default(), 15_000_000)?;

    // Old doc ids: 0 -> "alpha beta", 1 -> <empty document>, 2 -> "gamma".
    writer.add_document(doc!(text_field=>"alpha beta"))?;
    writer.add_document(doc!())?;
    writer.add_document(doc!(text_field=>"gamma"))?;

    // Reverse the documents: new doc id `i` is old doc id `2 - i`.
    let reversed = DocIdMapping::from_new_id_to_old_id(vec![2, 1, 0]);
    let index = writer.finalize_with_doc_id_mapping(&reversed)?;

    let searcher = index.reader()?.searcher();
    let segment_reader = searcher.segment_reader(0);

    // Fieldnorms must follow the new order.
    let fieldnorms = segment_reader.get_fieldnorms_reader(text_field)?;
    assert_eq!(fieldnorms.fieldnorm(0), 1);
    assert_eq!(fieldnorms.fieldnorm(1), 0);
    assert_eq!(fieldnorms.fieldnorm(2), 2);

    // Stored documents must follow the new order as well.
    let first_doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 0))?;
    let first_text = first_doc.get_first(text_field).and_then(|value| value.as_str());
    assert_eq!(first_text, Some("gamma"));

    let middle_doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 1))?;
    assert!(middle_doc.get_first(text_field).is_none());

    let last_doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 2))?;
    let last_text = last_doc.get_first(text_field).and_then(|value| value.as_str());
    assert_eq!(last_text, Some("alpha beta"));

    Ok(())
}
#[test]
fn test_merging_segment_update_docfreq() {
let mut schema_builder = Schema::builder();

View File

@@ -250,6 +250,10 @@ pub struct IndexSettings {
/// provided in `IndexSortByField`
#[serde(skip_serializing_if = "Option::is_none")]
pub sort_by_field: Option<IndexSortByField>,
/// If true, enables caller-provided doc id mappings at segment finalization time.
#[doc(hidden)]
#[serde(skip)]
pub manual_doc_id_mapping: bool,
/// The `Compressor` used to compress the doc store.
#[serde(default)]
pub docstore_compression: Compressor,
@@ -273,6 +277,7 @@ impl Default for IndexSettings {
fn default() -> Self {
Self {
sort_by_field: None,
manual_doc_id_mapping: false,
docstore_compression: Compressor::default(),
docstore_blocksize: default_docstore_blocksize(),
docstore_compress_dedicated_thread: true,
@@ -460,6 +465,7 @@ mod tests {
field: "text".to_string(),
order: Order::Asc,
}),
manual_doc_id_mapping: false,
docstore_compression: crate::store::Compressor::Zstd(ZstdCompressor {
compression_level: Some(4),
}),
@@ -529,6 +535,7 @@ mod tests {
index_settings,
IndexSettings {
sort_by_field: None,
manual_doc_id_mapping: false,
docstore_compression: Compressor::default(),
docstore_compress_dedicated_thread: true,
docstore_blocksize: 16_384
@@ -547,6 +554,18 @@ mod tests {
serde_json::from_value(index_settings_json).unwrap();
assert_eq!(index_settings_deser, index_settings);
}
{
index_settings.manual_doc_id_mapping = true;
let index_settings_json = serde_json::to_value(&index_settings).unwrap();
assert_eq!(
index_settings_json,
serde_json::json!({
"docstore_compression": "lz4",
"docstore_blocksize": 16384
})
);
index_settings.manual_doc_id_mapping = false;
}
{
index_settings.docstore_compress_dedicated_thread = false;
let index_settings_json = serde_json::to_value(&index_settings).unwrap();

View File

@@ -71,6 +71,10 @@ pub struct DocIdMapping {
}
impl DocIdMapping {
/// Creates a `DocIdMapping` from a mapping of new doc ids to old doc ids.
///
/// The caller MUST ensure that `new_doc_id_to_old` is a permutation of the
/// segment's old doc ids, with every old doc id appearing exactly once.
pub fn from_new_id_to_old_id(new_doc_id_to_old: Vec<DocId>) -> Self {
let max_doc = new_doc_id_to_old.len();
let old_max_doc = new_doc_id_to_old
@@ -90,34 +94,39 @@ impl DocIdMapping {
}
/// returns the new doc_id for the old doc_id
pub fn get_new_doc_id(&self, doc_id: DocId) -> DocId {
pub(crate) fn get_new_doc_id(&self, doc_id: DocId) -> DocId {
self.old_doc_id_to_new[doc_id as usize]
}
/// returns the old doc_id for the new doc_id
pub fn get_old_doc_id(&self, doc_id: DocId) -> DocId {
self.new_doc_id_to_old[doc_id as usize]
}
/// iterate over old doc_ids in order of the new doc_ids
pub fn iter_old_doc_ids(&self) -> impl Iterator<Item = DocId> + Clone + '_ {
pub(crate) fn iter_old_doc_ids(&self) -> impl Iterator<Item = DocId> + Clone + '_ {
self.new_doc_id_to_old.iter().cloned()
}
pub fn old_to_new_ids(&self) -> &[DocId] {
/// returns the new doc_ids in order of the old doc_ids
pub(crate) fn old_to_new_ids(&self) -> &[DocId] {
&self.old_doc_id_to_new[..]
}
/// Remaps a given array to the new doc ids.
pub fn remap<T: Copy>(&self, els: &[T]) -> Vec<T> {
pub(crate) fn remap<T: Copy>(&self, els: &[T]) -> Vec<T> {
self.new_doc_id_to_old
.iter()
.map(|old_doc| els[*old_doc as usize])
.collect()
}
pub fn num_new_doc_ids(&self) -> usize {
/// returns the number of new doc_ids
pub(crate) fn len(&self) -> usize {
self.new_doc_id_to_old.len()
}
pub fn num_old_doc_ids(&self) -> usize {
self.old_doc_id_to_new.len()
}
#[cfg(test)]
impl DocIdMapping {
    /// Looks up the old doc_id that the given new doc_id was mapped from.
    ///
    /// Panics if `doc_id` is out of bounds for the new-to-old table.
    fn get_old_doc_id(&self, doc_id: DocId) -> DocId {
        let new_doc_idx = doc_id as usize;
        self.new_doc_id_to_old[new_doc_idx]
    }
}

View File

@@ -33,6 +33,7 @@ mod stamper;
use crossbeam_channel as channel;
use smallvec::SmallVec;
pub use self::doc_id_mapping::DocIdMapping;
pub use self::index_writer::{advance_deletes, IndexWriter, IndexWriterOptions};
pub use self::log_merge_policy::LogMergePolicy;
pub use self::merge_operation::MergeOperation;

View File

@@ -4,7 +4,7 @@ use crate::directory::WritePtr;
use crate::fieldnorm::FieldNormsSerializer;
use crate::index::{Segment, SegmentComponent};
use crate::postings::InvertedIndexSerializer;
use crate::store::StoreWriter;
use crate::store::{Compressor, StoreWriter};
/// Segment serializer is in charge of laying out on disk
/// the data accumulated and sorted by the `SegmentWriter`.
@@ -25,17 +25,18 @@ impl SegmentSerializer {
// If the segment's docs are going to be reordered (index sorted by a field, or a manual
// doc id mapping enabled in the settings), we stream the docs first to a temporary file.
// In the merge case this is not necessary because we can kmerge the already sorted
// segments.
let remapping_required = segment.index().settings().sort_by_field.is_some() && !is_in_merge;
let settings = segment.index().settings().clone();
let remapping_required =
(settings.sort_by_field.is_some() || settings.manual_doc_id_mapping) && !is_in_merge;
let store_writer = if remapping_required {
let store_write = segment.open_write(SegmentComponent::TempStore)?;
StoreWriter::new(
store_write,
crate::store::Compressor::None,
Compressor::None,
// We want fast random access on the docs, so we choose a small block size.
// If this is zero, the skip index will contain too many checkpoints and
// therefore will be relatively slow.
16000,
16_000,
settings.docstore_compress_dedicated_thread,
)?
} else {

View File

@@ -1,5 +1,5 @@
use columnar::MonotonicallyMappableToU64;
use common::JsonPathWriter;
use common::{BitSet, JsonPathWriter};
use itertools::Itertools;
use tokenizer_api::BoxTokenStream;
@@ -136,10 +136,8 @@ impl SegmentWriter {
/// Lay on disk the current content of the `SegmentWriter`
///
/// Finalize consumes the `SegmentWriter`, so that it cannot
/// be used afterwards.
pub fn finalize(mut self) -> crate::Result<Vec<u64>> {
self.fieldnorms_writer.fill_up_to_max_doc(self.max_doc);
/// Finalize consumes the `SegmentWriter`, so that it cannot be used afterwards.
pub fn finalize(self) -> crate::Result<Vec<u64>> {
let mapping: Option<DocIdMapping> = self
.segment_serializer
.segment()
@@ -149,6 +147,54 @@ impl SegmentWriter {
.clone()
.map(|sort_by_field| get_doc_id_mapping_from_field(sort_by_field, &self))
.transpose()?;
self.finalize_inner(mapping.as_ref())
}
/// Writes the current content of the `SegmentWriter` to disk, using the caller-provided doc
/// id mapping.
///
/// The mapping must cover every document of this segment: it maps the segment's original doc
/// ids to the doc ids that should be written on disk.
///
/// # Errors
///
/// Returns `TantivyError::InvalidArgument` if
/// - `IndexSettings::manual_doc_id_mapping` is not enabled on the index,
/// - the mapping length differs from the number of documents in the segment, or
/// - the old doc ids of the mapping are not a permutation of the segment doc ids.
///
/// This method consumes the `SegmentWriter`, so that it cannot be used afterwards.
pub fn finalize_with_doc_id_mapping(self, mapping: &DocIdMapping) -> crate::Result<Vec<u64>> {
    let max_doc = self.max_doc;

    // The docstore can only be reordered if the segment writer was created in remap mode.
    let manual_mapping_enabled = self
        .segment_serializer
        .segment()
        .index()
        .settings()
        .manual_doc_id_mapping;
    if !manual_mapping_enabled {
        return Err(TantivyError::InvalidArgument(
            "IndexSettings::manual_doc_id_mapping must be set to true".to_string(),
        ));
    }

    // The mapping needs exactly one entry per document of the segment.
    let num_mapped_docs = mapping.len();
    if num_mapped_docs != max_doc as usize {
        return Err(TantivyError::InvalidArgument(format!(
            "Mapping must cover all documents in this segment. Expected {} documents, got {}",
            max_doc, num_mapped_docs
        )));
    }

    // Every old doc id must be in range and appear only once. The range test runs first, so
    // the bitset is never asked to record an id it was not sized for.
    let mut visited_doc_ids = BitSet::with_max_value(max_doc);
    let is_permutation = mapping
        .iter_old_doc_ids()
        .all(|old_doc_id| old_doc_id < max_doc && visited_doc_ids.insert(old_doc_id));
    if !is_permutation {
        return Err(TantivyError::InvalidArgument(
            "Mapping must be a permutation of the segment doc ids".to_string(),
        ));
    }

    self.finalize_inner(Some(mapping))
}
fn finalize_inner(mut self, mapping: Option<&DocIdMapping>) -> crate::Result<Vec<u64>> {
// Pad before remapping; the mapping indexes fieldnorms by old doc id.
self.fieldnorms_writer.fill_up_to_max_doc(self.max_doc);
remap_and_write(
self.schema,
&self.per_field_postings_writers,
@@ -156,9 +202,9 @@ impl SegmentWriter {
self.fast_field_writers,
&self.fieldnorms_writer,
self.segment_serializer,
mapping.as_ref(),
mapping,
)?;
let doc_opstamps = remap_doc_opstamps(self.doc_opstamps, mapping.as_ref());
let doc_opstamps = remap_doc_opstamps(self.doc_opstamps, mapping);
Ok(doc_opstamps)
}
@@ -485,6 +531,7 @@ mod tests {
use crate::collector::{Count, TopDocs};
use crate::directory::RamDirectory;
use crate::fastfield::FastValue;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::{Postings, TermInfo};
use crate::query::{PhraseQuery, QueryParser};
use crate::schema::{
@@ -497,7 +544,7 @@ mod tests {
use crate::tokenizer::{PreTokenizedString, Token};
use crate::{
DateTime, Directory, DocAddress, DocSet, Index, IndexWriter, SegmentReader,
TantivyDocument, Term, TERMINATED,
TantivyDocument, TantivyError, Term, TERMINATED,
};
#[test]
@@ -1136,4 +1183,118 @@ mod tests {
"Schema error: 'Error getting tokenizer for field: title'"
);
}
/// Test helper: creates an in-RAM index with manual doc id mapping enabled and returns it
/// together with a fresh segment and a `SegmentWriter` already fed with one document per
/// entry of `texts`.
///
/// The schema has an `order` fast/stored `u64` field and a `text` text field. Each document
/// stores its position in `texts` in `order` (also used as its opstamp), so callers can
/// recover the original document from that value. The `text` field is only set for `Some`
/// entries, so documents built from `None` have no text value to derive a fieldnorm from.
fn build_segment_writer_with_doc_id_mapping(
    texts: &[Option<&str>],
) -> (Index, crate::Segment, super::SegmentWriter) {
    let mut schema_builder = Schema::builder();
    let order = schema_builder.add_u64_field("order", FAST | STORED);
    let text = schema_builder.add_text_field("text", TEXT);

    // The setting has to be flipped before the segment is created from the index.
    let mut index = Index::create_in_ram(schema_builder.build());
    index.settings_mut().manual_doc_id_mapping = true;
    let segment = index.new_segment();

    let mut segment_writer =
        super::SegmentWriter::for_segment(15_000_000, segment.clone()).unwrap();
    for (position, maybe_text) in texts.iter().copied().enumerate() {
        let opstamp = position as u64;
        let mut document = TantivyDocument::default();
        document.add_u64(order, opstamp);
        if let Some(text_value) = maybe_text {
            document.add_text(text, text_value);
        }
        let add_operation = crate::indexer::AddOperation { opstamp, document };
        segment_writer.add_document(add_operation).unwrap();
    }

    (index, segment, segment_writer)
}
#[test]
fn test_finalize_with_doc_id_mapping_rejects_wrong_length() {
    // Three documents are indexed, but the mapping below only lists two of them.
    let docs = [Some("a"), Some("b"), Some("c")];
    let (_index, _segment, segment_writer) = build_segment_writer_with_doc_id_mapping(&docs);
    let too_short = DocIdMapping::from_new_id_to_old_id(vec![1, 0]);

    let result = segment_writer.finalize_with_doc_id_mapping(&too_short);

    let err = result.unwrap_err();
    assert!(
        matches!(err, TantivyError::InvalidArgument(_)),
        "unexpected error: {err:?}"
    );
}
#[test]
fn test_finalize_with_doc_id_mapping_rejects_out_of_range() {
    // Only doc ids 0 and 1 exist in this segment; the mapping refers to doc id 5.
    let docs = [Some("a"), Some("b")];
    let (_index, _segment, segment_writer) = build_segment_writer_with_doc_id_mapping(&docs);
    let out_of_range = DocIdMapping::from_new_id_to_old_id(vec![5, 0]);

    let result = segment_writer.finalize_with_doc_id_mapping(&out_of_range);

    let err = result.unwrap_err();
    assert!(
        matches!(err, TantivyError::InvalidArgument(_)),
        "unexpected error: {err:?}"
    );
}
#[test]
fn test_finalize_with_doc_id_mapping_rejects_duplicates() {
    let docs = [Some("a"), Some("b"), Some("c")];
    let (_index, _segment, segment_writer) = build_segment_writer_with_doc_id_mapping(&docs);
    // The mapping has the right length (3) but lists old doc id 0 twice and never lists
    // doc id 2, so only the permutation check can reject it.
    let with_duplicate = DocIdMapping::from_new_id_to_old_id(vec![0, 1, 0]);

    let result = segment_writer.finalize_with_doc_id_mapping(&with_duplicate);

    let err = result.unwrap_err();
    assert!(
        matches!(err, TantivyError::InvalidArgument(_)),
        "unexpected error: {err:?}"
    );
}
#[test]
fn test_finalize_with_doc_id_mapping_remaps_missing_fieldnorms() -> crate::Result<()> {
    // old doc 0: "alpha beta" (2 tokens)
    // old doc 1: <no text>    (missing fieldnorm -> 0)
    // old doc 2: "gamma"      (1 token)
    // old doc 3: <no text>    (missing fieldnorm -> 0)
    let docs = [Some("alpha beta"), None, Some("gamma"), None];
    let (index, segment, segment_writer) = build_segment_writer_with_doc_id_mapping(&docs);
    // `max_doc` has to be captured now: finalizing consumes the segment writer.
    let max_doc = segment_writer.max_doc();

    // Reverse the documents: new doc id `i` is old doc id `3 - i`.
    let reversed = DocIdMapping::from_new_id_to_old_id(vec![3, 2, 1, 0]);
    segment_writer.finalize_with_doc_id_mapping(&reversed)?;

    let segment_reader = SegmentReader::open(&segment.with_max_doc(max_doc))?;
    let text = index.schema().get_field("text").unwrap();
    let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text)?;

    // Fieldnorms in the new order:
    // new 0 <- old 3 (0), new 1 <- old 2 (1), new 2 <- old 1 (0), new 3 <- old 0 (2)
    let expected_fieldnorms = [0, 1, 0, 2];
    for (new_doc_id, expected_fieldnorm) in (0..).zip(expected_fieldnorms) {
        assert_eq!(fieldnorm_reader.fieldnorm(new_doc_id), expected_fieldnorm);
    }
    Ok(())
}
}

View File

@@ -2,7 +2,7 @@ use std::marker::PhantomData;
use crate::indexer::operation::AddOperation;
use crate::indexer::segment_updater::save_metas;
use crate::indexer::SegmentWriter;
use crate::indexer::{DocIdMapping, SegmentWriter};
use crate::schema::document::Document;
use crate::{Directory, Index, IndexMeta, Opstamp, Segment, TantivyDocument};
@@ -38,9 +38,29 @@ impl<D: Document> SingleSegmentIndexWriter<D> {
}
pub fn finalize(self) -> crate::Result<Index> {
let max_doc = self.segment_writer.max_doc();
self.segment_writer.finalize()?;
let segment: Segment = self.segment.with_max_doc(max_doc);
let Self {
segment,
segment_writer,
..
} = self;
let max_doc = segment_writer.max_doc();
segment_writer.finalize()?;
Self::finalize_inner(segment, max_doc)
}
pub fn finalize_with_doc_id_mapping(self, mapping: &DocIdMapping) -> crate::Result<Index> {
let Self {
segment,
segment_writer,
..
} = self;
let max_doc = segment_writer.max_doc();
segment_writer.finalize_with_doc_id_mapping(mapping)?;
Self::finalize_inner(segment, max_doc)
}
fn finalize_inner(segment: Segment, max_doc: u32) -> crate::Result<Index> {
let segment: Segment = segment.with_max_doc(max_doc);
let index = segment.index();
let index_meta = IndexMeta {
index_settings: index.settings().clone(),
@@ -51,6 +71,6 @@ impl<D: Document> SingleSegmentIndexWriter<D> {
};
save_metas(&index_meta, index.directory())?;
index.directory().sync_directory()?;
Ok(segment.index().clone())
Ok(index.clone())
}
}