Compare commits

...

1 Commits

Author SHA1 Message Date
Luca Cominardi
57a270393d feat: allow provided doc id mappings for segment serialization
Expose merge-time doc id mapping control and add a single-segment finalize path that accepts an explicit new-doc-id to old-doc-id permutation. This lets callers choose document order without adding a persistent sort field.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-25 18:31:58 +02:00
8 changed files with 353 additions and 32 deletions

View File

@@ -7,16 +7,20 @@ use super::SegmentWriter;
use crate::schema::{Field, Schema};
use crate::{DocAddress, DocId, IndexSortByField, TantivyError};
/// Describes how the document ID mapping was produced during a merge.
///
/// `Debug` is derived so the value can be logged and used in `assert_eq!`-style checks.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum MappingType {
    /// Segments are concatenated in order with no deletes; doc IDs are contiguous ranges.
    Stacked,
    /// Segments are concatenated in order but some documents have been deleted and are skipped.
    StackedWithDeletes,
    /// Documents have been reordered, for instance by a sort field or by caller-provided order.
    Shuffled,
}
/// Struct to provide mapping from new doc_id to old doc_id and segment.
#[derive(Clone)]
pub(crate) struct SegmentDocIdMapping {
pub struct SegmentDocIdMapping {
pub(crate) new_doc_id_to_old_doc_addr: Vec<DocAddress>,
pub(crate) alive_bitsets: Vec<Option<ReadOnlyBitSet>>,
mapping_type: MappingType,
@@ -35,6 +39,24 @@ impl SegmentDocIdMapping {
}
}
/// Creates a mapping tagged as [`MappingType::Shuffled`] from a caller-chosen document order.
///
/// Entry `new_id` of `new_doc_id_to_old_doc_addr` names the source segment and doc id of
/// the document that should end up at position `new_id` in the merged segment.
/// `alive_bitsets` is expected to hold one entry per source segment, in the same order as
/// the segments handed to `IndexMerger::open_with_custom_alive_set`.
///
/// Both arguments are stored as given; this constructor performs no validation.
pub fn new_shuffled(
    new_doc_id_to_old_doc_addr: Vec<DocAddress>,
    alive_bitsets: Vec<Option<ReadOnlyBitSet>>,
) -> Self {
    let mapping_type = MappingType::Shuffled;
    Self {
        mapping_type,
        alive_bitsets,
        new_doc_id_to_old_doc_addr,
    }
}
/// Returns the [`MappingType`] that describes how this mapping was constructed.
///
/// The value is copied out of the mapping; mappings built through `new_shuffled` report
/// [`MappingType::Shuffled`].
pub fn mapping_type(&self) -> MappingType {
self.mapping_type
}
@@ -71,6 +93,7 @@ pub struct DocIdMapping {
}
impl DocIdMapping {
/// Constructs a [`DocIdMapping`] from a vector mapping each new doc ID to its old doc ID.
pub fn from_new_id_to_old_id(new_doc_id_to_old: Vec<DocId>) -> Self {
let max_doc = new_doc_id_to_old.len();
let old_max_doc = new_doc_id_to_old
@@ -102,6 +125,7 @@ impl DocIdMapping {
self.new_doc_id_to_old.iter().cloned()
}
/// Returns a slice mapping each old doc ID to its corresponding new doc ID.
///
/// The slice borrows from this mapping and spans the whole old-to-new table, so its length
/// equals the value reported by `num_old_doc_ids`.
pub fn old_to_new_ids(&self) -> &[DocId] {
&self.old_doc_id_to_new[..]
}
@@ -113,9 +137,11 @@ impl DocIdMapping {
.map(|old_doc| els[*old_doc as usize])
.collect()
}
/// Returns the number of new doc IDs in this mapping.
///
/// This is the length of the new-to-old table held by the mapping.
pub fn num_new_doc_ids(&self) -> usize {
self.new_doc_id_to_old.len()
}
/// Returns the number of old doc IDs covered by this mapping.
///
/// This is the length of the old-to-new table held by the mapping.
pub fn num_old_doc_ids(&self) -> usize {
self.old_doc_id_to_new.len()
}

View File

@@ -113,6 +113,7 @@ fn estimate_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::
Ok(total_num_tokens)
}
/// Merges multiple index segments into one segment.
pub struct IndexMerger {
index_settings: IndexSettings,
schema: Schema,
@@ -218,6 +219,7 @@ impl IndexMerger {
.any(|doc_id| col.first(doc_id).is_none())
}
/// Opens an [`IndexMerger`] over the given segments using their existing delete sets.
pub fn open(
schema: Schema,
index_settings: IndexSettings,
@@ -239,6 +241,9 @@ impl IndexMerger {
// This can be used to merge but also apply an additional filter.
// One use case is demux, which is basically taking a list of
// segments and partitions them e.g. by a value in a field.
/// Opens an [`IndexMerger`] with a custom alive set per segment.
///
/// Each entry in `alive_bitset_opt` corresponds to the segment at the same ordinal.
pub fn open_with_custom_alive_set(
schema: Schema,
index_settings: IndexSettings,
@@ -947,7 +952,7 @@ impl IndexMerger {
///
/// # Returns
/// The number of documents in the resulting segment.
pub fn write(&self, mut serializer: SegmentSerializer) -> crate::Result<u32> {
pub fn write(&self, serializer: SegmentSerializer) -> crate::Result<u32> {
let doc_id_mapping = if let Some(sort_by_field) = self.index_settings.sort_by_field.as_ref()
{
if self.is_disjunct_and_sorted_on_sort_property(sort_by_field)? {
@@ -958,6 +963,27 @@ impl IndexMerger {
} else {
self.get_doc_id_from_concatenated_data()?
};
self.write_with_mapping(serializer, doc_id_mapping)
}
/// Like [`IndexMerger::write`], but uses the caller-supplied `doc_id_mapping` instead of
/// deriving one from an index sort field.
///
/// The mapping must cover all live documents across every segment passed to
/// [`IndexMerger::open_with_custom_alive_set`].
///
/// This method does not inspect or validate `doc_id_mapping`; it forwards the serializer
/// and the mapping unchanged to the shared write path.
///
/// # Returns
/// The `u32` produced by the shared write path; per the documentation of
/// [`IndexMerger::write`], the number of documents in the resulting segment.
pub fn write_with_doc_id_mapping(
&self,
serializer: SegmentSerializer,
doc_id_mapping: SegmentDocIdMapping,
) -> crate::Result<u32> {
// Same code path as `write`, minus the sort-field based mapping selection.
self.write_with_mapping(serializer, doc_id_mapping)
}
fn write_with_mapping(
&self,
mut serializer: SegmentSerializer,
doc_id_mapping: SegmentDocIdMapping,
) -> crate::Result<u32> {
debug!("write-fieldnorms");
if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
self.write_fieldnorms(fieldnorms_serializer, &doc_id_mapping)?;

View File

@@ -8,7 +8,7 @@
pub(crate) mod delete_queue;
pub(crate) mod path_to_unordered_id;
pub(crate) mod doc_id_mapping;
pub mod doc_id_mapping;
mod doc_opstamp_mapping;
mod flat_map_with_buffer;
pub(crate) mod index_writer;
@@ -17,7 +17,8 @@ pub(crate) mod indexing_term;
mod log_merge_policy;
mod merge_operation;
pub(crate) mod merge_policy;
pub(crate) mod merger;
/// Segment merger APIs for combining multiple existing segments.
pub mod merger;
mod merger_sorted_index_test;
pub(crate) mod operation;
pub(crate) mod prepared_commit;
@@ -33,15 +34,19 @@ mod stamper;
use crossbeam_channel as channel;
use smallvec::SmallVec;
pub use self::doc_id_mapping::SegmentDocIdMapping;
pub use self::index_writer::{advance_deletes, IndexWriter, IndexWriterOptions};
pub use self::log_merge_policy::LogMergePolicy;
pub use self::merge_operation::MergeOperation;
pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
pub use self::merger::IndexMerger;
pub use self::operation::{AddOperation, DeleteOperation, UserOperation};
pub use self::prepared_commit::PreparedCommit;
pub use self::segment_entry::SegmentEntry;
pub(crate) use self::segment_serializer::SegmentSerializer;
pub use self::segment_updater::{merge_filtered_segments, merge_indices};
pub use self::segment_updater::{
merge_filtered_segments, merge_indices, merge_segments_with_doc_id_mapping,
};
pub use self::segment_writer::SegmentWriter;
pub use self::single_segment_index_writer::SingleSegmentIndexWriter;

View File

@@ -11,6 +11,7 @@ use crate::store::StoreWriter;
pub struct SegmentSerializer {
segment: Segment,
pub(crate) store_writer: StoreWriter,
store_is_temp: bool,
fast_field_write: WritePtr,
fieldnorms_serializer: Option<FieldNormsSerializer>,
postings_serializer: InvertedIndexSerializer,
@@ -18,14 +19,19 @@ pub struct SegmentSerializer {
impl SegmentSerializer {
/// Creates a new `SegmentSerializer`.
pub fn for_segment(
mut segment: Segment,
is_in_merge: bool,
) -> crate::Result<SegmentSerializer> {
pub fn for_segment(segment: Segment, is_in_merge: bool) -> crate::Result<SegmentSerializer> {
// If the segment is going to be sorted, we stream the docs first to a temporary file.
// In the merge case this is not necessary because we can kmerge the already sorted
// segments
let remapping_required = segment.index().settings().sort_by_field.is_some() && !is_in_merge;
Self::for_segment_with_remapping_required(segment, remapping_required)
}
/// Creates a new `SegmentSerializer` with an explicit remapping requirement.
pub fn for_segment_with_remapping_required(
mut segment: Segment,
remapping_required: bool,
) -> crate::Result<SegmentSerializer> {
let settings = segment.index().settings().clone();
let store_writer = if remapping_required {
let store_write = segment.open_write(SegmentComponent::TempStore)?;
@@ -57,6 +63,7 @@ impl SegmentSerializer {
Ok(SegmentSerializer {
segment,
store_writer,
store_is_temp: remapping_required,
fast_field_write,
fieldnorms_serializer: Some(fieldnorms_serializer),
postings_serializer,
@@ -76,6 +83,10 @@ impl SegmentSerializer {
&mut self.segment
}
/// Returns `true` if this serializer was created with remapping required.
///
/// In that case the store writer was opened on the `TempStore` segment component, and the
/// final store still has to be produced from it.
pub fn store_is_temp(&self) -> bool {
self.store_is_temp
}
/// Accessor to the `PostingsSerializer`.
pub fn get_postings_serializer(&mut self) -> &mut InvertedIndexSerializer {
&mut self.postings_serializer

View File

@@ -15,6 +15,7 @@ use crate::directory::{Directory, DirectoryClone, GarbageCollectionResult};
use crate::fastfield::AliveBitSet;
use crate::index::{Index, IndexMeta, IndexSettings, Segment, SegmentId, SegmentMeta};
use crate::indexer::delete_queue::DeleteCursor;
use crate::indexer::doc_id_mapping::SegmentDocIdMapping;
use crate::indexer::index_writer::advance_deletes;
use crate::indexer::merge_operation::MergeOperationInventory;
use crate::indexer::merger::IndexMerger;
@@ -255,6 +256,81 @@ pub fn merge_filtered_segments<T: Into<Box<dyn Directory>>>(
Ok(merged_index)
}
/// Like [`merge_filtered_segments`], but uses a caller-supplied [`SegmentDocIdMapping`]
/// to control the final document order.
///
/// The mapping should be built from the same segments, in the same order, passed here.
///
/// # Arguments
/// * `segments` - source segments to merge; all of them must share the same schema.
/// * `target_settings` - settings used to create the output index and recorded in its meta.
/// * `filter_doc_ids` - per-segment alive sets, forwarded unchanged to
///   [`IndexMerger::open_with_custom_alive_set`].
/// * `doc_id_mapping` - the document order to serialize, forwarded unchanged to
///   [`IndexMerger::write_with_doc_id_mapping`].
/// * `output_directory` - directory in which the merged index is created.
///
/// # Errors
/// Returns `TantivyError::InvalidArgument` if `segments` is empty or if the segments do not
/// all share the schema of the first one. Errors from index creation, merging and meta
/// persistence are propagated.
///
/// # Warning
/// Same caveats as [`merge_filtered_segments`]: no live `IndexWriter` is allowed.
#[doc(hidden)]
pub fn merge_segments_with_doc_id_mapping<T: Into<Box<dyn Directory>>>(
    segments: &[Segment],
    target_settings: IndexSettings,
    filter_doc_ids: Vec<Option<AliveBitSet>>,
    doc_id_mapping: SegmentDocIdMapping,
    output_directory: T,
) -> crate::Result<Index> {
    if segments.is_empty() {
        return Err(crate::TantivyError::InvalidArgument(
            "No segments given to merge".to_string(),
        ));
    }

    // The first segment defines the schema every other segment has to match.
    let target_schema = segments[0].schema();
    if segments
        .iter()
        .skip(1)
        .any(|segment| segment.schema() != target_schema)
    {
        return Err(crate::TantivyError::InvalidArgument(
            "Attempt to merge different schema indices".to_string(),
        ));
    }

    let mut merged_index = Index::create(
        output_directory,
        target_schema.clone(),
        target_settings.clone(),
    )?;
    let merged_segment = merged_index.new_segment();
    let merged_segment_id = merged_segment.id();
    let merger = IndexMerger::open_with_custom_alive_set(
        merged_index.schema(),
        merged_index.settings().clone(),
        segments,
        filter_doc_ids,
    )?;
    let segment_serializer = SegmentSerializer::for_segment(merged_segment, true)?;
    let num_docs = merger.write_with_doc_id_mapping(segment_serializer, doc_id_mapping)?;

    let segment_meta = merged_index.new_segment_meta(merged_segment_id, num_docs);

    // Space-separated list of the source segment ids. Joining once avoids re-allocating the
    // accumulated string for every segment and needs no trailing-separator cleanup.
    let merged_segment_ids = segments
        .iter()
        .map(|segment| segment.meta().id().uuid_string())
        .collect::<Vec<_>>()
        .join(" ");
    let stats = format!("Segments Merge (external reordering): [{merged_segment_ids}]");

    let index_meta = IndexMeta {
        index_settings: target_settings,
        segments: vec![segment_meta],
        schema: target_schema,
        opstamp: 0u64,
        payload: Some(stats),
    };
    save_metas(&index_meta, merged_index.directory_mut())?;

    Ok(merged_index)
}
pub(crate) struct InnerSegmentUpdater {
// we keep a copy of the current active IndexMeta to
// avoid loading the file every time we need it in the

View File

@@ -18,7 +18,9 @@ use crate::postings::{
use crate::schema::document::{Document, Value};
use crate::schema::{FieldEntry, FieldType, Schema, DATE_TIME_PRECISION_INDEXED};
use crate::store::{StoreReader, StoreWriter};
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer};
use crate::tokenizer::{
FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer, TokenizerManager,
};
use crate::{DocId, Opstamp, TantivyError};
/// Computes the initial size of the hash table.
@@ -90,8 +92,42 @@ impl SegmentWriter {
let schema = segment.schema();
let tokenizer_manager = segment.index().tokenizers().clone();
let tokenizer_manager_fast_field = segment.index().fast_field_tokenizer().clone();
let table_size = compute_initial_table_size(memory_budget_in_bytes)?;
let segment_serializer = SegmentSerializer::for_segment(segment, false)?;
Self::for_segment_serializer(
memory_budget_in_bytes,
schema,
tokenizer_manager,
tokenizer_manager_fast_field,
segment_serializer,
)
}
/// Creates a `SegmentWriter` whose serializer is always built with remapping required.
///
/// Unlike `for_segment`, the remapping flag does not depend on the index sort settings: the
/// serializer is created through `SegmentSerializer::for_segment_with_remapping_required`
/// with `true`.
///
/// * `memory_budget_in_bytes` - forwarded to the shared constructor.
/// * `segment` - the segment to write; its schema and tokenizer managers are captured
///   before the segment is handed to the serializer.
pub(crate) fn for_segment_with_provided_doc_id_mapping(
    memory_budget_in_bytes: usize,
    segment: Segment,
) -> crate::Result<Self> {
    let schema = segment.schema();
    // Grab both tokenizer managers while the segment is still borrowed; it is moved into
    // the serializer right after.
    let (tokenizers, fast_field_tokenizers) = {
        let index = segment.index();
        (
            index.tokenizers().clone(),
            index.fast_field_tokenizer().clone(),
        )
    };
    let remapping_serializer =
        SegmentSerializer::for_segment_with_remapping_required(segment, true)?;
    Self::for_segment_serializer(
        memory_budget_in_bytes,
        schema,
        tokenizers,
        fast_field_tokenizers,
        remapping_serializer,
    )
}
fn for_segment_serializer(
memory_budget_in_bytes: usize,
schema: Schema,
tokenizer_manager: TokenizerManager,
tokenizer_manager_fast_field: TokenizerManager,
segment_serializer: SegmentSerializer,
) -> crate::Result<Self> {
let table_size = compute_initial_table_size(memory_budget_in_bytes)?;
let per_field_postings_writers = PerFieldPostingsWriter::for_schema(&schema);
let per_field_text_analyzers = schema
.fields()
@@ -149,6 +185,31 @@ impl SegmentWriter {
.clone()
.map(|sort_by_field| get_doc_id_mapping_from_field(sort_by_field, &self))
.transpose()?;
self.finalize_with_mapping(mapping.as_ref())
}
/// Lay on disk the current content of the `SegmentWriter`, using a caller-provided document
/// order.
///
/// `new_doc_id_to_old_doc_id[new_id]` is the old document id of the document that should be
/// serialized at `new_id`. The vector must be a permutation of `0..max_doc`.
///
/// # Errors
/// Returns `TantivyError::InvalidArgument` if:
/// * the mapping length differs from the segment's `max_doc`,
/// * the segment serializer was not created with a temporary doc store, in which case the
///   stored documents could not be rewritten in the new order,
/// * the mapping references an old doc id outside `0..max_doc`, or
/// * the mapping references the same old doc id more than once.
///
/// Errors raised while serializing the segment are propagated.
pub fn finalize_with_doc_id_mapping(
    mut self,
    new_doc_id_to_old_doc_id: Vec<DocId>,
) -> crate::Result<Vec<u64>> {
    let num_docs = self.max_doc as usize;
    if new_doc_id_to_old_doc_id.len() != num_docs {
        return Err(crate::TantivyError::InvalidArgument(format!(
            "provided doc id mapping length {} does not match segment max_doc {}",
            new_doc_id_to_old_doc_id.len(),
            self.max_doc
        )));
    }

    // The doc store is only rewritten in the new order when it was first streamed to the
    // temporary store. Without it, postings and fast fields would be remapped while the
    // stored documents kept their insertion order.
    if !self.segment_serializer.store_is_temp() {
        return Err(crate::TantivyError::InvalidArgument(
            "provided doc id mapping requires a segment serializer created with remapping \
             required"
                .to_string(),
        ));
    }

    // With the length already checked, "in range and never repeated" is equivalent to
    // "is a permutation of 0..max_doc".
    let mut seen = vec![false; num_docs];
    for &old_doc_id in &new_doc_id_to_old_doc_id {
        match seen.get_mut(old_doc_id as usize) {
            Some(already_seen) => {
                if *already_seen {
                    return Err(crate::TantivyError::InvalidArgument(format!(
                        "provided doc id mapping references old doc id {old_doc_id} more than \
                         once"
                    )));
                }
                *already_seen = true;
            }
            None => {
                return Err(crate::TantivyError::InvalidArgument(format!(
                    "provided doc id mapping references old doc id {old_doc_id}, which is out \
                     of range for segment max_doc {}",
                    self.max_doc
                )));
            }
        }
    }

    self.fieldnorms_writer.fill_up_to_max_doc(self.max_doc);
    let mapping = DocIdMapping::from_new_id_to_old_id(new_doc_id_to_old_doc_id);
    self.finalize_with_mapping(Some(&mapping))
}
fn finalize_with_mapping(self, mapping: Option<&DocIdMapping>) -> crate::Result<Vec<u64>> {
remap_and_write(
self.schema,
&self.per_field_postings_writers,
@@ -156,9 +217,10 @@ impl SegmentWriter {
self.fast_field_writers,
&self.fieldnorms_writer,
self.segment_serializer,
mapping.as_ref(),
self.max_doc,
mapping,
)?;
let doc_opstamps = remap_doc_opstamps(self.doc_opstamps, mapping.as_ref());
let doc_opstamps = remap_doc_opstamps(self.doc_opstamps, mapping);
Ok(doc_opstamps)
}
@@ -420,6 +482,7 @@ fn remap_and_write(
fast_field_writers: FastFieldsWriter,
fieldnorms_writer: &FieldNormsWriter,
mut serializer: SegmentSerializer,
max_doc: DocId,
doc_id_map: Option<&DocIdMapping>,
) -> crate::Result<()> {
debug!("remap-and-write");
@@ -441,9 +504,10 @@ fn remap_and_write(
debug!("fastfield-serialize");
fast_field_writers.serialize(serializer.get_fast_field_write(), doc_id_map)?;
// finalize temp docstore and create version, which reflects the doc_id_map
if let Some(doc_id_map) = doc_id_map {
debug!("resort-docstore");
// Finalize the temp docstore and create the final store. With a mapping, the final store
// reflects the new doc id order; without one, it preserves the insertion order.
if serializer.store_is_temp() {
debug!("rewrite-docstore");
let store_write = serializer
.segment_mut()
.open_write(SegmentComponent::Store)?;
@@ -463,7 +527,10 @@ fn remap_and_write(
1, /* The docstore is configured to have one doc per block, and each doc is
* accessed only once: we don't need caching. */
)?;
for old_doc_id in doc_id_map.iter_old_doc_ids() {
let old_doc_ids = doc_id_map
.map(|doc_id_map| doc_id_map.iter_old_doc_ids().collect::<Vec<_>>())
.unwrap_or_else(|| (0..max_doc).collect::<Vec<_>>());
for old_doc_id in old_doc_ids {
let doc_bytes = store_read.get_document_bytes(old_doc_id)?;
serializer.get_store_writer().store_bytes(&doc_bytes)?;
}

View File

@@ -4,7 +4,7 @@ use crate::indexer::operation::AddOperation;
use crate::indexer::segment_updater::save_metas;
use crate::indexer::SegmentWriter;
use crate::schema::document::Document;
use crate::{Directory, Index, IndexMeta, Opstamp, Segment, TantivyDocument};
use crate::{Directory, DocId, Index, IndexMeta, Opstamp, Segment, TantivyDocument};
#[doc(hidden)]
pub struct SingleSegmentIndexWriter<D: Document = TantivyDocument> {
@@ -17,7 +17,8 @@ pub struct SingleSegmentIndexWriter<D: Document = TantivyDocument> {
impl<D: Document> SingleSegmentIndexWriter<D> {
pub fn new(index: Index, mem_budget: usize) -> crate::Result<Self> {
let segment = index.new_segment();
let segment_writer = SegmentWriter::for_segment(mem_budget, segment.clone())?;
let segment_writer =
SegmentWriter::for_segment_with_provided_doc_id_mapping(mem_budget, segment.clone())?;
Ok(Self {
segment_writer,
segment,
@@ -40,17 +41,123 @@ impl<D: Document> SingleSegmentIndexWriter<D> {
pub fn finalize(self) -> crate::Result<Index> {
let max_doc = self.segment_writer.max_doc();
self.segment_writer.finalize()?;
let segment: Segment = self.segment.with_max_doc(max_doc);
let index = segment.index();
let index_meta = IndexMeta {
index_settings: index.settings().clone(),
segments: vec![segment.meta().clone()],
schema: index.schema(),
opstamp: 0,
payload: None,
};
save_metas(&index_meta, index.directory())?;
index.directory().sync_directory()?;
Ok(segment.index().clone())
finalize_segment(self.segment, max_doc)
}
/// Finalizes this single-segment index using a caller-provided document order.
///
/// `new_doc_id_to_old_doc_id[new_id]` is the old insertion doc id of the document that should
/// be serialized at `new_id`.
///
/// # Errors
/// Propagates any error from the segment writer's `finalize_with_doc_id_mapping` (for
/// instance when the mapping length does not match the number of documents) and from
/// writing the index metadata.
pub fn finalize_with_doc_id_mapping(
self,
new_doc_id_to_old_doc_id: Vec<DocId>,
) -> crate::Result<Index> {
// Read `max_doc` first: finalizing consumes the segment writer.
let max_doc = self.segment_writer.max_doc();
self.segment_writer
.finalize_with_doc_id_mapping(new_doc_id_to_old_doc_id)?;
finalize_segment(self.segment, max_doc)
}
}
/// Persists the metadata of a finished single-segment index and returns that index.
///
/// Records `max_doc` on `segment`, saves an `IndexMeta` that lists it as the only segment
/// (opstamp `0`, no payload), and then syncs the index directory.
fn finalize_segment(segment: Segment, max_doc: DocId) -> crate::Result<Index> {
    let finished_segment = segment.with_max_doc(max_doc);
    let index = finished_segment.index();
    save_metas(
        &IndexMeta {
            index_settings: index.settings().clone(),
            segments: vec![finished_segment.meta().clone()],
            schema: index.schema(),
            opstamp: 0,
            payload: None,
        },
        index.directory(),
    )?;
    index.directory().sync_directory()?;
    Ok(index.clone())
}
#[cfg(test)]
mod tests {
use crate::collector::TopDocs;
use crate::directory::RamDirectory;
use crate::query::QueryParser;
use crate::schema::{
IndexRecordOption, NumericOptions, Schema, TextFieldIndexing, TextOptions, Value, STORED,
};
use crate::{Index, ReloadPolicy, TantivyDocument};
/// Indexes three documents, finalizes with the order `[2, 0, 1]`, and checks that the fast
/// field, the fieldnorms, the doc store and the search results all reflect the new order.
#[test]
fn test_finalize_with_doc_id_mapping() -> crate::Result<()> {
// Schema: a fast u64 field, an indexed + stored text field with fieldnorms, and a
// stored-only text field that tags each document with its insertion position.
let mut schema_builder = Schema::builder();
let id_field = schema_builder.add_u64_field("id", NumericOptions::default().set_fast());
let text_field = schema_builder.add_text_field(
"text",
TextOptions::default().set_stored().set_indexing_options(
TextFieldIndexing::default()
.set_index_option(IndexRecordOption::WithFreqs)
.set_fieldnorms(true),
),
);
let stored_field = schema_builder.add_text_field("stored", STORED);
let schema = schema_builder.build();
let mut writer = Index::builder()
.schema(schema)
.single_segment_index_writer(RamDirectory::create(), 15_000_000)?;
// Old doc id 0.
writer.add_document(doc!(
id_field => 10u64,
text_field => "alpha beta",
stored_field => "old-0",
))?;
// Old doc id 1.
writer.add_document(doc!(
id_field => 20u64,
text_field => "alpha",
stored_field => "old-1",
))?;
// Old doc id 2.
writer.add_document(doc!(
id_field => 30u64,
text_field => "beta",
stored_field => "old-2",
))?;
// New order: old 2 first, then old 0, then old 1.
let index = writer.finalize_with_doc_id_mapping(vec![2, 0, 1])?;
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()?;
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0);
// Fast field values must follow the new order.
let fast_field = segment_reader
.fast_fields()
.u64("id")?
.first_or_default_col(0);
assert_eq!(fast_field.get_val(0), 30u64);
assert_eq!(fast_field.get_val(1), 10u64);
assert_eq!(fast_field.get_val(2), 20u64);
// Expected fieldnorms equal the token counts of "beta", "alpha beta" and "alpha".
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field)?;
assert_eq!(fieldnorm_reader.fieldnorm(0), 1);
assert_eq!(fieldnorm_reader.fieldnorm(1), 2);
assert_eq!(fieldnorm_reader.fieldnorm(2), 1);
// Stored documents must be readable at their new doc ids.
// NOTE(review): a store reader is opened on every iteration; it could be hoisted.
let mut stored_values = Vec::new();
for doc_id in 0..segment_reader.max_doc() {
let doc: TantivyDocument = segment_reader.get_store_reader(1024)?.get(doc_id)?;
let stored_value = doc
.get_first(stored_field)
.and_then(|value| value.as_str())
.unwrap();
stored_values.push(stored_value.to_string());
}
assert_eq!(stored_values, ["old-2", "old-0", "old-1"]);
// "beta" occurs in old docs 2 and 0, i.e. new doc ids 0 and 1. The expected ranking puts
// the single-token document first — presumably because it scores higher; confirm
// against the scorer if this assertion ever changes.
let query = QueryParser::for_index(&index, vec![text_field]).parse_query("beta")?;
let top_docs: Vec<(_, _)> =
searcher.search(&query, &TopDocs::with_limit(10).order_by_score())?;
let doc_ids = top_docs
.into_iter()
.map(|(_, doc_address)| doc_address.doc_id)
.collect::<Vec<_>>();
assert_eq!(doc_ids, [0, 1]);
Ok(())
}
}

View File

@@ -229,7 +229,10 @@ pub use crate::index::{
Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, InvertedIndexReader, Order,
Segment, SegmentMeta, SegmentReader,
};
pub use crate::indexer::{IndexWriter, SingleSegmentIndexWriter};
pub use crate::indexer::{
merge_segments_with_doc_id_mapping, IndexMerger, IndexWriter, SegmentDocIdMapping,
SingleSegmentIndexWriter,
};
pub use crate::schema::{Document, TantivyDocument, Term};
/// Index format version.