mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-22 18:30:41 +00:00
Compare commits
4 Commits
mallets/so
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
384f31d350 | ||
|
|
1e859fd78d | ||
|
|
f451fa938f | ||
|
|
2a82dd6f64 |
@@ -54,6 +54,6 @@ pub fn generate_columnar_with_name(card: Card, num_docs: u32, column_name: &str)
|
||||
}
|
||||
|
||||
let mut wrt: Vec<u8> = Vec::new();
|
||||
columnar_writer.serialize(num_docs, None, &mut wrt).unwrap();
|
||||
columnar_writer.serialize(num_docs, &mut wrt).unwrap();
|
||||
ColumnarReader::open(wrt).unwrap()
|
||||
}
|
||||
|
||||
@@ -275,7 +275,7 @@ impl SegmentCompositeCollector {
|
||||
dict.insert(
|
||||
key,
|
||||
IntermediateCompositeBucketEntry {
|
||||
doc_count: agg.count,
|
||||
doc_count: agg.count as u64,
|
||||
sub_aggregation: sub_aggregation_res,
|
||||
},
|
||||
);
|
||||
|
||||
@@ -957,7 +957,7 @@ fn into_intermediate_bucket_entry(
|
||||
)?;
|
||||
}
|
||||
Ok(IntermediateTermBucketEntry {
|
||||
doc_count: bucket.count,
|
||||
doc_count: bucket.count as u64,
|
||||
sub_aggregation: sub_aggregation_res,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -98,7 +98,7 @@ impl SegmentAggregationCollector for TermMissingAgg {
|
||||
|
||||
let missing_count = &self.missing_count_per_bucket[parent_bucket_id as usize];
|
||||
let mut missing_entry = IntermediateTermBucketEntry {
|
||||
doc_count: missing_count.missing_count,
|
||||
doc_count: missing_count.missing_count as u64,
|
||||
sub_aggregation: Default::default(),
|
||||
};
|
||||
if let Some(sub_agg) = &mut self.sub_agg {
|
||||
|
||||
@@ -930,7 +930,7 @@ impl IntermediateRangeBucketEntry {
|
||||
#[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IntermediateTermBucketEntry {
|
||||
/// The number of documents in the bucket.
|
||||
pub doc_count: u32,
|
||||
pub doc_count: u64,
|
||||
/// The sub_aggregation in this bucket.
|
||||
pub sub_aggregation: IntermediateAggregationResults,
|
||||
}
|
||||
@@ -1240,6 +1240,24 @@ mod tests {
|
||||
assert_eq!(tree_left, tree_expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_term_bucket_doc_count_no_u32_overflow() {
|
||||
// Two segments each contributing (u32::MAX - 100) docs to the same term. Summing them
|
||||
// overflowed when doc_count was u32.
|
||||
let per_segment = u32::MAX as u64 - 100;
|
||||
let mut entry = IntermediateTermBucketEntry {
|
||||
doc_count: per_segment,
|
||||
sub_aggregation: Default::default(),
|
||||
};
|
||||
entry
|
||||
.merge_fruits(IntermediateTermBucketEntry {
|
||||
doc_count: per_segment,
|
||||
sub_aggregation: Default::default(),
|
||||
})
|
||||
.unwrap();
|
||||
assert_eq!(entry.doc_count, per_segment * 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_fruits_tree_empty() {
|
||||
let mut tree_left = get_intermediate_tree_with_ranges(&[
|
||||
|
||||
@@ -7,24 +7,16 @@ use super::SegmentWriter;
|
||||
use crate::schema::{Field, Schema};
|
||||
use crate::{DocAddress, DocId, IndexSortByField, TantivyError};
|
||||
|
||||
/// Describes how the document ID mapping was produced during a merge.
|
||||
#[derive(Copy, Clone, Eq, PartialEq)]
|
||||
pub enum MappingType {
|
||||
/// Segments are concatenated in order with no deletes; doc IDs are contiguous ranges.
|
||||
Stacked,
|
||||
/// Segments are concatenated in order but some documents have been deleted and are skipped.
|
||||
StackedWithDeletes,
|
||||
/// Documents have been reordered (e.g. sorted by a field or externally shuffled).
|
||||
Shuffled,
|
||||
}
|
||||
|
||||
/// Struct to provide mapping from new doc_id to old doc_id and segment.
|
||||
///
|
||||
/// Callers outside tantivy (e.g. pomsky's merge executor) can construct a
|
||||
/// `Shuffled` mapping directly from a precomputed permutation and pass it
|
||||
/// into [`IndexMerger::write_with_doc_id_mapping`].
|
||||
#[derive(Clone)]
|
||||
pub struct SegmentDocIdMapping {
|
||||
pub(crate) struct SegmentDocIdMapping {
|
||||
pub(crate) new_doc_id_to_old_doc_addr: Vec<DocAddress>,
|
||||
pub(crate) alive_bitsets: Vec<Option<ReadOnlyBitSet>>,
|
||||
mapping_type: MappingType,
|
||||
@@ -43,25 +35,6 @@ impl SegmentDocIdMapping {
|
||||
}
|
||||
}
|
||||
|
||||
/// Build a `Shuffled` mapping from an explicit permutation of [`DocAddress`]es.
|
||||
///
|
||||
/// `new_doc_id_to_old_doc_addr[new_id]` gives the source segment and doc id for
|
||||
/// the document that should appear at position `new_id` in the merged segment.
|
||||
/// `alive_bitsets` must contain one entry per source segment (in the same order
|
||||
/// as passed to [`IndexMerger::open_with_custom_alive_set`]), each `None` if that
|
||||
/// segment has no deletes.
|
||||
pub fn new_shuffled(
|
||||
new_doc_id_to_old_doc_addr: Vec<DocAddress>,
|
||||
alive_bitsets: Vec<Option<ReadOnlyBitSet>>,
|
||||
) -> Self {
|
||||
Self {
|
||||
new_doc_id_to_old_doc_addr,
|
||||
mapping_type: MappingType::Shuffled,
|
||||
alive_bitsets,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the [`MappingType`] that describes how this mapping was constructed.
|
||||
pub fn mapping_type(&self) -> MappingType {
|
||||
self.mapping_type
|
||||
}
|
||||
@@ -91,14 +64,13 @@ impl SegmentDocIdMapping {
|
||||
}
|
||||
}
|
||||
|
||||
/// Bidirectional mapping between old and new doc IDs within a single segment.
|
||||
/// Struct to provide mapping from old doc_id to new doc_id and vice versa within a segment.
|
||||
pub struct DocIdMapping {
|
||||
new_doc_id_to_old: Vec<DocId>,
|
||||
old_doc_id_to_new: Vec<DocId>,
|
||||
}
|
||||
|
||||
impl DocIdMapping {
|
||||
/// Constructs a [`DocIdMapping`] from a vector mapping each new doc ID to its old doc ID.
|
||||
pub fn from_new_id_to_old_id(new_doc_id_to_old: Vec<DocId>) -> Self {
|
||||
let max_doc = new_doc_id_to_old.len();
|
||||
let old_max_doc = new_doc_id_to_old
|
||||
@@ -130,7 +102,6 @@ impl DocIdMapping {
|
||||
self.new_doc_id_to_old.iter().cloned()
|
||||
}
|
||||
|
||||
/// Returns a slice mapping each old doc ID to its corresponding new doc ID.
|
||||
pub fn old_to_new_ids(&self) -> &[DocId] {
|
||||
&self.old_doc_id_to_new[..]
|
||||
}
|
||||
@@ -142,11 +113,9 @@ impl DocIdMapping {
|
||||
.map(|old_doc| els[*old_doc as usize])
|
||||
.collect()
|
||||
}
|
||||
/// Returns the number of new (post-sort) doc IDs in this mapping.
|
||||
pub fn num_new_doc_ids(&self) -> usize {
|
||||
self.new_doc_id_to_old.len()
|
||||
}
|
||||
/// Returns the number of old (pre-sort) doc IDs covered by this mapping.
|
||||
pub fn num_old_doc_ids(&self) -> usize {
|
||||
self.old_doc_id_to_new.len()
|
||||
}
|
||||
|
||||
@@ -113,7 +113,6 @@ fn estimate_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::
|
||||
Ok(total_num_tokens)
|
||||
}
|
||||
|
||||
/// Merges multiple index segments into a single new segment.
|
||||
pub struct IndexMerger {
|
||||
index_settings: IndexSettings,
|
||||
schema: Schema,
|
||||
@@ -219,7 +218,6 @@ impl IndexMerger {
|
||||
.any(|doc_id| col.first(doc_id).is_none())
|
||||
}
|
||||
|
||||
/// Opens an [`IndexMerger`] over the given segments using their existing delete sets.
|
||||
pub fn open(
|
||||
schema: Schema,
|
||||
index_settings: IndexSettings,
|
||||
@@ -229,14 +227,18 @@ impl IndexMerger {
|
||||
Self::open_with_custom_alive_set(schema, index_settings, segments, alive_bitset)
|
||||
}
|
||||
|
||||
/// Opens an [`IndexMerger`] with a custom alive (delete) set per segment.
|
||||
///
|
||||
/// For every segment, an optional [`AliveBitSet`] can be provided which is intersected
|
||||
/// with the segment's existing alive set. Pass `None` for a segment to use its existing
|
||||
/// delete set unchanged.
|
||||
///
|
||||
/// This allows merging while also applying an additional filter, for example to demux
|
||||
/// documents by a field value into separate output segments.
|
||||
// Create merge with a custom delete set.
|
||||
// For every Segment, a delete bitset can be provided, which
|
||||
// will be merged with the existing bit set. Make sure the index
|
||||
// corresponds to the segment index.
|
||||
//
|
||||
// If `None` is provided for custom alive set, the regular alive set will be used.
|
||||
// If a alive_bitset is provided, the union between the provided and regular
|
||||
// alive set will be used.
|
||||
//
|
||||
// This can be used to merge but also apply an additional filter.
|
||||
// One use case is demux, which is basically taking a list of
|
||||
// segments and partitions them e.g. by a value in a field.
|
||||
pub fn open_with_custom_alive_set(
|
||||
schema: Schema,
|
||||
index_settings: IndexSettings,
|
||||
@@ -945,7 +947,7 @@ impl IndexMerger {
|
||||
///
|
||||
/// # Returns
|
||||
/// The number of documents in the resulting segment.
|
||||
pub fn write(&self, serializer: SegmentSerializer) -> crate::Result<u32> {
|
||||
pub fn write(&self, mut serializer: SegmentSerializer) -> crate::Result<u32> {
|
||||
let doc_id_mapping = if let Some(sort_by_field) = self.index_settings.sort_by_field.as_ref()
|
||||
{
|
||||
if self.is_disjunct_and_sorted_on_sort_property(sort_by_field)? {
|
||||
@@ -956,31 +958,6 @@ impl IndexMerger {
|
||||
} else {
|
||||
self.get_doc_id_from_concatenated_data()?
|
||||
};
|
||||
self.write_with_mapping(serializer, doc_id_mapping)
|
||||
}
|
||||
|
||||
/// Like [`write`], but uses the caller-supplied `doc_id_mapping` instead of
|
||||
/// deriving one from an index sort field.
|
||||
///
|
||||
/// The mapping must cover *all* live documents across every segment passed to
|
||||
/// [`IndexMerger::open_with_custom_alive_set`]. The simplest way to build one
|
||||
/// is [`SegmentDocIdMapping::new_shuffled`].
|
||||
///
|
||||
/// # Returns
|
||||
/// The number of documents in the resulting segment.
|
||||
pub fn write_with_doc_id_mapping(
|
||||
&self,
|
||||
serializer: SegmentSerializer,
|
||||
doc_id_mapping: SegmentDocIdMapping,
|
||||
) -> crate::Result<u32> {
|
||||
self.write_with_mapping(serializer, doc_id_mapping)
|
||||
}
|
||||
|
||||
fn write_with_mapping(
|
||||
&self,
|
||||
mut serializer: SegmentSerializer,
|
||||
doc_id_mapping: SegmentDocIdMapping,
|
||||
) -> crate::Result<u32> {
|
||||
debug!("write-fieldnorms");
|
||||
if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
|
||||
self.write_fieldnorms(fieldnorms_serializer, &doc_id_mapping)?;
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
pub(crate) mod delete_queue;
|
||||
pub(crate) mod path_to_unordered_id;
|
||||
|
||||
pub mod doc_id_mapping;
|
||||
pub(crate) mod doc_id_mapping;
|
||||
mod doc_opstamp_mapping;
|
||||
mod flat_map_with_buffer;
|
||||
pub(crate) mod index_writer;
|
||||
@@ -17,8 +17,7 @@ pub(crate) mod indexing_term;
|
||||
mod log_merge_policy;
|
||||
mod merge_operation;
|
||||
pub(crate) mod merge_policy;
|
||||
/// Segment merger: combines multiple segments into one.
|
||||
pub mod merger;
|
||||
pub(crate) mod merger;
|
||||
mod merger_sorted_index_test;
|
||||
pub(crate) mod operation;
|
||||
pub(crate) mod prepared_commit;
|
||||
@@ -34,19 +33,15 @@ mod stamper;
|
||||
use crossbeam_channel as channel;
|
||||
use smallvec::SmallVec;
|
||||
|
||||
pub use self::doc_id_mapping::SegmentDocIdMapping;
|
||||
pub use self::index_writer::{advance_deletes, IndexWriter, IndexWriterOptions};
|
||||
pub use self::log_merge_policy::LogMergePolicy;
|
||||
pub use self::merge_operation::MergeOperation;
|
||||
pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
|
||||
pub use self::merger::IndexMerger;
|
||||
pub use self::operation::{AddOperation, DeleteOperation, UserOperation};
|
||||
pub use self::prepared_commit::PreparedCommit;
|
||||
pub use self::segment_entry::SegmentEntry;
|
||||
pub(crate) use self::segment_serializer::SegmentSerializer;
|
||||
pub use self::segment_updater::{
|
||||
merge_filtered_segments, merge_indices, merge_segments_with_doc_id_mapping,
|
||||
};
|
||||
pub use self::segment_updater::{merge_filtered_segments, merge_indices};
|
||||
pub use self::segment_writer::SegmentWriter;
|
||||
pub use self::single_segment_index_writer::SingleSegmentIndexWriter;
|
||||
|
||||
|
||||
@@ -15,7 +15,6 @@ use crate::directory::{Directory, DirectoryClone, GarbageCollectionResult};
|
||||
use crate::fastfield::AliveBitSet;
|
||||
use crate::index::{Index, IndexMeta, IndexSettings, Segment, SegmentId, SegmentMeta};
|
||||
use crate::indexer::delete_queue::DeleteCursor;
|
||||
use crate::indexer::doc_id_mapping::SegmentDocIdMapping;
|
||||
use crate::indexer::index_writer::advance_deletes;
|
||||
use crate::indexer::merge_operation::MergeOperationInventory;
|
||||
use crate::indexer::merger::IndexMerger;
|
||||
@@ -256,82 +255,6 @@ pub fn merge_filtered_segments<T: Into<Box<dyn Directory>>>(
|
||||
Ok(merged_index)
|
||||
}
|
||||
|
||||
/// Like [`merge_filtered_segments`], but uses a caller-supplied [`SegmentDocIdMapping`]
|
||||
/// to control the final document order. The mapping should be built from the same
|
||||
/// segments (in the same order) passed here.
|
||||
///
|
||||
/// Use this to apply an external reordering during a merge without relying on a persistent fast field.
|
||||
///
|
||||
/// # Warning
|
||||
/// Same caveats as [`merge_filtered_segments`]: no live `IndexWriter` allowed.
|
||||
#[doc(hidden)]
|
||||
pub fn merge_segments_with_doc_id_mapping<T: Into<Box<dyn Directory>>>(
|
||||
segments: &[Segment],
|
||||
target_settings: IndexSettings,
|
||||
filter_doc_ids: Vec<Option<AliveBitSet>>,
|
||||
doc_id_mapping: SegmentDocIdMapping,
|
||||
output_directory: T,
|
||||
) -> crate::Result<Index> {
|
||||
if segments.is_empty() {
|
||||
return Err(crate::TantivyError::InvalidArgument(
|
||||
"No segments given to merge".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
let target_schema = segments[0].schema();
|
||||
|
||||
if segments
|
||||
.iter()
|
||||
.skip(1)
|
||||
.any(|seg| seg.schema() != target_schema)
|
||||
{
|
||||
return Err(crate::TantivyError::InvalidArgument(
|
||||
"Attempt to merge different schema indices".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
let mut merged_index = Index::create(
|
||||
output_directory,
|
||||
target_schema.clone(),
|
||||
target_settings.clone(),
|
||||
)?;
|
||||
let merged_segment = merged_index.new_segment();
|
||||
let merged_segment_id = merged_segment.id();
|
||||
let merger: IndexMerger = IndexMerger::open_with_custom_alive_set(
|
||||
merged_index.schema(),
|
||||
merged_index.settings().clone(),
|
||||
segments,
|
||||
filter_doc_ids,
|
||||
)?;
|
||||
let segment_serializer = SegmentSerializer::for_segment(merged_segment, true)?;
|
||||
let num_docs = merger.write_with_doc_id_mapping(segment_serializer, doc_id_mapping)?;
|
||||
|
||||
let segment_meta = merged_index.new_segment_meta(merged_segment_id, num_docs);
|
||||
|
||||
let stats = format!(
|
||||
"Segments Merge (external reordering): [{}]",
|
||||
segments
|
||||
.iter()
|
||||
.fold(String::new(), |sum, current| format!(
|
||||
"{sum}{} ",
|
||||
current.meta().id().uuid_string()
|
||||
))
|
||||
.trim_end()
|
||||
);
|
||||
|
||||
let index_meta = IndexMeta {
|
||||
index_settings: target_settings,
|
||||
segments: vec![segment_meta],
|
||||
schema: target_schema,
|
||||
opstamp: 0u64,
|
||||
payload: Some(stats),
|
||||
};
|
||||
|
||||
save_metas(&index_meta, merged_index.directory_mut())?;
|
||||
|
||||
Ok(merged_index)
|
||||
}
|
||||
|
||||
pub(crate) struct InnerSegmentUpdater {
|
||||
// we keep a copy of the current active IndexMeta to
|
||||
// avoid loading the file every time we need it in the
|
||||
|
||||
@@ -229,10 +229,7 @@ pub use crate::index::{
|
||||
Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, InvertedIndexReader, Order,
|
||||
Segment, SegmentMeta, SegmentReader,
|
||||
};
|
||||
pub use crate::indexer::{
|
||||
IndexMerger, IndexWriter, SegmentDocIdMapping, SingleSegmentIndexWriter,
|
||||
merge_segments_with_doc_id_mapping,
|
||||
};
|
||||
pub use crate::indexer::{IndexWriter, SingleSegmentIndexWriter};
|
||||
pub use crate::schema::{Document, TantivyDocument, Term};
|
||||
|
||||
/// Index format version.
|
||||
|
||||
@@ -9,8 +9,11 @@ const POSITION_END: u32 = 0;
|
||||
|
||||
#[derive(Default)]
|
||||
pub(crate) struct BufferLender {
|
||||
buffer_u8: Vec<u8>,
|
||||
buffer_u32: Vec<u32>,
|
||||
pub buffer_u8: Vec<u8>,
|
||||
pub buffer_u32: Vec<u32>,
|
||||
pub doc_id_and_tf: Vec<(u32, u32)>,
|
||||
pub buffer_positions_flat: Vec<u32>,
|
||||
pub doc_id_and_offsets: Vec<(u32, u32, u32)>,
|
||||
}
|
||||
|
||||
impl BufferLender {
|
||||
@@ -198,11 +201,14 @@ impl Recorder for TermFrequencyRecorder {
|
||||
serializer: &mut FieldSerializer<'_>,
|
||||
buffer_lender: &mut BufferLender,
|
||||
) {
|
||||
let buffer = buffer_lender.lend_u8();
|
||||
self.stack.read_to_end(arena, buffer);
|
||||
let mut u32_it = VInt32Reader::new(&buffer[..]);
|
||||
if let Some(doc_id_map) = doc_id_map {
|
||||
let mut doc_id_and_tf = vec![];
|
||||
buffer_lender.buffer_u8.clear();
|
||||
buffer_lender.doc_id_and_tf.clear();
|
||||
let buffer = &mut buffer_lender.buffer_u8;
|
||||
self.stack.read_to_end(arena, buffer);
|
||||
let mut u32_it = VInt32Reader::new(&buffer[..]);
|
||||
|
||||
let doc_id_and_tf = &mut buffer_lender.doc_id_and_tf;
|
||||
let mut prev_doc = 0;
|
||||
while let Some(delta_doc_id) = u32_it.next() {
|
||||
let doc_id = prev_doc + delta_doc_id;
|
||||
@@ -212,10 +218,13 @@ impl Recorder for TermFrequencyRecorder {
|
||||
}
|
||||
doc_id_and_tf.sort_unstable_by_key(|&(doc_id, _)| doc_id);
|
||||
|
||||
for (doc_id, tf) in doc_id_and_tf {
|
||||
for &(doc_id, tf) in doc_id_and_tf.iter() {
|
||||
serializer.write_doc(doc_id, tf, &[][..]);
|
||||
}
|
||||
} else {
|
||||
let buffer = buffer_lender.lend_u8();
|
||||
self.stack.read_to_end(arena, buffer);
|
||||
let mut u32_it = VInt32Reader::new(&buffer[..]);
|
||||
let mut prev_doc = 0;
|
||||
while let Some(delta_doc_id) = u32_it.next() {
|
||||
let doc_id = prev_doc + delta_doc_id;
|
||||
@@ -272,42 +281,78 @@ impl Recorder for TfAndPositionRecorder {
|
||||
serializer: &mut FieldSerializer<'_>,
|
||||
buffer_lender: &mut BufferLender,
|
||||
) {
|
||||
let (buffer_u8, buffer_positions) = buffer_lender.lend_all();
|
||||
self.stack.read_to_end(arena, buffer_u8);
|
||||
let mut u32_it = VInt32Reader::new(&buffer_u8[..]);
|
||||
let mut doc_id_and_positions = vec![];
|
||||
let mut prev_doc = 0;
|
||||
while let Some(delta_doc_id) = u32_it.next() {
|
||||
let doc_id = prev_doc + delta_doc_id;
|
||||
prev_doc = doc_id;
|
||||
let mut prev_position_plus_one = 1u32;
|
||||
buffer_positions.clear();
|
||||
loop {
|
||||
match u32_it.next() {
|
||||
Some(POSITION_END) | None => {
|
||||
break;
|
||||
}
|
||||
Some(position_plus_one) => {
|
||||
let delta_position = position_plus_one - prev_position_plus_one;
|
||||
buffer_positions.push(delta_position);
|
||||
prev_position_plus_one = position_plus_one;
|
||||
if let Some(doc_id_map) = doc_id_map {
|
||||
buffer_lender.buffer_u8.clear();
|
||||
buffer_lender.buffer_positions_flat.clear();
|
||||
buffer_lender.doc_id_and_offsets.clear();
|
||||
|
||||
let buffer_u8 = &mut buffer_lender.buffer_u8;
|
||||
self.stack.read_to_end(arena, buffer_u8);
|
||||
let mut u32_it = VInt32Reader::new(&buffer_u8[..]);
|
||||
|
||||
let buffer_positions_flat = &mut buffer_lender.buffer_positions_flat;
|
||||
let doc_id_and_offsets = &mut buffer_lender.doc_id_and_offsets;
|
||||
|
||||
let mut prev_doc = 0;
|
||||
while let Some(delta_doc_id) = u32_it.next() {
|
||||
let doc_id = prev_doc + delta_doc_id;
|
||||
prev_doc = doc_id;
|
||||
|
||||
let start_offset = buffer_positions_flat.len() as u32;
|
||||
let mut prev_position_plus_one = 1u32;
|
||||
|
||||
loop {
|
||||
match u32_it.next() {
|
||||
Some(POSITION_END) | None => {
|
||||
break;
|
||||
}
|
||||
Some(position_plus_one) => {
|
||||
let delta_position = position_plus_one - prev_position_plus_one;
|
||||
buffer_positions_flat.push(delta_position);
|
||||
prev_position_plus_one = position_plus_one;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let end_offset = buffer_positions_flat.len() as u32;
|
||||
doc_id_and_offsets.push((
|
||||
doc_id_map.get_new_doc_id(doc_id),
|
||||
start_offset,
|
||||
end_offset,
|
||||
));
|
||||
}
|
||||
if let Some(doc_id_map) = doc_id_map {
|
||||
// this simple variant to remap may consume to much memory
|
||||
doc_id_and_positions
|
||||
.push((doc_id_map.get_new_doc_id(doc_id), buffer_positions.to_vec()));
|
||||
} else {
|
||||
|
||||
doc_id_and_offsets.sort_unstable_by_key(|&(doc_id, _, _)| doc_id);
|
||||
for &(doc_id, start_offset, end_offset) in doc_id_and_offsets.iter() {
|
||||
let positions =
|
||||
&buffer_positions_flat[(start_offset as usize)..(end_offset as usize)];
|
||||
serializer.write_doc(doc_id, positions.len() as u32, positions);
|
||||
}
|
||||
} else {
|
||||
let (buffer_u8, buffer_positions) = buffer_lender.lend_all();
|
||||
self.stack.read_to_end(arena, buffer_u8);
|
||||
let mut u32_it = VInt32Reader::new(&buffer_u8[..]);
|
||||
let mut prev_doc = 0;
|
||||
while let Some(delta_doc_id) = u32_it.next() {
|
||||
let doc_id = prev_doc + delta_doc_id;
|
||||
prev_doc = doc_id;
|
||||
let mut prev_position_plus_one = 1u32;
|
||||
buffer_positions.clear();
|
||||
loop {
|
||||
match u32_it.next() {
|
||||
Some(POSITION_END) | None => {
|
||||
break;
|
||||
}
|
||||
Some(position_plus_one) => {
|
||||
let delta_position = position_plus_one - prev_position_plus_one;
|
||||
buffer_positions.push(delta_position);
|
||||
prev_position_plus_one = position_plus_one;
|
||||
}
|
||||
}
|
||||
}
|
||||
serializer.write_doc(doc_id, buffer_positions.len() as u32, buffer_positions);
|
||||
}
|
||||
}
|
||||
if doc_id_map.is_some() {
|
||||
doc_id_and_positions.sort_unstable_by_key(|&(doc_id, _)| doc_id);
|
||||
for (doc_id, positions) in doc_id_and_positions {
|
||||
serializer.write_doc(doc_id, positions.len() as u32, &positions);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn term_doc_freq(&self) -> Option<u32> {
|
||||
|
||||
@@ -273,8 +273,14 @@ mod tests {
|
||||
}
|
||||
|
||||
if all_match {
|
||||
let score: Score =
|
||||
leader.score() + secondaries.iter_mut().map(|s| s.score()).sum::<Score>();
|
||||
// Accumulate in the same left-to-right order as the WAND implementation
|
||||
// (leader first, then each secondary in turn). Float addition is not
|
||||
// associative, so `leader + secondaries.sum()` gives a different bit
|
||||
// pattern and can cause spurious nearly_equals failures.
|
||||
let mut score: Score = leader.score();
|
||||
for secondary in secondaries.iter_mut() {
|
||||
score += secondary.score();
|
||||
}
|
||||
|
||||
if score > limit {
|
||||
heap.push(Float(score));
|
||||
@@ -417,6 +423,198 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_block_wand_intersection_three_scorers_regression() {
|
||||
// Minimal failing case found by proptest (CI run 27557430583, job 81460063906).
|
||||
// Posting list 0 spans docs 0–63 (all present, doc 8 has tf=80, doc 26 tf=4, rest tf=1).
|
||||
// Posting lists 1 and 2 are sparse with varying term freqs, and doc 16/64 appear only
|
||||
// in lists 1/2 but not list 0. The high tf=80 on doc 8 of list 0 makes the WAND
|
||||
// upper-bound estimation skip documents that the naive intersection would score.
|
||||
let posting_lists: &[&[(DocId, u32)]] = &[
|
||||
&[
|
||||
(0, 1),
|
||||
(1, 1),
|
||||
(2, 1),
|
||||
(3, 1),
|
||||
(4, 1),
|
||||
(5, 1),
|
||||
(6, 1),
|
||||
(7, 1),
|
||||
(8, 80),
|
||||
(9, 1),
|
||||
(10, 1),
|
||||
(11, 1),
|
||||
(12, 1),
|
||||
(13, 1),
|
||||
(14, 1),
|
||||
(15, 1),
|
||||
(17, 1),
|
||||
(18, 1),
|
||||
(19, 1),
|
||||
(20, 1),
|
||||
(21, 1),
|
||||
(22, 1),
|
||||
(23, 1),
|
||||
(24, 1),
|
||||
(25, 1),
|
||||
(26, 4),
|
||||
(27, 1),
|
||||
(28, 1),
|
||||
(29, 1),
|
||||
(30, 1),
|
||||
(31, 1),
|
||||
(32, 1),
|
||||
(33, 1),
|
||||
(34, 1),
|
||||
(35, 1),
|
||||
(36, 1),
|
||||
(37, 1),
|
||||
(38, 1),
|
||||
(39, 1),
|
||||
(40, 1),
|
||||
(41, 1),
|
||||
(42, 1),
|
||||
(43, 1),
|
||||
(44, 1),
|
||||
(45, 1),
|
||||
(46, 1),
|
||||
(47, 1),
|
||||
(48, 1),
|
||||
(49, 1),
|
||||
(50, 1),
|
||||
(51, 1),
|
||||
(52, 1),
|
||||
(53, 1),
|
||||
(54, 1),
|
||||
(55, 1),
|
||||
(56, 1),
|
||||
(57, 1),
|
||||
(58, 1),
|
||||
(59, 1),
|
||||
(60, 1),
|
||||
(61, 1),
|
||||
(62, 1),
|
||||
(63, 1),
|
||||
],
|
||||
&[
|
||||
(0, 2),
|
||||
(3, 98),
|
||||
(7, 93),
|
||||
(8, 87),
|
||||
(9, 39),
|
||||
(10, 2),
|
||||
(12, 71),
|
||||
(14, 47),
|
||||
(15, 76),
|
||||
(16, 6),
|
||||
(17, 38),
|
||||
(19, 61),
|
||||
(20, 87),
|
||||
(21, 1),
|
||||
(22, 5),
|
||||
(23, 43),
|
||||
(25, 48),
|
||||
(26, 87),
|
||||
(28, 81),
|
||||
(29, 69),
|
||||
(30, 7),
|
||||
(31, 47),
|
||||
(32, 32),
|
||||
(33, 38),
|
||||
(35, 39),
|
||||
(38, 65),
|
||||
(39, 98),
|
||||
(42, 43),
|
||||
(43, 52),
|
||||
(44, 99),
|
||||
(45, 88),
|
||||
(48, 24),
|
||||
(51, 61),
|
||||
(52, 22),
|
||||
(53, 58),
|
||||
(55, 26),
|
||||
(56, 32),
|
||||
(58, 57),
|
||||
(60, 29),
|
||||
(61, 78),
|
||||
(62, 9),
|
||||
(63, 44),
|
||||
(64, 29),
|
||||
],
|
||||
&[
|
||||
(0, 94),
|
||||
(2, 49),
|
||||
(3, 63),
|
||||
(4, 7),
|
||||
(6, 93),
|
||||
(7, 17),
|
||||
(8, 91),
|
||||
(9, 18),
|
||||
(10, 85),
|
||||
(11, 11),
|
||||
(12, 45),
|
||||
(13, 42),
|
||||
(15, 91),
|
||||
(16, 44),
|
||||
(17, 36),
|
||||
(18, 68),
|
||||
(19, 24),
|
||||
(20, 17),
|
||||
(21, 59),
|
||||
(22, 97),
|
||||
(24, 20),
|
||||
(25, 7),
|
||||
(26, 85),
|
||||
(27, 69),
|
||||
(28, 78),
|
||||
(29, 84),
|
||||
(30, 35),
|
||||
(31, 49),
|
||||
(33, 83),
|
||||
(34, 97),
|
||||
(35, 29),
|
||||
(36, 43),
|
||||
(37, 59),
|
||||
(38, 79),
|
||||
(39, 74),
|
||||
(40, 21),
|
||||
(41, 5),
|
||||
(42, 47),
|
||||
(43, 27),
|
||||
(44, 59),
|
||||
(45, 97),
|
||||
(46, 91),
|
||||
(47, 81),
|
||||
(48, 57),
|
||||
(49, 47),
|
||||
(50, 64),
|
||||
(51, 86),
|
||||
(52, 60),
|
||||
(53, 52),
|
||||
(54, 14),
|
||||
(55, 23),
|
||||
(56, 64),
|
||||
(57, 40),
|
||||
(58, 5),
|
||||
(59, 30),
|
||||
(60, 81),
|
||||
(61, 62),
|
||||
(62, 39),
|
||||
(63, 93),
|
||||
(64, 82),
|
||||
],
|
||||
];
|
||||
let fieldnorms: &[u32] = &[
|
||||
624, 668, 725, 670, 851, 169, 537, 627, 200, 757, 51, 272, 835, 89, 750, 63, 272, 406,
|
||||
394, 390, 822, 449, 257, 571, 527, 855, 4, 98, 548, 413, 539, 351, 596, 151, 728, 152,
|
||||
766, 829, 20, 828, 477, 251, 743, 646, 136, 477, 909, 907, 266, 341, 676, 161, 40, 384,
|
||||
347, 707, 42, 397, 482, 814, 801, 528, 465, 410, 171,
|
||||
];
|
||||
let posting_lists_owned: Vec<Vec<(DocId, u32)>> =
|
||||
posting_lists.iter().map(|pl| pl.to_vec()).collect();
|
||||
test_block_wand_intersection_aux(&posting_lists_owned, fieldnorms);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_block_wand_intersection_disjoint() {
|
||||
// Two posting lists with no overlap — intersection is empty.
|
||||
|
||||
Reference in New Issue
Block a user