mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-02 16:40:43 +00:00
* Detect if segments are stackackable with sorting, fixes #1038 Detect if segments are stackable when their data ranges on the sort property are disjunct. Presort segments by thei min value on merge, to enable easier stacking. * move code to function
This commit is contained in:
@@ -255,6 +255,17 @@ pub enum Order {
|
||||
/// Descending Order
|
||||
Desc,
|
||||
}
|
||||
impl Order {
|
||||
/// return if the Order is ascending
|
||||
pub fn is_asc(&self) -> bool {
|
||||
self == &Order::Asc
|
||||
}
|
||||
/// return if the Order is descending
|
||||
pub fn is_desc(&self) -> bool {
|
||||
self == &Order::Desc
|
||||
}
|
||||
}
|
||||
|
||||
/// Meta information about the `Index`.
|
||||
///
|
||||
/// This object is serialized on disk in the `meta.json` file.
|
||||
|
||||
@@ -183,6 +183,10 @@ impl IndexMerger {
|
||||
readers.push(reader);
|
||||
}
|
||||
}
|
||||
if let Some(sort_by_field) = index_settings.sort_by_field.as_ref() {
|
||||
readers = Self::sort_readers_by_min_sort_field(readers, sort_by_field)?;
|
||||
}
|
||||
// sort segments by their natural sort setting
|
||||
if max_doc >= MAX_DOC_LIMIT {
|
||||
let err_msg = format!(
|
||||
"The segment resulting from this merge would have {} docs,\
|
||||
@@ -199,6 +203,30 @@ impl IndexMerger {
|
||||
})
|
||||
}
|
||||
|
||||
fn sort_readers_by_min_sort_field(
|
||||
readers: Vec<SegmentReader>,
|
||||
sort_by_field: &IndexSortByField,
|
||||
) -> crate::Result<Vec<SegmentReader>> {
|
||||
// presort the readers by their min_values, so that when they are disjunct, we can use
|
||||
// the regular merge logic (implicitly sorted)
|
||||
let mut readers_with_min_sort_values = readers
|
||||
.into_iter()
|
||||
.map(|reader| {
|
||||
let accessor = Self::get_sort_field_accessor(&reader, &sort_by_field)?;
|
||||
Ok((reader, accessor.min_value()))
|
||||
})
|
||||
.collect::<crate::Result<Vec<_>>>()?;
|
||||
if sort_by_field.order.is_asc() {
|
||||
readers_with_min_sort_values.sort_by_key(|(_, min_val)| *min_val);
|
||||
} else {
|
||||
readers_with_min_sort_values.sort_by_key(|(_, min_val)| std::cmp::Reverse(*min_val));
|
||||
}
|
||||
Ok(readers_with_min_sort_values
|
||||
.into_iter()
|
||||
.map(|(reader, _)| reader)
|
||||
.collect())
|
||||
}
|
||||
|
||||
fn write_fieldnorms(
|
||||
&self,
|
||||
mut fieldnorms_serializer: FieldNormsSerializer,
|
||||
@@ -355,6 +383,55 @@ impl IndexMerger {
|
||||
}
|
||||
}
|
||||
|
||||
/// Checks if the readers are disjunct for their sort property and in the correct order to be
|
||||
/// able to just stack them.
|
||||
pub(crate) fn is_disjunct_and_sorted_on_sort_property(
|
||||
&self,
|
||||
sort_by_field: &IndexSortByField,
|
||||
) -> crate::Result<bool> {
|
||||
let reader_and_field_accessors = self.get_reader_with_sort_field_accessor(sort_by_field)?;
|
||||
|
||||
let everything_is_in_order = reader_and_field_accessors
|
||||
.into_iter()
|
||||
.map(|reader| reader.1)
|
||||
.tuple_windows()
|
||||
.all(|(field_accessor1, field_accessor2)| {
|
||||
if sort_by_field.order.is_asc() {
|
||||
return field_accessor1.max_value() <= field_accessor2.min_value();
|
||||
} else {
|
||||
return field_accessor1.min_value() >= field_accessor2.max_value();
|
||||
}
|
||||
});
|
||||
Ok(everything_is_in_order)
|
||||
}
|
||||
|
||||
pub(crate) fn get_sort_field_accessor<'a, 'b>(
|
||||
reader: &SegmentReader,
|
||||
sort_by_field: &'b IndexSortByField,
|
||||
) -> crate::Result<FastFieldReader<u64>> {
|
||||
let field_id = expect_field_id_for_sort_field(&reader.schema(), &sort_by_field)?; // for now expect fastfield, but not strictly required
|
||||
let value_accessor = reader.fast_fields().u64_lenient(field_id)?;
|
||||
Ok(value_accessor)
|
||||
}
|
||||
/// Collecting value_accessors into a vec to bind the lifetime.
|
||||
pub(crate) fn get_reader_with_sort_field_accessor<'a, 'b>(
|
||||
&'a self,
|
||||
sort_by_field: &'b IndexSortByField,
|
||||
) -> crate::Result<Vec<(SegmentReaderWithOrdinal<'a>, FastFieldReader<u64>)>> {
|
||||
let reader_and_field_accessors = self
|
||||
.readers
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(Into::into)
|
||||
.map(|reader_with_ordinal: SegmentReaderWithOrdinal| {
|
||||
let value_accessor =
|
||||
Self::get_sort_field_accessor(reader_with_ordinal.reader, sort_by_field)?;
|
||||
Ok((reader_with_ordinal, value_accessor))
|
||||
})
|
||||
.collect::<crate::Result<Vec<_>>>()?;
|
||||
Ok(reader_and_field_accessors)
|
||||
}
|
||||
|
||||
/// Generates the doc_id mapping where position in the vec=new
|
||||
/// doc_id.
|
||||
/// ReaderWithOrdinal will include the ordinal position of the
|
||||
@@ -363,24 +440,8 @@ impl IndexMerger {
|
||||
&self,
|
||||
sort_by_field: &IndexSortByField,
|
||||
) -> crate::Result<Vec<(DocId, SegmentReaderWithOrdinal)>> {
|
||||
let reader_and_field_accessors = self
|
||||
.readers
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|reader| {
|
||||
let reader_with_ordinal: SegmentReaderWithOrdinal = reader.into();
|
||||
let field_id = expect_field_id_for_sort_field(
|
||||
&reader_with_ordinal.reader.schema(),
|
||||
&sort_by_field,
|
||||
)?; // for now expect fastfield, but not strictly required
|
||||
let value_accessor = reader_with_ordinal
|
||||
.reader
|
||||
.fast_fields()
|
||||
.u64_lenient(field_id)?;
|
||||
Ok((reader_with_ordinal, value_accessor))
|
||||
})
|
||||
.collect::<crate::Result<Vec<_>>>()?; // Collecting to bind the lifetime of value_accessor into the vec, or can't be used as a reference.
|
||||
// Loading the field accessor on demand causes a 15x regression
|
||||
let reader_and_field_accessors = self.get_reader_with_sort_field_accessor(sort_by_field)?;
|
||||
// Loading the field accessor on demand causes a 15x regression
|
||||
|
||||
// create iterators over segment/sort_accessor/doc_id tuple
|
||||
let doc_id_reader_pair = reader_and_field_accessors
|
||||
@@ -965,7 +1026,13 @@ impl SerializableSegment for IndexMerger {
|
||||
) -> crate::Result<u32> {
|
||||
let doc_id_mapping = if let Some(sort_by_field) = self.index_settings.sort_by_field.as_ref()
|
||||
{
|
||||
Some(self.generate_doc_id_mapping(sort_by_field)?)
|
||||
// If the documents are already sorted and stackable, we ignore the mapping and execute
|
||||
// it as if there was no sorting
|
||||
if self.is_disjunct_and_sorted_on_sort_property(sort_by_field)? {
|
||||
None
|
||||
} else {
|
||||
Some(self.generate_doc_id_mapping(sort_by_field)?)
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
@@ -1477,31 +1544,61 @@ mod tests {
|
||||
}
|
||||
#[test]
|
||||
fn test_merge_facets_sort_none() {
|
||||
test_merge_facets(None)
|
||||
test_merge_facets(None, true)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_facets_sort_asc() {
|
||||
// the data is already sorted asc, so this should have no effect, but go through the docid
|
||||
// mapping code
|
||||
test_merge_facets(Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "intval".to_string(),
|
||||
order: Order::Asc,
|
||||
// In the merge case this will go through the docid mapping code
|
||||
test_merge_facets(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "intval".to_string(),
|
||||
order: Order::Desc,
|
||||
}),
|
||||
}),
|
||||
}));
|
||||
true,
|
||||
);
|
||||
// In the merge case this will not go through the docid mapping code, because the data is
|
||||
// sorted and disjunct
|
||||
test_merge_facets(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "intval".to_string(),
|
||||
order: Order::Desc,
|
||||
}),
|
||||
}),
|
||||
false,
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_facets_sort_desc() {
|
||||
test_merge_facets(Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "intval".to_string(),
|
||||
order: Order::Desc,
|
||||
// In the merge case this will go through the docid mapping code
|
||||
test_merge_facets(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "intval".to_string(),
|
||||
order: Order::Desc,
|
||||
}),
|
||||
}),
|
||||
}));
|
||||
true,
|
||||
);
|
||||
// In the merge case this will not go through the docid mapping code, because the data is
|
||||
// sorted and disjunct
|
||||
test_merge_facets(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "intval".to_string(),
|
||||
order: Order::Desc,
|
||||
}),
|
||||
}),
|
||||
false,
|
||||
);
|
||||
}
|
||||
fn test_merge_facets(index_settings: Option<IndexSettings>) {
|
||||
// force_segment_value_overlap forces the int value for sorting to have overlapping min and max
|
||||
// ranges between segments so that merge algorithm can't apply certain optimizations
|
||||
fn test_merge_facets(index_settings: Option<IndexSettings>, force_segment_value_overlap: bool) {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
|
||||
let int_options = IntOptions::default()
|
||||
@@ -1518,32 +1615,47 @@ mod tests {
|
||||
let mut int_val = 0;
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
let mut index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| {
|
||||
let mut doc = Document::default();
|
||||
for facet in doc_facets {
|
||||
doc.add_facet(facet_field, Facet::from(facet));
|
||||
}
|
||||
doc.add_u64(int_field, int_val);
|
||||
int_val += 1;
|
||||
index_writer.add_document(doc);
|
||||
};
|
||||
let index_doc =
|
||||
|index_writer: &mut IndexWriter, doc_facets: &[&str], int_val: &mut u64| {
|
||||
let mut doc = Document::default();
|
||||
for facet in doc_facets {
|
||||
doc.add_facet(facet_field, Facet::from(facet));
|
||||
}
|
||||
doc.add_u64(int_field, *int_val);
|
||||
*int_val += 1;
|
||||
index_writer.add_document(doc);
|
||||
};
|
||||
|
||||
index_doc(&mut index_writer, &["/top/a/firstdoc", "/top/b"]);
|
||||
index_doc(&mut index_writer, &["/top/a/firstdoc", "/top/b", "/top/c"]);
|
||||
index_doc(&mut index_writer, &["/top/a", "/top/b"]);
|
||||
index_doc(&mut index_writer, &["/top/a"]);
|
||||
index_doc(
|
||||
&mut index_writer,
|
||||
&["/top/a/firstdoc", "/top/b"],
|
||||
&mut int_val,
|
||||
);
|
||||
index_doc(
|
||||
&mut index_writer,
|
||||
&["/top/a/firstdoc", "/top/b", "/top/c"],
|
||||
&mut int_val,
|
||||
);
|
||||
index_doc(&mut index_writer, &["/top/a", "/top/b"], &mut int_val);
|
||||
index_doc(&mut index_writer, &["/top/a"], &mut int_val);
|
||||
|
||||
index_doc(&mut index_writer, &["/top/b", "/top/d"]);
|
||||
index_doc(&mut index_writer, &["/top/d"]);
|
||||
index_doc(&mut index_writer, &["/top/e"]);
|
||||
index_doc(&mut index_writer, &["/top/b", "/top/d"], &mut int_val);
|
||||
if force_segment_value_overlap {
|
||||
index_doc(&mut index_writer, &["/top/d"], &mut 0);
|
||||
index_doc(&mut index_writer, &["/top/e"], &mut 10);
|
||||
index_writer.commit().expect("committed");
|
||||
index_doc(&mut index_writer, &["/top/a"], &mut 5); // 5 is between 0 - 10 so the segments don' have disjunct ranges
|
||||
} else {
|
||||
index_doc(&mut index_writer, &["/top/d"], &mut int_val);
|
||||
index_doc(&mut index_writer, &["/top/e"], &mut int_val);
|
||||
index_writer.commit().expect("committed");
|
||||
index_doc(&mut index_writer, &["/top/a"], &mut int_val);
|
||||
}
|
||||
index_doc(&mut index_writer, &["/top/b"], &mut int_val);
|
||||
index_doc(&mut index_writer, &["/top/c"], &mut int_val);
|
||||
index_writer.commit().expect("committed");
|
||||
|
||||
index_doc(&mut index_writer, &["/top/a"]);
|
||||
index_doc(&mut index_writer, &["/top/b"]);
|
||||
index_doc(&mut index_writer, &["/top/c"]);
|
||||
index_writer.commit().expect("committed");
|
||||
|
||||
index_doc(&mut index_writer, &["/top/e", "/top/f"]);
|
||||
index_doc(&mut index_writer, &["/top/e", "/top/f"], &mut int_val);
|
||||
index_writer.commit().expect("committed");
|
||||
}
|
||||
|
||||
|
||||
@@ -39,6 +39,7 @@ mod tests {
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
|
||||
index_writer.add_document(doc!(int_field=>3_u64, facet_field=> Facet::from("/crime")));
|
||||
index_writer.add_document(doc!(int_field=>6_u64, facet_field=> Facet::from("/crime")));
|
||||
|
||||
assert!(index_writer.commit().is_ok());
|
||||
index_writer.add_document(doc!(int_field=>5_u64, facet_field=> Facet::from("/fanta")));
|
||||
@@ -58,7 +59,12 @@ mod tests {
|
||||
index
|
||||
}
|
||||
|
||||
fn create_test_index(index_settings: Option<IndexSettings>) -> Index {
|
||||
// force_disjunct_segment_sort_values forces the field, by which the index is sorted have disjunct
|
||||
// ranges between segments, e.g. values in segment [1-3] [10 - 20] [50 - 500]
|
||||
fn create_test_index(
|
||||
index_settings: Option<IndexSettings>,
|
||||
force_disjunct_segment_sort_values: bool,
|
||||
) -> Index {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let int_options = IntOptions::default()
|
||||
.set_fast(Cardinality::SingleValue)
|
||||
@@ -92,6 +98,7 @@ mod tests {
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
|
||||
// segment 1 - range 1-3
|
||||
index_writer.add_document(doc!(int_field=>1_u64));
|
||||
index_writer.add_document(
|
||||
doc!(int_field=>3_u64, multi_numbers => 3_u64, multi_numbers => 4_u64, bytes_field => vec![1, 2, 3], text_field => "some text", facet_field=> Facet::from("/book/crime")),
|
||||
@@ -102,13 +109,26 @@ mod tests {
|
||||
);
|
||||
|
||||
assert!(index_writer.commit().is_ok());
|
||||
// segment 2 - range 1-20 , with force_disjunct_segment_sort_values 10-20
|
||||
index_writer.add_document(doc!(int_field=>20_u64, multi_numbers => 20_u64));
|
||||
index_writer.add_document(doc!(int_field=>1_u64, text_field=> "deleteme", facet_field=> Facet::from("/book/crime")));
|
||||
|
||||
let in_val = if force_disjunct_segment_sort_values {
|
||||
10_u64
|
||||
} else {
|
||||
1
|
||||
};
|
||||
index_writer.add_document(doc!(int_field=>in_val, text_field=> "deleteme", facet_field=> Facet::from("/book/crime")));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
index_writer.add_document(
|
||||
doc!(int_field=>10_u64, multi_numbers => 10_u64, multi_numbers => 11_u64, text_field=> "blubber", facet_field=> Facet::from("/book/fantasy")),
|
||||
// segment 3 - range 5-1000, with force_disjunct_segment_sort_values 50-1000
|
||||
let int_vals = if force_disjunct_segment_sort_values {
|
||||
[100_u64, 50]
|
||||
} else {
|
||||
[10, 5]
|
||||
};
|
||||
index_writer.add_document( // position of this doc after delete in desc sorting = [2], in disjunct case [1]
|
||||
doc!(int_field=>int_vals[0], multi_numbers => 10_u64, multi_numbers => 11_u64, text_field=> "blubber", facet_field=> Facet::from("/book/fantasy")),
|
||||
);
|
||||
index_writer.add_document(doc!(int_field=>5_u64, text_field=> "deleteme"));
|
||||
index_writer.add_document(doc!(int_field=>int_vals[1], text_field=> "deleteme"));
|
||||
index_writer.add_document(
|
||||
doc!(int_field=>1_000u64, multi_numbers => 1001_u64, multi_numbers => 1002_u64, bytes_field => vec![5, 5],text_field => "the biggest num")
|
||||
);
|
||||
@@ -140,13 +160,24 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_sorted_index_desc() {
|
||||
let index = create_test_index(Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "intval".to_string(),
|
||||
order: Order::Desc,
|
||||
fn test_merge_sorted_index_desc_not_disjunct() {
|
||||
test_merge_sorted_index_desc_(false);
|
||||
}
|
||||
#[test]
|
||||
fn test_merge_sorted_index_desc_disjunct() {
|
||||
test_merge_sorted_index_desc_(true);
|
||||
}
|
||||
|
||||
fn test_merge_sorted_index_desc_(force_disjunct_segment_sort_values: bool) {
|
||||
let index = create_test_index(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "intval".to_string(),
|
||||
order: Order::Desc,
|
||||
}),
|
||||
}),
|
||||
}));
|
||||
force_disjunct_segment_sort_values,
|
||||
);
|
||||
|
||||
let int_field = index.schema().get_field("intval").unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
@@ -160,8 +191,13 @@ mod tests {
|
||||
assert_eq!(fast_field.get(5u32), 1u64);
|
||||
assert_eq!(fast_field.get(4u32), 2u64);
|
||||
assert_eq!(fast_field.get(3u32), 3u64);
|
||||
assert_eq!(fast_field.get(2u32), 10u64);
|
||||
assert_eq!(fast_field.get(1u32), 20u64);
|
||||
if force_disjunct_segment_sort_values {
|
||||
assert_eq!(fast_field.get(2u32), 20u64);
|
||||
assert_eq!(fast_field.get(1u32), 100u64);
|
||||
} else {
|
||||
assert_eq!(fast_field.get(2u32), 10u64);
|
||||
assert_eq!(fast_field.get(1u32), 20u64);
|
||||
}
|
||||
assert_eq!(fast_field.get(0u32), 1_000u64);
|
||||
|
||||
// test new field norm mapping
|
||||
@@ -169,8 +205,13 @@ mod tests {
|
||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap();
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(0), 3); // the biggest num
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(2), 1); // blubber
|
||||
if force_disjunct_segment_sort_values {
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(1), 1); // blubber
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(2), 0);
|
||||
} else {
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(2), 1); // blubber
|
||||
}
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(3), 2); // some text
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(5), 0);
|
||||
}
|
||||
@@ -191,13 +232,22 @@ mod tests {
|
||||
};
|
||||
|
||||
assert_eq!(do_search("some"), vec![3]);
|
||||
assert_eq!(do_search("blubber"), vec![2]);
|
||||
if force_disjunct_segment_sort_values {
|
||||
assert_eq!(do_search("blubber"), vec![1]);
|
||||
} else {
|
||||
assert_eq!(do_search("blubber"), vec![2]);
|
||||
}
|
||||
assert_eq!(do_search("biggest"), vec![0]);
|
||||
}
|
||||
|
||||
// access doc store
|
||||
{
|
||||
let doc = searcher.doc(DocAddress::new(0, 2)).unwrap();
|
||||
let blubber_pos = if force_disjunct_segment_sort_values {
|
||||
1
|
||||
} else {
|
||||
2
|
||||
};
|
||||
let doc = searcher.doc(DocAddress::new(0, blubber_pos)).unwrap();
|
||||
assert_eq!(
|
||||
doc.get_first(my_text_field).unwrap().text(),
|
||||
Some("blubber")
|
||||
@@ -209,12 +259,15 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_merge_sorted_index_asc() {
|
||||
let index = create_test_index(Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "intval".to_string(),
|
||||
order: Order::Asc,
|
||||
let index = create_test_index(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "intval".to_string(),
|
||||
order: Order::Asc,
|
||||
}),
|
||||
}),
|
||||
}));
|
||||
false,
|
||||
);
|
||||
|
||||
let int_field = index.schema().get_field("intval").unwrap();
|
||||
let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
|
||||
|
||||
Reference in New Issue
Block a user