Detect if segments are stackackable with sorting, fixes #1038 (#1054)

* Detect if segments are stackackable with sorting, fixes #1038

Detect if segments are stackable when their data ranges on the sort property are disjunct.
Presort segments by thei min value on merge, to enable easier stacking.

* move code to function
This commit is contained in:
PSeitz
2021-05-21 08:23:17 +02:00
committed by GitHub
parent bcd72e5c14
commit 4f8481a1e4
3 changed files with 253 additions and 77 deletions

View File

@@ -255,6 +255,17 @@ pub enum Order {
/// Descending Order
Desc,
}
impl Order {
/// return if the Order is ascending
pub fn is_asc(&self) -> bool {
self == &Order::Asc
}
/// return if the Order is descending
pub fn is_desc(&self) -> bool {
self == &Order::Desc
}
}
/// Meta information about the `Index`.
///
/// This object is serialized on disk in the `meta.json` file.

View File

@@ -183,6 +183,10 @@ impl IndexMerger {
readers.push(reader);
}
}
if let Some(sort_by_field) = index_settings.sort_by_field.as_ref() {
readers = Self::sort_readers_by_min_sort_field(readers, sort_by_field)?;
}
// sort segments by their natural sort setting
if max_doc >= MAX_DOC_LIMIT {
let err_msg = format!(
"The segment resulting from this merge would have {} docs,\
@@ -199,6 +203,30 @@ impl IndexMerger {
})
}
fn sort_readers_by_min_sort_field(
readers: Vec<SegmentReader>,
sort_by_field: &IndexSortByField,
) -> crate::Result<Vec<SegmentReader>> {
// presort the readers by their min_values, so that when they are disjunct, we can use
// the regular merge logic (implicitly sorted)
let mut readers_with_min_sort_values = readers
.into_iter()
.map(|reader| {
let accessor = Self::get_sort_field_accessor(&reader, &sort_by_field)?;
Ok((reader, accessor.min_value()))
})
.collect::<crate::Result<Vec<_>>>()?;
if sort_by_field.order.is_asc() {
readers_with_min_sort_values.sort_by_key(|(_, min_val)| *min_val);
} else {
readers_with_min_sort_values.sort_by_key(|(_, min_val)| std::cmp::Reverse(*min_val));
}
Ok(readers_with_min_sort_values
.into_iter()
.map(|(reader, _)| reader)
.collect())
}
fn write_fieldnorms(
&self,
mut fieldnorms_serializer: FieldNormsSerializer,
@@ -355,6 +383,55 @@ impl IndexMerger {
}
}
/// Checks if the readers are disjunct for their sort property and in the correct order to be
/// able to just stack them.
pub(crate) fn is_disjunct_and_sorted_on_sort_property(
&self,
sort_by_field: &IndexSortByField,
) -> crate::Result<bool> {
let reader_and_field_accessors = self.get_reader_with_sort_field_accessor(sort_by_field)?;
let everything_is_in_order = reader_and_field_accessors
.into_iter()
.map(|reader| reader.1)
.tuple_windows()
.all(|(field_accessor1, field_accessor2)| {
if sort_by_field.order.is_asc() {
return field_accessor1.max_value() <= field_accessor2.min_value();
} else {
return field_accessor1.min_value() >= field_accessor2.max_value();
}
});
Ok(everything_is_in_order)
}
pub(crate) fn get_sort_field_accessor<'a, 'b>(
reader: &SegmentReader,
sort_by_field: &'b IndexSortByField,
) -> crate::Result<FastFieldReader<u64>> {
let field_id = expect_field_id_for_sort_field(&reader.schema(), &sort_by_field)?; // for now expect fastfield, but not strictly required
let value_accessor = reader.fast_fields().u64_lenient(field_id)?;
Ok(value_accessor)
}
/// Collecting value_accessors into a vec to bind the lifetime.
pub(crate) fn get_reader_with_sort_field_accessor<'a, 'b>(
&'a self,
sort_by_field: &'b IndexSortByField,
) -> crate::Result<Vec<(SegmentReaderWithOrdinal<'a>, FastFieldReader<u64>)>> {
let reader_and_field_accessors = self
.readers
.iter()
.enumerate()
.map(Into::into)
.map(|reader_with_ordinal: SegmentReaderWithOrdinal| {
let value_accessor =
Self::get_sort_field_accessor(reader_with_ordinal.reader, sort_by_field)?;
Ok((reader_with_ordinal, value_accessor))
})
.collect::<crate::Result<Vec<_>>>()?;
Ok(reader_and_field_accessors)
}
/// Generates the doc_id mapping where position in the vec=new
/// doc_id.
/// ReaderWithOrdinal will include the ordinal position of the
@@ -363,24 +440,8 @@ impl IndexMerger {
&self,
sort_by_field: &IndexSortByField,
) -> crate::Result<Vec<(DocId, SegmentReaderWithOrdinal)>> {
let reader_and_field_accessors = self
.readers
.iter()
.enumerate()
.map(|reader| {
let reader_with_ordinal: SegmentReaderWithOrdinal = reader.into();
let field_id = expect_field_id_for_sort_field(
&reader_with_ordinal.reader.schema(),
&sort_by_field,
)?; // for now expect fastfield, but not strictly required
let value_accessor = reader_with_ordinal
.reader
.fast_fields()
.u64_lenient(field_id)?;
Ok((reader_with_ordinal, value_accessor))
})
.collect::<crate::Result<Vec<_>>>()?; // Collecting to bind the lifetime of value_accessor into the vec, or can't be used as a reference.
// Loading the field accessor on demand causes a 15x regression
let reader_and_field_accessors = self.get_reader_with_sort_field_accessor(sort_by_field)?;
// Loading the field accessor on demand causes a 15x regression
// create iterators over segment/sort_accessor/doc_id tuple
let doc_id_reader_pair = reader_and_field_accessors
@@ -965,7 +1026,13 @@ impl SerializableSegment for IndexMerger {
) -> crate::Result<u32> {
let doc_id_mapping = if let Some(sort_by_field) = self.index_settings.sort_by_field.as_ref()
{
Some(self.generate_doc_id_mapping(sort_by_field)?)
// If the documents are already sorted and stackable, we ignore the mapping and execute
// it as if there was no sorting
if self.is_disjunct_and_sorted_on_sort_property(sort_by_field)? {
None
} else {
Some(self.generate_doc_id_mapping(sort_by_field)?)
}
} else {
None
};
@@ -1477,31 +1544,61 @@ mod tests {
}
#[test]
fn test_merge_facets_sort_none() {
test_merge_facets(None)
test_merge_facets(None, true)
}
#[test]
fn test_merge_facets_sort_asc() {
// the data is already sorted asc, so this should have no effect, but go through the docid
// mapping code
test_merge_facets(Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Asc,
// In the merge case this will go through the docid mapping code
test_merge_facets(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
}),
}),
}));
true,
);
// In the merge case this will not go through the docid mapping code, because the data is
// sorted and disjunct
test_merge_facets(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
}),
}),
false,
);
}
#[test]
fn test_merge_facets_sort_desc() {
test_merge_facets(Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
// In the merge case this will go through the docid mapping code
test_merge_facets(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
}),
}),
}));
true,
);
// In the merge case this will not go through the docid mapping code, because the data is
// sorted and disjunct
test_merge_facets(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
}),
}),
false,
);
}
fn test_merge_facets(index_settings: Option<IndexSettings>) {
// force_segment_value_overlap forces the int value for sorting to have overlapping min and max
// ranges between segments so that merge algorithm can't apply certain optimizations
fn test_merge_facets(index_settings: Option<IndexSettings>, force_segment_value_overlap: bool) {
let mut schema_builder = schema::Schema::builder();
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let int_options = IntOptions::default()
@@ -1518,32 +1615,47 @@ mod tests {
let mut int_val = 0;
{
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| {
let mut doc = Document::default();
for facet in doc_facets {
doc.add_facet(facet_field, Facet::from(facet));
}
doc.add_u64(int_field, int_val);
int_val += 1;
index_writer.add_document(doc);
};
let index_doc =
|index_writer: &mut IndexWriter, doc_facets: &[&str], int_val: &mut u64| {
let mut doc = Document::default();
for facet in doc_facets {
doc.add_facet(facet_field, Facet::from(facet));
}
doc.add_u64(int_field, *int_val);
*int_val += 1;
index_writer.add_document(doc);
};
index_doc(&mut index_writer, &["/top/a/firstdoc", "/top/b"]);
index_doc(&mut index_writer, &["/top/a/firstdoc", "/top/b", "/top/c"]);
index_doc(&mut index_writer, &["/top/a", "/top/b"]);
index_doc(&mut index_writer, &["/top/a"]);
index_doc(
&mut index_writer,
&["/top/a/firstdoc", "/top/b"],
&mut int_val,
);
index_doc(
&mut index_writer,
&["/top/a/firstdoc", "/top/b", "/top/c"],
&mut int_val,
);
index_doc(&mut index_writer, &["/top/a", "/top/b"], &mut int_val);
index_doc(&mut index_writer, &["/top/a"], &mut int_val);
index_doc(&mut index_writer, &["/top/b", "/top/d"]);
index_doc(&mut index_writer, &["/top/d"]);
index_doc(&mut index_writer, &["/top/e"]);
index_doc(&mut index_writer, &["/top/b", "/top/d"], &mut int_val);
if force_segment_value_overlap {
index_doc(&mut index_writer, &["/top/d"], &mut 0);
index_doc(&mut index_writer, &["/top/e"], &mut 10);
index_writer.commit().expect("committed");
index_doc(&mut index_writer, &["/top/a"], &mut 5); // 5 is between 0 - 10 so the segments don' have disjunct ranges
} else {
index_doc(&mut index_writer, &["/top/d"], &mut int_val);
index_doc(&mut index_writer, &["/top/e"], &mut int_val);
index_writer.commit().expect("committed");
index_doc(&mut index_writer, &["/top/a"], &mut int_val);
}
index_doc(&mut index_writer, &["/top/b"], &mut int_val);
index_doc(&mut index_writer, &["/top/c"], &mut int_val);
index_writer.commit().expect("committed");
index_doc(&mut index_writer, &["/top/a"]);
index_doc(&mut index_writer, &["/top/b"]);
index_doc(&mut index_writer, &["/top/c"]);
index_writer.commit().expect("committed");
index_doc(&mut index_writer, &["/top/e", "/top/f"]);
index_doc(&mut index_writer, &["/top/e", "/top/f"], &mut int_val);
index_writer.commit().expect("committed");
}

View File

@@ -39,6 +39,7 @@ mod tests {
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(int_field=>3_u64, facet_field=> Facet::from("/crime")));
index_writer.add_document(doc!(int_field=>6_u64, facet_field=> Facet::from("/crime")));
assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>5_u64, facet_field=> Facet::from("/fanta")));
@@ -58,7 +59,12 @@ mod tests {
index
}
fn create_test_index(index_settings: Option<IndexSettings>) -> Index {
// force_disjunct_segment_sort_values forces the field, by which the index is sorted have disjunct
// ranges between segments, e.g. values in segment [1-3] [10 - 20] [50 - 500]
fn create_test_index(
index_settings: Option<IndexSettings>,
force_disjunct_segment_sort_values: bool,
) -> Index {
let mut schema_builder = schema::Schema::builder();
let int_options = IntOptions::default()
.set_fast(Cardinality::SingleValue)
@@ -92,6 +98,7 @@ mod tests {
{
let mut index_writer = index.writer_for_tests().unwrap();
// segment 1 - range 1-3
index_writer.add_document(doc!(int_field=>1_u64));
index_writer.add_document(
doc!(int_field=>3_u64, multi_numbers => 3_u64, multi_numbers => 4_u64, bytes_field => vec![1, 2, 3], text_field => "some text", facet_field=> Facet::from("/book/crime")),
@@ -102,13 +109,26 @@ mod tests {
);
assert!(index_writer.commit().is_ok());
// segment 2 - range 1-20 , with force_disjunct_segment_sort_values 10-20
index_writer.add_document(doc!(int_field=>20_u64, multi_numbers => 20_u64));
index_writer.add_document(doc!(int_field=>1_u64, text_field=> "deleteme", facet_field=> Facet::from("/book/crime")));
let in_val = if force_disjunct_segment_sort_values {
10_u64
} else {
1
};
index_writer.add_document(doc!(int_field=>in_val, text_field=> "deleteme", facet_field=> Facet::from("/book/crime")));
assert!(index_writer.commit().is_ok());
index_writer.add_document(
doc!(int_field=>10_u64, multi_numbers => 10_u64, multi_numbers => 11_u64, text_field=> "blubber", facet_field=> Facet::from("/book/fantasy")),
// segment 3 - range 5-1000, with force_disjunct_segment_sort_values 50-1000
let int_vals = if force_disjunct_segment_sort_values {
[100_u64, 50]
} else {
[10, 5]
};
index_writer.add_document( // position of this doc after delete in desc sorting = [2], in disjunct case [1]
doc!(int_field=>int_vals[0], multi_numbers => 10_u64, multi_numbers => 11_u64, text_field=> "blubber", facet_field=> Facet::from("/book/fantasy")),
);
index_writer.add_document(doc!(int_field=>5_u64, text_field=> "deleteme"));
index_writer.add_document(doc!(int_field=>int_vals[1], text_field=> "deleteme"));
index_writer.add_document(
doc!(int_field=>1_000u64, multi_numbers => 1001_u64, multi_numbers => 1002_u64, bytes_field => vec![5, 5],text_field => "the biggest num")
);
@@ -140,13 +160,24 @@ mod tests {
}
#[test]
fn test_merge_sorted_index_desc() {
let index = create_test_index(Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
fn test_merge_sorted_index_desc_not_disjunct() {
test_merge_sorted_index_desc_(false);
}
#[test]
fn test_merge_sorted_index_desc_disjunct() {
test_merge_sorted_index_desc_(true);
}
fn test_merge_sorted_index_desc_(force_disjunct_segment_sort_values: bool) {
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
}),
}),
}));
force_disjunct_segment_sort_values,
);
let int_field = index.schema().get_field("intval").unwrap();
let reader = index.reader().unwrap();
@@ -160,8 +191,13 @@ mod tests {
assert_eq!(fast_field.get(5u32), 1u64);
assert_eq!(fast_field.get(4u32), 2u64);
assert_eq!(fast_field.get(3u32), 3u64);
assert_eq!(fast_field.get(2u32), 10u64);
assert_eq!(fast_field.get(1u32), 20u64);
if force_disjunct_segment_sort_values {
assert_eq!(fast_field.get(2u32), 20u64);
assert_eq!(fast_field.get(1u32), 100u64);
} else {
assert_eq!(fast_field.get(2u32), 10u64);
assert_eq!(fast_field.get(1u32), 20u64);
}
assert_eq!(fast_field.get(0u32), 1_000u64);
// test new field norm mapping
@@ -169,8 +205,13 @@ mod tests {
let my_text_field = index.schema().get_field("text_field").unwrap();
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap();
assert_eq!(fieldnorm_reader.fieldnorm(0), 3); // the biggest num
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
assert_eq!(fieldnorm_reader.fieldnorm(2), 1); // blubber
if force_disjunct_segment_sort_values {
assert_eq!(fieldnorm_reader.fieldnorm(1), 1); // blubber
assert_eq!(fieldnorm_reader.fieldnorm(2), 0);
} else {
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
assert_eq!(fieldnorm_reader.fieldnorm(2), 1); // blubber
}
assert_eq!(fieldnorm_reader.fieldnorm(3), 2); // some text
assert_eq!(fieldnorm_reader.fieldnorm(5), 0);
}
@@ -191,13 +232,22 @@ mod tests {
};
assert_eq!(do_search("some"), vec![3]);
assert_eq!(do_search("blubber"), vec![2]);
if force_disjunct_segment_sort_values {
assert_eq!(do_search("blubber"), vec![1]);
} else {
assert_eq!(do_search("blubber"), vec![2]);
}
assert_eq!(do_search("biggest"), vec![0]);
}
// access doc store
{
let doc = searcher.doc(DocAddress::new(0, 2)).unwrap();
let blubber_pos = if force_disjunct_segment_sort_values {
1
} else {
2
};
let doc = searcher.doc(DocAddress::new(0, blubber_pos)).unwrap();
assert_eq!(
doc.get_first(my_text_field).unwrap().text(),
Some("blubber")
@@ -209,12 +259,15 @@ mod tests {
#[test]
fn test_merge_sorted_index_asc() {
let index = create_test_index(Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Asc,
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Asc,
}),
}),
}));
false,
);
let int_field = index.schema().get_field("intval").unwrap();
let multi_numbers = index.schema().get_field("multi_numbers").unwrap();