From 4f8481a1e4ed2bb2335b317d470b1c545e69e1ef Mon Sep 17 00:00:00 2001 From: PSeitz Date: Fri, 21 May 2021 08:23:17 +0200 Subject: [PATCH] Detect if segments are stackackable with sorting, fixes #1038 (#1054) * Detect if segments are stackackable with sorting, fixes #1038 Detect if segments are stackable when their data ranges on the sort property are disjunct. Presort segments by thei min value on merge, to enable easier stacking. * move code to function --- src/core/index_meta.rs | 11 ++ src/indexer/merger.rs | 222 ++++++++++++++++++------ src/indexer/merger_sorted_index_test.rs | 97 ++++++++--- 3 files changed, 253 insertions(+), 77 deletions(-) diff --git a/src/core/index_meta.rs b/src/core/index_meta.rs index 0a9397583..62493f23b 100644 --- a/src/core/index_meta.rs +++ b/src/core/index_meta.rs @@ -255,6 +255,17 @@ pub enum Order { /// Descending Order Desc, } +impl Order { + /// return if the Order is ascending + pub fn is_asc(&self) -> bool { + self == &Order::Asc + } + /// return if the Order is descending + pub fn is_desc(&self) -> bool { + self == &Order::Desc + } +} + /// Meta information about the `Index`. /// /// This object is serialized on disk in the `meta.json` file. diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 33c924635..b1ef2033c 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -183,6 +183,10 @@ impl IndexMerger { readers.push(reader); } } + if let Some(sort_by_field) = index_settings.sort_by_field.as_ref() { + readers = Self::sort_readers_by_min_sort_field(readers, sort_by_field)?; + } + // sort segments by their natural sort setting if max_doc >= MAX_DOC_LIMIT { let err_msg = format!( "The segment resulting from this merge would have {} docs,\ @@ -199,6 +203,30 @@ impl IndexMerger { }) } + fn sort_readers_by_min_sort_field( + readers: Vec, + sort_by_field: &IndexSortByField, + ) -> crate::Result> { + // presort the readers by their min_values, so that when they are disjunct, we can use + // the regular merge logic (implicitly sorted) + let mut readers_with_min_sort_values = readers + .into_iter() + .map(|reader| { + let accessor = Self::get_sort_field_accessor(&reader, &sort_by_field)?; + Ok((reader, accessor.min_value())) + }) + .collect::>>()?; + if sort_by_field.order.is_asc() { + readers_with_min_sort_values.sort_by_key(|(_, min_val)| *min_val); + } else { + readers_with_min_sort_values.sort_by_key(|(_, min_val)| std::cmp::Reverse(*min_val)); + } + Ok(readers_with_min_sort_values + .into_iter() + .map(|(reader, _)| reader) + .collect()) + } + fn write_fieldnorms( &self, mut fieldnorms_serializer: FieldNormsSerializer, @@ -355,6 +383,55 @@ impl IndexMerger { } } + /// Checks if the readers are disjunct for their sort property and in the correct order to be + /// able to just stack them. + pub(crate) fn is_disjunct_and_sorted_on_sort_property( + &self, + sort_by_field: &IndexSortByField, + ) -> crate::Result { + let reader_and_field_accessors = self.get_reader_with_sort_field_accessor(sort_by_field)?; + + let everything_is_in_order = reader_and_field_accessors + .into_iter() + .map(|reader| reader.1) + .tuple_windows() + .all(|(field_accessor1, field_accessor2)| { + if sort_by_field.order.is_asc() { + return field_accessor1.max_value() <= field_accessor2.min_value(); + } else { + return field_accessor1.min_value() >= field_accessor2.max_value(); + } + }); + Ok(everything_is_in_order) + } + + pub(crate) fn get_sort_field_accessor<'a, 'b>( + reader: &SegmentReader, + sort_by_field: &'b IndexSortByField, + ) -> crate::Result> { + let field_id = expect_field_id_for_sort_field(&reader.schema(), &sort_by_field)?; // for now expect fastfield, but not strictly required + let value_accessor = reader.fast_fields().u64_lenient(field_id)?; + Ok(value_accessor) + } + /// Collecting value_accessors into a vec to bind the lifetime. + pub(crate) fn get_reader_with_sort_field_accessor<'a, 'b>( + &'a self, + sort_by_field: &'b IndexSortByField, + ) -> crate::Result, FastFieldReader)>> { + let reader_and_field_accessors = self + .readers + .iter() + .enumerate() + .map(Into::into) + .map(|reader_with_ordinal: SegmentReaderWithOrdinal| { + let value_accessor = + Self::get_sort_field_accessor(reader_with_ordinal.reader, sort_by_field)?; + Ok((reader_with_ordinal, value_accessor)) + }) + .collect::>>()?; + Ok(reader_and_field_accessors) + } + /// Generates the doc_id mapping where position in the vec=new /// doc_id. /// ReaderWithOrdinal will include the ordinal position of the @@ -363,24 +440,8 @@ impl IndexMerger { &self, sort_by_field: &IndexSortByField, ) -> crate::Result> { - let reader_and_field_accessors = self - .readers - .iter() - .enumerate() - .map(|reader| { - let reader_with_ordinal: SegmentReaderWithOrdinal = reader.into(); - let field_id = expect_field_id_for_sort_field( - &reader_with_ordinal.reader.schema(), - &sort_by_field, - )?; // for now expect fastfield, but not strictly required - let value_accessor = reader_with_ordinal - .reader - .fast_fields() - .u64_lenient(field_id)?; - Ok((reader_with_ordinal, value_accessor)) - }) - .collect::>>()?; // Collecting to bind the lifetime of value_accessor into the vec, or can't be used as a reference. - // Loading the field accessor on demand causes a 15x regression + let reader_and_field_accessors = self.get_reader_with_sort_field_accessor(sort_by_field)?; + // Loading the field accessor on demand causes a 15x regression // create iterators over segment/sort_accessor/doc_id tuple let doc_id_reader_pair = reader_and_field_accessors @@ -965,7 +1026,13 @@ impl SerializableSegment for IndexMerger { ) -> crate::Result { let doc_id_mapping = if let Some(sort_by_field) = self.index_settings.sort_by_field.as_ref() { - Some(self.generate_doc_id_mapping(sort_by_field)?) + // If the documents are already sorted and stackable, we ignore the mapping and execute + // it as if there was no sorting + if self.is_disjunct_and_sorted_on_sort_property(sort_by_field)? { + None + } else { + Some(self.generate_doc_id_mapping(sort_by_field)?) + } } else { None }; @@ -1477,31 +1544,61 @@ mod tests { } #[test] fn test_merge_facets_sort_none() { - test_merge_facets(None) + test_merge_facets(None, true) } #[test] fn test_merge_facets_sort_asc() { - // the data is already sorted asc, so this should have no effect, but go through the docid - // mapping code - test_merge_facets(Some(IndexSettings { - sort_by_field: Some(IndexSortByField { - field: "intval".to_string(), - order: Order::Asc, + // In the merge case this will go through the docid mapping code + test_merge_facets( + Some(IndexSettings { + sort_by_field: Some(IndexSortByField { + field: "intval".to_string(), + order: Order::Desc, + }), }), - })); + true, + ); + // In the merge case this will not go through the docid mapping code, because the data is + // sorted and disjunct + test_merge_facets( + Some(IndexSettings { + sort_by_field: Some(IndexSortByField { + field: "intval".to_string(), + order: Order::Desc, + }), + }), + false, + ); } #[test] fn test_merge_facets_sort_desc() { - test_merge_facets(Some(IndexSettings { - sort_by_field: Some(IndexSortByField { - field: "intval".to_string(), - order: Order::Desc, + // In the merge case this will go through the docid mapping code + test_merge_facets( + Some(IndexSettings { + sort_by_field: Some(IndexSortByField { + field: "intval".to_string(), + order: Order::Desc, + }), }), - })); + true, + ); + // In the merge case this will not go through the docid mapping code, because the data is + // sorted and disjunct + test_merge_facets( + Some(IndexSettings { + sort_by_field: Some(IndexSortByField { + field: "intval".to_string(), + order: Order::Desc, + }), + }), + false, + ); } - fn test_merge_facets(index_settings: Option) { + // force_segment_value_overlap forces the int value for sorting to have overlapping min and max + // ranges between segments so that merge algorithm can't apply certain optimizations + fn test_merge_facets(index_settings: Option, force_segment_value_overlap: bool) { let mut schema_builder = schema::Schema::builder(); let facet_field = schema_builder.add_facet_field("facet", INDEXED); let int_options = IntOptions::default() @@ -1518,32 +1615,47 @@ mod tests { let mut int_val = 0; { let mut index_writer = index.writer_for_tests().unwrap(); - let mut index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| { - let mut doc = Document::default(); - for facet in doc_facets { - doc.add_facet(facet_field, Facet::from(facet)); - } - doc.add_u64(int_field, int_val); - int_val += 1; - index_writer.add_document(doc); - }; + let index_doc = + |index_writer: &mut IndexWriter, doc_facets: &[&str], int_val: &mut u64| { + let mut doc = Document::default(); + for facet in doc_facets { + doc.add_facet(facet_field, Facet::from(facet)); + } + doc.add_u64(int_field, *int_val); + *int_val += 1; + index_writer.add_document(doc); + }; - index_doc(&mut index_writer, &["/top/a/firstdoc", "/top/b"]); - index_doc(&mut index_writer, &["/top/a/firstdoc", "/top/b", "/top/c"]); - index_doc(&mut index_writer, &["/top/a", "/top/b"]); - index_doc(&mut index_writer, &["/top/a"]); + index_doc( + &mut index_writer, + &["/top/a/firstdoc", "/top/b"], + &mut int_val, + ); + index_doc( + &mut index_writer, + &["/top/a/firstdoc", "/top/b", "/top/c"], + &mut int_val, + ); + index_doc(&mut index_writer, &["/top/a", "/top/b"], &mut int_val); + index_doc(&mut index_writer, &["/top/a"], &mut int_val); - index_doc(&mut index_writer, &["/top/b", "/top/d"]); - index_doc(&mut index_writer, &["/top/d"]); - index_doc(&mut index_writer, &["/top/e"]); + index_doc(&mut index_writer, &["/top/b", "/top/d"], &mut int_val); + if force_segment_value_overlap { + index_doc(&mut index_writer, &["/top/d"], &mut 0); + index_doc(&mut index_writer, &["/top/e"], &mut 10); + index_writer.commit().expect("committed"); + index_doc(&mut index_writer, &["/top/a"], &mut 5); // 5 is between 0 - 10 so the segments don' have disjunct ranges + } else { + index_doc(&mut index_writer, &["/top/d"], &mut int_val); + index_doc(&mut index_writer, &["/top/e"], &mut int_val); + index_writer.commit().expect("committed"); + index_doc(&mut index_writer, &["/top/a"], &mut int_val); + } + index_doc(&mut index_writer, &["/top/b"], &mut int_val); + index_doc(&mut index_writer, &["/top/c"], &mut int_val); index_writer.commit().expect("committed"); - index_doc(&mut index_writer, &["/top/a"]); - index_doc(&mut index_writer, &["/top/b"]); - index_doc(&mut index_writer, &["/top/c"]); - index_writer.commit().expect("committed"); - - index_doc(&mut index_writer, &["/top/e", "/top/f"]); + index_doc(&mut index_writer, &["/top/e", "/top/f"], &mut int_val); index_writer.commit().expect("committed"); } diff --git a/src/indexer/merger_sorted_index_test.rs b/src/indexer/merger_sorted_index_test.rs index 24fcf6c2a..a02adde46 100644 --- a/src/indexer/merger_sorted_index_test.rs +++ b/src/indexer/merger_sorted_index_test.rs @@ -39,6 +39,7 @@ mod tests { let mut index_writer = index.writer_for_tests().unwrap(); index_writer.add_document(doc!(int_field=>3_u64, facet_field=> Facet::from("/crime"))); + index_writer.add_document(doc!(int_field=>6_u64, facet_field=> Facet::from("/crime"))); assert!(index_writer.commit().is_ok()); index_writer.add_document(doc!(int_field=>5_u64, facet_field=> Facet::from("/fanta"))); @@ -58,7 +59,12 @@ mod tests { index } - fn create_test_index(index_settings: Option) -> Index { + // force_disjunct_segment_sort_values forces the field, by which the index is sorted have disjunct + // ranges between segments, e.g. values in segment [1-3] [10 - 20] [50 - 500] + fn create_test_index( + index_settings: Option, + force_disjunct_segment_sort_values: bool, + ) -> Index { let mut schema_builder = schema::Schema::builder(); let int_options = IntOptions::default() .set_fast(Cardinality::SingleValue) @@ -92,6 +98,7 @@ mod tests { { let mut index_writer = index.writer_for_tests().unwrap(); + // segment 1 - range 1-3 index_writer.add_document(doc!(int_field=>1_u64)); index_writer.add_document( doc!(int_field=>3_u64, multi_numbers => 3_u64, multi_numbers => 4_u64, bytes_field => vec![1, 2, 3], text_field => "some text", facet_field=> Facet::from("/book/crime")), @@ -102,13 +109,26 @@ mod tests { ); assert!(index_writer.commit().is_ok()); + // segment 2 - range 1-20 , with force_disjunct_segment_sort_values 10-20 index_writer.add_document(doc!(int_field=>20_u64, multi_numbers => 20_u64)); - index_writer.add_document(doc!(int_field=>1_u64, text_field=> "deleteme", facet_field=> Facet::from("/book/crime"))); + + let in_val = if force_disjunct_segment_sort_values { + 10_u64 + } else { + 1 + }; + index_writer.add_document(doc!(int_field=>in_val, text_field=> "deleteme", facet_field=> Facet::from("/book/crime"))); assert!(index_writer.commit().is_ok()); - index_writer.add_document( - doc!(int_field=>10_u64, multi_numbers => 10_u64, multi_numbers => 11_u64, text_field=> "blubber", facet_field=> Facet::from("/book/fantasy")), + // segment 3 - range 5-1000, with force_disjunct_segment_sort_values 50-1000 + let int_vals = if force_disjunct_segment_sort_values { + [100_u64, 50] + } else { + [10, 5] + }; + index_writer.add_document( // position of this doc after delete in desc sorting = [2], in disjunct case [1] + doc!(int_field=>int_vals[0], multi_numbers => 10_u64, multi_numbers => 11_u64, text_field=> "blubber", facet_field=> Facet::from("/book/fantasy")), ); - index_writer.add_document(doc!(int_field=>5_u64, text_field=> "deleteme")); + index_writer.add_document(doc!(int_field=>int_vals[1], text_field=> "deleteme")); index_writer.add_document( doc!(int_field=>1_000u64, multi_numbers => 1001_u64, multi_numbers => 1002_u64, bytes_field => vec![5, 5],text_field => "the biggest num") ); @@ -140,13 +160,24 @@ mod tests { } #[test] - fn test_merge_sorted_index_desc() { - let index = create_test_index(Some(IndexSettings { - sort_by_field: Some(IndexSortByField { - field: "intval".to_string(), - order: Order::Desc, + fn test_merge_sorted_index_desc_not_disjunct() { + test_merge_sorted_index_desc_(false); + } + #[test] + fn test_merge_sorted_index_desc_disjunct() { + test_merge_sorted_index_desc_(true); + } + + fn test_merge_sorted_index_desc_(force_disjunct_segment_sort_values: bool) { + let index = create_test_index( + Some(IndexSettings { + sort_by_field: Some(IndexSortByField { + field: "intval".to_string(), + order: Order::Desc, + }), }), - })); + force_disjunct_segment_sort_values, + ); let int_field = index.schema().get_field("intval").unwrap(); let reader = index.reader().unwrap(); @@ -160,8 +191,13 @@ mod tests { assert_eq!(fast_field.get(5u32), 1u64); assert_eq!(fast_field.get(4u32), 2u64); assert_eq!(fast_field.get(3u32), 3u64); - assert_eq!(fast_field.get(2u32), 10u64); - assert_eq!(fast_field.get(1u32), 20u64); + if force_disjunct_segment_sort_values { + assert_eq!(fast_field.get(2u32), 20u64); + assert_eq!(fast_field.get(1u32), 100u64); + } else { + assert_eq!(fast_field.get(2u32), 10u64); + assert_eq!(fast_field.get(1u32), 20u64); + } assert_eq!(fast_field.get(0u32), 1_000u64); // test new field norm mapping @@ -169,8 +205,13 @@ mod tests { let my_text_field = index.schema().get_field("text_field").unwrap(); let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap(); assert_eq!(fieldnorm_reader.fieldnorm(0), 3); // the biggest num - assert_eq!(fieldnorm_reader.fieldnorm(1), 0); - assert_eq!(fieldnorm_reader.fieldnorm(2), 1); // blubber + if force_disjunct_segment_sort_values { + assert_eq!(fieldnorm_reader.fieldnorm(1), 1); // blubber + assert_eq!(fieldnorm_reader.fieldnorm(2), 0); + } else { + assert_eq!(fieldnorm_reader.fieldnorm(1), 0); + assert_eq!(fieldnorm_reader.fieldnorm(2), 1); // blubber + } assert_eq!(fieldnorm_reader.fieldnorm(3), 2); // some text assert_eq!(fieldnorm_reader.fieldnorm(5), 0); } @@ -191,13 +232,22 @@ mod tests { }; assert_eq!(do_search("some"), vec![3]); - assert_eq!(do_search("blubber"), vec![2]); + if force_disjunct_segment_sort_values { + assert_eq!(do_search("blubber"), vec![1]); + } else { + assert_eq!(do_search("blubber"), vec![2]); + } assert_eq!(do_search("biggest"), vec![0]); } // access doc store { - let doc = searcher.doc(DocAddress::new(0, 2)).unwrap(); + let blubber_pos = if force_disjunct_segment_sort_values { + 1 + } else { + 2 + }; + let doc = searcher.doc(DocAddress::new(0, blubber_pos)).unwrap(); assert_eq!( doc.get_first(my_text_field).unwrap().text(), Some("blubber") @@ -209,12 +259,15 @@ mod tests { #[test] fn test_merge_sorted_index_asc() { - let index = create_test_index(Some(IndexSettings { - sort_by_field: Some(IndexSortByField { - field: "intval".to_string(), - order: Order::Asc, + let index = create_test_index( + Some(IndexSettings { + sort_by_field: Some(IndexSortByField { + field: "intval".to_string(), + order: Order::Asc, + }), }), - })); + false, + ); let int_field = index.schema().get_field("intval").unwrap(); let multi_numbers = index.schema().get_field("multi_numbers").unwrap();