diff --git a/src/collector/top_score_collector.rs b/src/collector/top_score_collector.rs index 908126386..1171f81e6 100644 --- a/src/collector/top_score_collector.rs +++ b/src/collector/top_score_collector.rs @@ -1529,6 +1529,72 @@ mod tests { Ok(()) } + proptest! { + #[test] + fn test_top_field_collect_string_prop( + order in prop_oneof!(Just(Order::Desc), Just(Order::Asc)), + limit in 1..256_usize, + offset in 0..256_usize, + segments_terms in + proptest::collection::vec( + proptest::collection::vec(0..32_u8, 1..32_usize), + 0..8_usize, + ) + ) { + let mut schema_builder = Schema::builder(); + let city = schema_builder.add_text_field("city", TEXT | FAST); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut index_writer = index.writer_for_tests()?; + + // A Vec>, where the outer Vec represents segments, and the inner Vec + // represents terms. + for segment_terms in segments_terms.into_iter() { + for term in segment_terms.into_iter() { + let term = format!("{term:0>3}"); + index_writer.add_document(doc!( + city => term, + ))?; + } + index_writer.commit()?; + } + + let searcher = index.reader()?.searcher(); + let top_n_results = searcher.search(&AllQuery, &TopDocs::with_limit(limit) + .and_offset(offset) + .order_by_string_fast_field("city", order.clone()))?; + let all_results = searcher.search(&AllQuery, &DocSetCollector)?.into_iter().map(|doc_address| { + // Get the term for this address. + // NOTE: We can't determine the SegmentIds that will be generated for Segments + // ahead of time, so we can't pre-compute the expected `DocAddress`es. + let column = searcher.segment_readers()[doc_address.segment_ord as usize].fast_fields().str("city").unwrap().unwrap(); + let term_ord = column.term_ords(doc_address.doc_id).next().unwrap(); + let mut city = Vec::new(); + column.dictionary().ord_to_term(term_ord, &mut city).unwrap(); + (String::try_from(city).unwrap(), doc_address) + }); + + // Using the TopDocs collector should always be equivalent to sorting, skipping the + // offset, and then taking the limit. + let sorted_docs: Vec<_> = if order.is_desc() { + let mut comparable_docs: Vec> = + all_results.into_iter().map(|(feature, doc)| ComparableDoc { feature, doc}).collect(); + comparable_docs.sort(); + comparable_docs.into_iter().map(|cd| (cd.feature, cd.doc)).collect() + } else { + let mut comparable_docs: Vec> = + all_results.into_iter().map(|(feature, doc)| ComparableDoc { feature, doc}).collect(); + comparable_docs.sort(); + comparable_docs.into_iter().map(|cd| (cd.feature, cd.doc)).collect() + }; + let expected_docs = sorted_docs.into_iter().skip(offset).take(limit).collect::>(); + prop_assert_eq!( + expected_docs, + top_n_results + ); + } + } + #[test] #[should_panic] fn test_field_does_not_exist() { diff --git a/sstable/src/dictionary.rs b/sstable/src/dictionary.rs index caa7a6bb8..613aec189 100644 --- a/sstable/src/dictionary.rs +++ b/sstable/src/dictionary.rs @@ -515,16 +515,25 @@ impl Dictionary { let mut current_sstable_delta_reader = self.sstable_delta_reader_block(current_block_addr.clone())?; let mut current_ordinal = 0; + let mut prev_ord = None; for ord in ord { - assert!(ord >= current_ordinal); - // check if block changed for new term_ord - let new_block_addr = self.sstable_index.get_block_with_ord(ord); - if new_block_addr != current_block_addr { - current_block_addr = new_block_addr; - current_ordinal = current_block_addr.first_ordinal; - current_sstable_delta_reader = - self.sstable_delta_reader_block(current_block_addr.clone())?; - bytes.clear(); + // only advance forward if the new ord is different than the one we just processed + // + // this allows the input TermOrdinal iterator to contain duplicates, so long as it's + // still sorted + if Some(ord) != prev_ord { + assert!(ord >= current_ordinal); + // check if block changed for new term_ord + let new_block_addr = self.sstable_index.get_block_with_ord(ord); + if new_block_addr != current_block_addr { + current_block_addr = new_block_addr; + current_ordinal = current_block_addr.first_ordinal; + current_sstable_delta_reader = + self.sstable_delta_reader_block(current_block_addr.clone())?; + bytes.clear(); + } + + prev_ord = Some(ord); } // move to ord inside that block