adjust Dictionary::sorted_ords_to_term_cb() to allow duplicates

This commit is contained in:
Eric Ridge
2024-10-28 14:45:09 -04:00
committed by Stu Hood
parent 4e84c70387
commit 2f01152a3c
2 changed files with 84 additions and 9 deletions

View File

@@ -1529,6 +1529,72 @@ mod tests {
Ok(())
}
// Property-based test: the TopDocs collector ordered by a string fast field must
// produce exactly the same (term, DocAddress) list as brute-force collecting every
// document, sorting by term, then applying offset and limit.
// NOTE(review): relies on the `proptest` crate and tantivy test utilities that are
// in scope elsewhere in this file.
proptest! {
#[test]
fn test_top_field_collect_string_prop(
// Randomized sort direction, result-set size, and skip count.
order in prop_oneof!(Just(Order::Desc), Just(Order::Asc)),
limit in 1..256_usize,
offset in 0..256_usize,
segments_terms in
proptest::collection::vec(
proptest::collection::vec(0..32_u8, 1..32_usize),
0..8_usize,
)
) {
let mut schema_builder = Schema::builder();
// "city" is both indexed text and a fast field so it can be sorted on.
let city = schema_builder.add_text_field("city", TEXT | FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
// A Vec<Vec<u8>>, where the outer Vec represents segments, and the inner Vec
// represents terms.
for segment_terms in segments_terms.into_iter() {
for term in segment_terms.into_iter() {
// Zero-pad to width 3 (values are 0..32) so that lexicographic
// string order agrees with numeric order of the generated terms.
let term = format!("{term:0>3}");
index_writer.add_document(doc!(
city => term,
))?;
}
// One commit per outer Vec: each batch becomes its own segment.
index_writer.commit()?;
}
let searcher = index.reader()?.searcher();
// Result under test: top-N ordered by the "city" fast field.
let top_n_results = searcher.search(&AllQuery, &TopDocs::with_limit(limit)
.and_offset(offset)
.order_by_string_fast_field("city", order.clone()))?;
// Reference result: collect every doc, then resolve each doc's term by
// looking its ordinal up in the segment's fast-field dictionary.
let all_results = searcher.search(&AllQuery, &DocSetCollector)?.into_iter().map(|doc_address| {
// Get the term for this address.
// NOTE: We can't determine the SegmentIds that will be generated for Segments
// ahead of time, so we can't pre-compute the expected `DocAddress`es.
let column = searcher.segment_readers()[doc_address.segment_ord as usize].fast_fields().str("city").unwrap().unwrap();
let term_ord = column.term_ords(doc_address.doc_id).next().unwrap();
let mut city = Vec::new();
column.dictionary().ord_to_term(term_ord, &mut city).unwrap();
// `city` holds the raw term bytes; the terms written above are ASCII,
// so the UTF-8 conversion cannot fail here.
(String::try_from(city).unwrap(), doc_address)
});
// Using the TopDocs collector should always be equivalent to sorting, skipping the
// offset, and then taking the limit.
// NOTE(review): the const bool on `ComparableDoc` presumably selects the
// comparison direction (true = descending) — mirrors the is_desc() branch.
let sorted_docs: Vec<_> = if order.is_desc() {
let mut comparable_docs: Vec<ComparableDoc<_, _, true>> =
all_results.into_iter().map(|(feature, doc)| ComparableDoc { feature, doc}).collect();
comparable_docs.sort();
comparable_docs.into_iter().map(|cd| (cd.feature, cd.doc)).collect()
} else {
let mut comparable_docs: Vec<ComparableDoc<_, _, false>> =
all_results.into_iter().map(|(feature, doc)| ComparableDoc { feature, doc}).collect();
comparable_docs.sort();
comparable_docs.into_iter().map(|cd| (cd.feature, cd.doc)).collect()
};
// Apply the same offset/limit the collector was configured with.
let expected_docs = sorted_docs.into_iter().skip(offset).take(limit).collect::<Vec<_>>();
prop_assert_eq!(
expected_docs,
top_n_results
);
}
}
#[test]
#[should_panic]
fn test_field_does_not_exist() {

View File

@@ -515,7 +515,13 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
let mut current_sstable_delta_reader =
self.sstable_delta_reader_block(current_block_addr.clone())?;
let mut current_ordinal = 0;
let mut prev_ord = None;
for ord in ord {
// only advance forward if the new ord is different from the one we just processed
//
// this allows the input TermOrdinal iterator to contain duplicates, so long as it's
// still sorted
if Some(ord) != prev_ord {
assert!(ord >= current_ordinal);
// check if block changed for new term_ord
let new_block_addr = self.sstable_index.get_block_with_ord(ord);
@@ -527,6 +533,9 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
bytes.clear();
}
prev_ord = Some(ord);
}
// move to ord inside that block
for _ in current_ordinal..=ord {
if !current_sstable_delta_reader.advance()? {