From 8a71e00da389b3bd6d3f7a0253a347ac3ef57a5a Mon Sep 17 00:00:00 2001 From: trinity-1686a Date: Mon, 27 Feb 2023 10:44:08 +0100 Subject: [PATCH] allow limiting the number of matched term in range query (#1899) --- .../phrase_prefix_query.rs | 7 ++- src/query/query_parser/query_parser.rs | 2 +- src/query/range_query/range_query.rs | 60 +++++++++++++++++++ 3 files changed, 65 insertions(+), 4 deletions(-) diff --git a/src/query/phrase_prefix_query/phrase_prefix_query.rs b/src/query/phrase_prefix_query/phrase_prefix_query.rs index f00e8e50f..0c24c9312 100644 --- a/src/query/phrase_prefix_query/phrase_prefix_query.rs +++ b/src/query/phrase_prefix_query/phrase_prefix_query.rs @@ -147,7 +147,7 @@ impl Query for PhrasePrefixQuery { Bound::Unbounded }; - RangeQuery::new_term_bounds( + let mut range_query = RangeQuery::new_term_bounds( enable_scoring .schema() .get_field_name(self.field) @@ -155,8 +155,9 @@ impl Query for PhrasePrefixQuery { self.prefix.1.typ(), &Bound::Included(self.prefix.1.clone()), &end_term, - ) - .weight(enable_scoring) + ); + range_query.limit(self.max_expansions as u64); + range_query.weight(enable_scoring) } } diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 55e238246..4a29ff3c6 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -974,7 +974,7 @@ mod test { assert_eq!( format!("{:?}", query), "RangeQuery { field: \"title\", value_type: Str, left_bound: Included([97]), \ - right_bound: Included([98]) }" + right_bound: Included([98]), limit: None }" ); } diff --git a/src/query/range_query/range_query.rs b/src/query/range_query/range_query.rs index c181e840a..83bf6ae8e 100644 --- a/src/query/range_query/range_query.rs +++ b/src/query/range_query/range_query.rs @@ -70,6 +70,7 @@ pub struct RangeQuery { value_type: Type, left_bound: Bound<Vec<u8>>, right_bound: Bound<Vec<u8>>, + limit: Option<u64>, } impl RangeQuery { @@ -89,6 +90,7 @@ impl RangeQuery { value_type, left_bound: 
map_bound(left_bound, &verify_and_unwrap_term), right_bound: map_bound(right_bound, &verify_and_unwrap_term), + limit: None, } } @@ -126,6 +128,7 @@ impl RangeQuery { value_type: Type::I64, left_bound: map_bound(&left_bound, &make_term_val), right_bound: map_bound(&right_bound, &make_term_val), + limit: None, } } @@ -163,6 +166,7 @@ impl RangeQuery { value_type: Type::F64, left_bound: map_bound(&left_bound, &make_term_val), right_bound: map_bound(&right_bound, &make_term_val), + limit: None, } } @@ -188,6 +192,7 @@ impl RangeQuery { value_type: Type::U64, left_bound: map_bound(&left_bound, &make_term_val), right_bound: map_bound(&right_bound, &make_term_val), + limit: None, } } @@ -225,6 +230,7 @@ impl RangeQuery { value_type: Type::Date, left_bound: map_bound(&left_bound, &make_term_val), right_bound: map_bound(&right_bound, &make_term_val), + limit: None, } } @@ -254,6 +260,7 @@ impl RangeQuery { value_type: Type::Str, left_bound: map_bound(&left, &make_term_val), right_bound: map_bound(&right, &make_term_val), + limit: None, } } @@ -273,6 +280,14 @@ impl RangeQuery { pub fn field(&self) -> &str { &self.field } + + /// Limit the number of terms the `RangeQuery` will go through. + /// + /// This does not limit the number of matching documents, only the number of + /// different terms that get matched. 
+ pub(crate) fn limit(&mut self, limit: u64) { + self.limit = Some(limit); + } } /// Returns true if the type maps to a u64 fast field @@ -327,6 +342,7 @@ impl Query for RangeQuery { field: self.field.to_string(), left_bound: self.left_bound.clone(), right_bound: self.right_bound.clone(), + limit: self.limit, })) } } @@ -336,6 +352,7 @@ pub struct RangeWeight { field: String, left_bound: Bound<Vec<u8>>, right_bound: Bound<Vec<u8>>, + limit: Option<u64>, } impl RangeWeight { @@ -352,6 +369,10 @@ impl RangeWeight { Excluded(ref term_val) => term_stream_builder.lt(term_val), Unbounded => term_stream_builder, }; + #[cfg(feature = "quickwit")] + if let Some(limit) = self.limit { + term_stream_builder = term_stream_builder.limit(limit); + } term_stream_builder.into_stream() } } @@ -364,7 +385,14 @@ impl Weight for RangeWeight { let inverted_index = reader.inverted_index(reader.schema().get_field(&self.field)?)?; let term_dict = inverted_index.terms(); let mut term_range = self.term_range(term_dict)?; + let mut processed_count = 0; while term_range.advance() { + if let Some(limit) = self.limit { + if limit <= processed_count { + break; + } + } + processed_count += 1; let term_info = term_range.value(); let mut block_segment_postings = inverted_index .read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic)?; @@ -435,6 +463,38 @@ mod tests { Ok(()) } + #[test] + fn test_range_query_with_limit() -> crate::Result<()> { + let mut schema_builder = Schema::builder(); + let year_field = schema_builder.add_u64_field("year", INDEXED); + let schema = schema_builder.build(); + + let index = Index::create_in_ram(schema); + { + let mut index_writer = index.writer_for_tests()?; + for year in 1950u64..2017u64 { + if year == 1963 { + continue; + } + let num_docs_within_year = 10 + (year - 1950) * (year - 1950); + for _ in 0..num_docs_within_year { + index_writer.add_document(doc!(year_field => year))?; + } + } + index_writer.commit()?; + } + let reader = index.reader()?; + let searcher = 
reader.searcher(); + + let mut docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960u64..1970u64); + docs_in_the_sixties.limit(5); + + // due to the limit and no docs in 1963, it's really only 1960..=1965 + let count = searcher.search(&docs_in_the_sixties, &Count)?; + assert_eq!(count, 836); + Ok(()) + } + #[test] fn test_range_query() -> crate::Result<()> { let int_field: Field;