From e154663158240a8ca2fbc080ba602560028dfb76 Mon Sep 17 00:00:00 2001
From: Pascal Seitz
Date: Wed, 6 Nov 2024 14:00:45 +0800
Subject: [PATCH] seek_exact + cost based intersection

Adds `seek_exact` and `cost` to `DocSet` for a more efficient intersection.

Unlike `seek`, `seek_exact` does not require the DocSet to advance to the
next hit if the target does not exist.

`cost` makes it possible to account for the different DocSet types and their
cost models, and is used to determine the DocSet that drives the
intersection. E.g. fast field range queries may do a full scan, and phrase
queries load the positions to check if we have a hit. Both have a higher
cost than their size_hint would suggest.

Improves the `size_hint` estimation for intersection and union by basing the
estimate on a random distribution with a co-location factor.

Refactors the range query benchmark.

Closes #2531

*Future Work*
Implement `seek_exact` for BufferedUnionScorer and RangeDocSet (fast field
range queries)
Evaluate replacing `seek` with `seek_exact` to reduce code complexity
---
 Cargo.toml                                  |   6 +-
 benches/range_query.rs                      | 260 ++++
 src/docset.rs                               |  37 ++
 src/query/all_query.rs                      |   9 +
 src/query/boolean_query/block_wand.rs       |   2 +-
 src/query/boolean_query/boolean_weight.rs   |   2 +
 src/query/boost_query.rs                    |   3 +
 src/query/disjunction.rs                    |  10 +
 src/query/intersection.rs                   |  87 +++-
 .../phrase_prefix_scorer.rs                 |   8 +
 src/query/phrase_query/phrase_scorer.rs     |  28 +-
 .../range_query/fast_field_range_doc_set.rs |  31 +-
 .../range_query/range_query_fastfield.rs    | 446 ------------------
 src/query/reqopt_scorer.rs                  |   5 +
 src/query/union/buffered_union.rs           |   3 +-
 src/query/union/simple_union.rs             |   1 +
 16 files changed, 461 insertions(+), 477 deletions(-)
 create mode 100644 benches/range_query.rs

diff --git a/Cargo.toml b/Cargo.toml
index dfbb1ea1c..57536482d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -75,7 +75,7 @@ typetag = "0.2.21"
 winapi = "0.3.9"
 
 [dev-dependencies]
-binggan = "0.14.0"
+binggan = "0.14.2"
 rand = "0.8.5"
 maplit = "1.0.2"
 matches = "0.1.9"
@@ -173,6 +173,10 @@ harness = false
 name = "exists_json"
 harness = false
 
+[[bench]]
+name = "range_query"
+harness = false
+
 [[bench]]
 name = "and_or_queries"
 harness = false
diff --git a/benches/range_query.rs b/benches/range_query.rs
new file mode 100644
index 000000000..255b35ef3
--- /dev/null
+++ b/benches/range_query.rs
@@ -0,0 +1,260 @@
+use std::fmt::Display;
+use std::net::Ipv6Addr;
+use std::ops::RangeInclusive;
+
+use binggan::plugins::PeakMemAllocPlugin;
+use binggan::{black_box, BenchRunner, OutputValue, PeakMemAlloc, INSTRUMENTED_SYSTEM};
+use columnar::MonotonicallyMappableToU128;
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
+use tantivy::collector::{Count, TopDocs};
+use tantivy::query::QueryParser;
+use tantivy::schema::*;
+use tantivy::{doc, Index};
+
+#[global_allocator]
+pub static GLOBAL: &PeakMemAlloc = &INSTRUMENTED_SYSTEM;
+
+fn main() {
+    bench_range_query();
+}
+
+fn bench_range_query() {
+    let index = get_index_0_to_100();
+    let mut runner = BenchRunner::new();
+    runner.add_plugin(PeakMemAllocPlugin::new(GLOBAL));
+
+    runner.set_name("range_query on u64");
+    let field_name_and_descr: Vec<_> = vec![
+        ("id", "Single Valued Range Field"),
+        ("ids", "Multi Valued Range Field"),
+    ];
+    let range_num_hits = vec![
+        ("90_percent", get_90_percent()),
+        ("10_percent", get_10_percent()),
+        ("1_percent", get_1_percent()),
+    ];
+
+    test_range(&mut runner, &index, &field_name_and_descr, range_num_hits);
+
+    runner.set_name("range_query on ip");
+    let field_name_and_descr:
Vec<_> = vec![ + ("ip", "Single Valued Range Field"), + ("ips", "Multi Valued Range Field"), + ]; + let range_num_hits = vec![ + ("90_percent", get_90_percent_ip()), + ("10_percent", get_10_percent_ip()), + ("1_percent", get_1_percent_ip()), + ]; + + test_range(&mut runner, &index, &field_name_and_descr, range_num_hits); +} + +fn test_range( + runner: &mut BenchRunner, + index: &Index, + field_name_and_descr: &[(&str, &str)], + range_num_hits: Vec<(&str, RangeInclusive)>, +) { + for (field, suffix) in field_name_and_descr { + let term_num_hits = vec![ + ("", ""), + ("1_percent", "veryfew"), + ("10_percent", "few"), + ("90_percent", "most"), + ]; + let mut group = runner.new_group(); + group.set_name(suffix); + // all intersect combinations + for (range_name, range) in &range_num_hits { + for (term_name, term) in &term_num_hits { + let index = &index; + let test_name = if term_name.is_empty() { + format!("id_range_hit_{}", range_name) + } else { + format!( + "id_range_hit_{}_intersect_with_term_{}", + range_name, term_name + ) + }; + group.register(test_name, move |_| { + let query = if term_name.is_empty() { + "".to_string() + } else { + format!("AND id_name:{}", term) + }; + black_box(execute_query(field, range, &query, index)); + }); + } + } + group.run(); + } +} + +fn get_index_0_to_100() -> Index { + let mut rng = StdRng::from_seed([1u8; 32]); + let num_vals = 100_000; + let docs: Vec<_> = (0..num_vals) + .map(|_i| { + let id_name = if rng.gen_bool(0.01) { + "veryfew".to_string() // 1% + } else if rng.gen_bool(0.1) { + "few".to_string() // 9% + } else { + "most".to_string() // 90% + }; + Doc { + id_name, + id: rng.gen_range(0..100), + // Multiply by 1000, so that we create most buckets in the compact space + // The benches depend on this range to select n-percent of elements with the + // methods below. 
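+                // e.g. a drawn value of 10 becomes the u128 address 10_000, which is also
+                // the upper bound used by `get_10_percent_ip()` below.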
+ ip: Ipv6Addr::from_u128(rng.gen_range(0..100) * 1000), + } + }) + .collect(); + + create_index_from_docs(&docs) +} + +#[derive(Clone, Debug)] +pub struct Doc { + pub id_name: String, + pub id: u64, + pub ip: Ipv6Addr, +} + +pub fn create_index_from_docs(docs: &[Doc]) -> Index { + let mut schema_builder = Schema::builder(); + let id_u64_field = schema_builder.add_u64_field("id", INDEXED | STORED | FAST); + let ids_u64_field = + schema_builder.add_u64_field("ids", NumericOptions::default().set_fast().set_indexed()); + + let id_f64_field = schema_builder.add_f64_field("id_f64", INDEXED | STORED | FAST); + let ids_f64_field = schema_builder.add_f64_field( + "ids_f64", + NumericOptions::default().set_fast().set_indexed(), + ); + + let id_i64_field = schema_builder.add_i64_field("id_i64", INDEXED | STORED | FAST); + let ids_i64_field = schema_builder.add_i64_field( + "ids_i64", + NumericOptions::default().set_fast().set_indexed(), + ); + + let text_field = schema_builder.add_text_field("id_name", STRING | STORED); + let text_field2 = schema_builder.add_text_field("id_name_fast", STRING | STORED | FAST); + + let ip_field = schema_builder.add_ip_addr_field("ip", FAST); + let ips_field = schema_builder.add_ip_addr_field("ips", FAST); + + let schema = schema_builder.build(); + + let index = Index::create_in_ram(schema); + + { + let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap(); + for doc in docs.iter() { + index_writer + .add_document(doc!( + ids_i64_field => doc.id as i64, + ids_i64_field => doc.id as i64, + ids_f64_field => doc.id as f64, + ids_f64_field => doc.id as f64, + ids_u64_field => doc.id, + ids_u64_field => doc.id, + id_u64_field => doc.id, + id_f64_field => doc.id as f64, + id_i64_field => doc.id as i64, + text_field => doc.id_name.to_string(), + text_field2 => doc.id_name.to_string(), + ips_field => doc.ip, + ips_field => doc.ip, + ip_field => doc.ip, + )) + .unwrap(); + } + + index_writer.commit().unwrap(); + } + index +} + +fn get_90_percent() -> RangeInclusive { + 0..=90 +} + +fn get_10_percent() -> RangeInclusive { + 0..=10 +} + +fn get_1_percent() -> RangeInclusive { + 10..=10 +} + +fn get_90_percent_ip() -> RangeInclusive { + let start = Ipv6Addr::from_u128(0); + let end = Ipv6Addr::from_u128(90 * 1000); + start..=end +} + +fn get_10_percent_ip() -> RangeInclusive { + let start = Ipv6Addr::from_u128(0); + let end = Ipv6Addr::from_u128(10 * 1000); + start..=end +} + +fn get_1_percent_ip() -> RangeInclusive { + let start = Ipv6Addr::from_u128(10 * 1000); + let end = Ipv6Addr::from_u128(10 * 1000); + start..=end +} + +struct NumHits { + count: usize, +} +impl OutputValue for NumHits { + fn column_title() -> &'static str { + "NumHits" + } + fn format(&self) -> Option { + Some(self.count.to_string()) + } +} + +fn execute_query( + field: &str, + id_range: &RangeInclusive, + suffix: &str, + index: &Index, +) -> NumHits { + let gen_query_inclusive = |from: &T, to: &T| { + format!( + "{}:[{} TO {}] {}", + field, + &from.to_string(), + &to.to_string(), + suffix + ) + }; + + let query = gen_query_inclusive(id_range.start(), id_range.end()); + execute_query_(&query, index) +} + +fn execute_query_(query: &str, index: &Index) -> NumHits { + let query_from_text = |text: &str| { + QueryParser::for_index(index, vec![]) + .parse_query(text) + .unwrap() + }; + let query = query_from_text(query); + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + let num_hits = searcher + .search(&query, &(TopDocs::with_limit(10), Count)) + .unwrap() + .1; + 
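+    // Wrap the hit count so binggan can report it as the benchmark's output column.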
NumHits { count: num_hits }
+}
diff --git a/src/docset.rs b/src/docset.rs
index 7de138da6..be71e7422 100644
--- a/src/docset.rs
+++ b/src/docset.rs
@@ -49,6 +49,25 @@ pub trait DocSet: Send {
         doc
     }
 
+    /// Seeks to the target if possible and returns true if the target is in the DocSet.
+    ///
+    /// Implementations may choose to advance past the target if the target does not exist.
+    ///
+    /// DocSets that already have an efficient `seek` method don't need to implement `seek_exact`.
+    /// All wrapper DocSets should forward `seek_exact` to the underlying DocSet.
+    ///
+    /// ## API Behaviour
+    /// If `seek_exact` returns true, a call to `doc()` has to return the target.
+    /// If `seek_exact` returns false, a call to `doc()` may return the previous doc,
+    /// which may be lower than the target.
+    fn seek_exact(&mut self, target: DocId) -> bool {
+        let current_doc = self.doc();
+        if current_doc < target {
+            self.seek(target);
+        }
+        self.doc() == target
+    }
+
     /// Fills a given mutable buffer with the next doc ids from the
     /// `DocSet`
     ///
@@ -94,6 +113,15 @@ pub trait DocSet: Send {
     /// which would be the number of documents in the DocSet.
     ///
     /// By default this returns `size_hint()`.
+    ///
+    /// DocSets may have vastly different costs depending on their type,
+    /// e.g. an intersection with 10 hits is much cheaper than
+    /// a phrase search with 10 hits, since the latter needs to load positions.
+    ///
+    /// ### Future Work
+    /// We may want to differentiate `DocSet` costs at a more granular level, e.g.
+    /// creation_cost, advance_cost and seek_cost, to get a better estimate of
+    /// which query types to choose.
     fn cost(&self) -> u64 {
         self.size_hint() as u64
     }
@@ -137,6 +165,10 @@ impl DocSet for &mut dyn DocSet {
         (**self).seek(target)
     }
 
+    fn seek_exact(&mut self, target: DocId) -> bool {
+        (**self).seek_exact(target)
+    }
+
     fn doc(&self) -> u32 {
         (**self).doc()
     }
@@ -169,6 +201,11 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
         unboxed.seek(target)
     }
 
+    fn seek_exact(&mut self, target: DocId) -> bool {
+        let unboxed: &mut TDocSet = self.borrow_mut();
+        unboxed.seek_exact(target)
+    }
+
     fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize {
         let unboxed: &mut TDocSet = self.borrow_mut();
         unboxed.fill_buffer(buffer)
diff --git a/src/query/all_query.rs b/src/query/all_query.rs
index 16a83ec56..612d7408c 100644
--- a/src/query/all_query.rs
+++ b/src/query/all_query.rs
@@ -62,6 +62,15 @@ impl DocSet for AllScorer {
         self.doc
     }
 
+    fn seek(&mut self, target: DocId) -> DocId {
+        debug_assert!(target >= self.doc);
+        self.doc = target;
+        if self.doc >= self.max_doc {
+            self.doc = TERMINATED;
+        }
+        self.doc
+    }
+
     fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize {
         if self.doc() == TERMINATED {
             return 0;
diff --git a/src/query/boolean_query/block_wand.rs b/src/query/boolean_query/block_wand.rs
index c6710b09c..6b2f2d6e3 100644
--- a/src/query/boolean_query/block_wand.rs
+++ b/src/query/boolean_query/block_wand.rs
@@ -483,7 +483,7 @@ mod tests {
         let checkpoints_for_each_pruning =
             compute_checkpoints_for_each_pruning(term_scorers.clone(), top_k);
         let checkpoints_manual =
-            compute_checkpoints_manual(term_scorers.clone(), top_k, 100_000);
+            compute_checkpoints_manual(term_scorers.clone(), top_k, max_doc as u32);
         assert_eq!(checkpoints_for_each_pruning.len(), checkpoints_manual.len());
         for (&(left_doc, left_score), &(right_doc, right_score)) in checkpoints_for_each_pruning
             .iter()
diff --git a/src/query/boolean_query/boolean_weight.rs b/src/query/boolean_query/boolean_weight.rs
index
c46e9b0b1..96ce36994 100644 --- a/src/query/boolean_query/boolean_weight.rs +++ b/src/query/boolean_query/boolean_weight.rs @@ -467,6 +467,7 @@ impl Weight for BooleanWeight crate::Result<()> { + let num_docs = reader.num_docs(); let scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn)?; match scorer { SpecializedScorer::TermUnion(term_scorers) => { @@ -489,6 +490,7 @@ impl Weight for BooleanWeight crate::Result<()> { + let num_docs = reader.num_docs(); let scorer = self.complex_scorer(reader, 1.0, || DoNothingCombiner)?; let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN]; diff --git a/src/query/boost_query.rs b/src/query/boost_query.rs index 06678287f..da442b323 100644 --- a/src/query/boost_query.rs +++ b/src/query/boost_query.rs @@ -104,6 +104,9 @@ impl DocSet for BoostScorer { fn seek(&mut self, target: DocId) -> DocId { self.underlying.seek(target) } + fn seek_exact(&mut self, target: DocId) -> bool { + self.underlying.seek_exact(target) + } fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize { self.underlying.fill_buffer(buffer) diff --git a/src/query/disjunction.rs b/src/query/disjunction.rs index 910e207df..f34394e0f 100644 --- a/src/query/disjunction.rs +++ b/src/query/disjunction.rs @@ -62,6 +62,16 @@ impl DocSet for ScorerWrapper { self.current_doc = doc_id; doc_id } + fn seek(&mut self, target: DocId) -> DocId { + let doc_id = self.scorer.seek(target); + self.current_doc = doc_id; + doc_id + } + fn seek_exact(&mut self, target: DocId) -> bool { + let found = self.scorer.seek_exact(target); + self.current_doc = self.scorer.doc(); + found + } fn doc(&self) -> DocId { self.current_doc diff --git a/src/query/intersection.rs b/src/query/intersection.rs index 10e257c43..419f2c5ff 100644 --- a/src/query/intersection.rs +++ b/src/query/intersection.rs @@ -1,3 +1,4 @@ +use super::size_hint::estimate_intersection; use crate::docset::{DocSet, TERMINATED}; use crate::query::size_hint::estimate_intersection; use crate::query::term_query::TermScorer; @@ -105,32 +106,39 @@ impl DocSet for Intersection DocId { let (left, right) = (&mut self.left, &mut self.right); let mut candidate = left.advance(); + if candidate == TERMINATED { + return TERMINATED; + } - 'outer: loop { + loop { // In the first part we look for a document in the intersection // of the two rarest `DocSet` in the intersection. loop { - let right_doc = right.seek(candidate); - candidate = left.seek(right_doc); - if candidate == right_doc { + if right.seek_exact(candidate) { break; } + // `left.advance().max(right.doc())` yielded a regression in the search game + // benchmark It may make sense in certain scenarios though. 
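+                // A miss: `seek_exact` only tells us that `candidate` is absent from `right`;
+                // unlike `seek`, it did not have to position `right` on the next hit.
+                // Pull the next candidate from the driving (left) DocSet instead.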
+ candidate = left.advance(); + if candidate == TERMINATED { + return TERMINATED; + } } debug_assert_eq!(left.doc(), right.doc()); - // test the remaining scorers; - for docset in self.others.iter_mut() { - let seek_doc = docset.seek(candidate); - if seek_doc > candidate { - candidate = left.seek(seek_doc); - continue 'outer; - } + // test the remaining scorers + if self + .others + .iter_mut() + .all(|docset| docset.seek_exact(candidate)) + { + debug_assert_eq!(candidate, self.left.doc()); + debug_assert_eq!(candidate, self.right.doc()); + debug_assert!(self.others.iter().all(|docset| docset.doc() == candidate)); + return candidate; } - debug_assert_eq!(candidate, self.left.doc()); - debug_assert_eq!(candidate, self.right.doc()); - debug_assert!(self.others.iter().all(|docset| docset.doc() == candidate)); - return candidate; + candidate = left.advance(); } } @@ -146,6 +154,19 @@ impl DocSet for Intersection bool { + self.left.seek_exact(target) + && self.right.seek_exact(target) + && self + .others + .iter_mut() + .all(|docset| docset.seek_exact(target)) + } + fn doc(&self) -> DocId { self.left.doc() } @@ -181,6 +202,8 @@ where #[cfg(test)] mod tests { + use proptest::prelude::*; + use super::Intersection; use crate::docset::{DocSet, TERMINATED}; use crate::postings::tests::test_skip_against_unoptimized; @@ -270,4 +293,38 @@ mod tests { let intersection = Intersection::new(vec![a, b, c], 10); assert_eq!(intersection.doc(), TERMINATED); } + + // Strategy to generate sorted and deduplicated vectors of u32 document IDs + fn sorted_deduped_vec(max_val: u32, max_size: usize) -> impl Strategy> { + prop::collection::vec(0..max_val, 0..max_size).prop_map(|mut vec| { + vec.sort(); + vec.dedup(); + vec + }) + } + + proptest! { + #[test] + fn prop_test_intersection_consistency( + a in sorted_deduped_vec(100, 10), + b in sorted_deduped_vec(100, 10), + num_docs in 100u32..500u32 + ) { + let left = VecDocSet::from(a.clone()); + let right = VecDocSet::from(b.clone()); + let mut intersection = Intersection::new(vec![left, right], num_docs); + + let expected: Vec = a.iter() + .cloned() + .filter(|doc| b.contains(doc)) + .collect(); + + for expected_doc in expected { + assert_eq!(intersection.doc(), expected_doc); + intersection.advance(); + } + assert_eq!(intersection.doc(), TERMINATED); + } + + } } diff --git a/src/query/phrase_prefix_query/phrase_prefix_scorer.rs b/src/query/phrase_prefix_query/phrase_prefix_scorer.rs index 14933f3ae..aa6f26d26 100644 --- a/src/query/phrase_prefix_query/phrase_prefix_scorer.rs +++ b/src/query/phrase_prefix_query/phrase_prefix_scorer.rs @@ -193,6 +193,14 @@ impl DocSet for PhrasePrefixScorer { self.advance() } + fn seek_exact(&mut self, target: DocId) -> bool { + if self.phrase_scorer.seek_exact(target) { + self.matches_prefix() + } else { + false + } + } + fn doc(&self) -> DocId { self.phrase_scorer.doc() } diff --git a/src/query/phrase_query/phrase_scorer.rs b/src/query/phrase_query/phrase_scorer.rs index 12a94dce3..f0f87c9fe 100644 --- a/src/query/phrase_query/phrase_scorer.rs +++ b/src/query/phrase_query/phrase_scorer.rs @@ -382,8 +382,9 @@ impl PhraseScorer { PostingsWithOffset::new(postings, (max_offset - offset) as u32) }) .collect::>(); + let intersection_docset = Intersection::new(postings_with_offsets, num_docs); let mut scorer = PhraseScorer { - intersection_docset: Intersection::new(postings_with_offsets, num_docs), + intersection_docset, num_terms: num_docsets, left_positions: Vec::with_capacity(100), right_positions: Vec::with_capacity(100), @@ -529,12 
+530,35 @@ impl DocSet for PhraseScorer {
         self.advance()
     }
 
+    fn seek_exact(&mut self, target: DocId) -> bool {
+        debug_assert!(target >= self.doc());
+        if self.intersection_docset.seek_exact(target) && self.phrase_match() {
+            return true;
+        }
+        false
+    }
+
     fn doc(&self) -> DocId {
         self.intersection_docset.doc()
     }
 
     fn size_hint(&self) -> u32 {
-        self.intersection_docset.size_hint()
+        // We adjust the intersection estimate, since actual phrase hits are much rarer than
+        // the docs in which all of the terms appear.
+        // The estimate should depend on the average field length, e.g. if the field is really
+        // short, a phrase hit is more likely.
+        self.intersection_docset.size_hint() / (10 * self.num_terms as u32)
+    }
+
+    /// Returns a best-effort hint of the
+    /// cost to drive the docset.
+    fn cost(&self) -> u64 {
+        // While determining a potential hit is cheap for phrases, evaluating an actual hit is
+        // expensive, since it requires loading the positions for a doc and checking whether
+        // they are next to each other.
+        // So the cost estimate is the number of times we need to check whether a doc is a hit *
+        // 10 * self.num_terms.
+        self.intersection_docset.size_hint() as u64 * 10 * self.num_terms as u64
     }
 
     /// Returns a best-effort hint of the
diff --git a/src/query/range_query/fast_field_range_doc_set.rs b/src/query/range_query/fast_field_range_doc_set.rs
index 5a76f7e9d..ec8e97ef3 100644
--- a/src/query/range_query/fast_field_range_doc_set.rs
+++ b/src/query/range_query/fast_field_range_doc_set.rs
@@ -92,6 +92,9 @@ impl RangeDocSet {
 
     /// Returns true if more data could be fetched
     fn fetch_block(&mut self) {
+        if self.next_fetch_start >= self.column.num_docs() {
+            return;
+        }
         const MAX_HORIZON: u32 = 100_000;
         while self.loaded_docs.is_empty() {
             let finished_to_end = self.fetch_horizon(self.fetch_horizon);
@@ -116,10 +119,10 @@ impl RangeDocSet {
 
     fn fetch_horizon(&mut self, horizon: u32) -> bool {
         let mut finished_to_end = false;
-        let limit = self.column.num_docs();
-        let mut end = self.next_fetch_start + horizon;
-        if end >= limit {
-            end = limit;
+        let num_docs = self.column.num_docs();
+        let mut fetch_end = self.next_fetch_start + horizon;
+        if fetch_end >= num_docs {
+            fetch_end = num_docs;
             finished_to_end = true;
         }
 
@@ -127,7 +130,7 @@ impl RangeDocSet {
         let doc_buffer: &mut Vec<DocId> = self.loaded_docs.get_cleared_data();
         self.column.get_docids_for_value_range(
            self.value_range.clone(),
-            self.next_fetch_start..end,
+            self.next_fetch_start..fetch_end,
            doc_buffer,
        );
        if let Some(last_doc) = last_doc {
@@ -135,7 +138,7 @@ impl RangeDocSet {
                self.loaded_docs.next();
            }
        }
-        self.next_fetch_start = end;
+        self.next_fetch_start = fetch_end;
        finished_to_end
    }
 
@@ -147,9 +150,6 @@ impl DocSet for RangeDocSet {
        if let Some(docid) = self.loaded_docs.next() {
            return docid;
        }
-        if self.next_fetch_start >= self.column.num_docs() {
-            return TERMINATED;
-        }
        self.fetch_block();
        self.loaded_docs.current().unwrap_or(TERMINATED)
    }
@@ -185,7 +185,18 @@ impl DocSet for RangeDocSet {
    }
 
    fn size_hint(&self) -> u32 {
-        self.column.num_docs()
+        // TODO: Implement a better size hint
+        self.column.num_docs() / 10
+    }
+
+    /// Returns a best-effort hint of the
+    /// cost to drive the docset.
+    fn cost(&self) -> u64 {
+        // Advancing the docset is pretty expensive since it scans the whole column; there is
+        // no index currently (this will change with a k-d tree).
+        // Since we use SIMD to scan the fast field range query, we lower the cost a little bit.
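+        // The 0.8 factor below is a rough heuristic for that SIMD speed-up, not a measured
+        // constant.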
+ // Ideally this would take the fast field codec into account + (self.column.num_docs() as f64 * 0.8) as u64 } /// Returns a best-effort hint of the diff --git a/src/query/range_query/range_query_fastfield.rs b/src/query/range_query/range_query_fastfield.rs index 54cf0cad5..e379e108e 100644 --- a/src/query/range_query/range_query_fastfield.rs +++ b/src/query/range_query/range_query_fastfield.rs @@ -1598,449 +1598,3 @@ pub(crate) mod ip_range_tests { Ok(()) } } - -#[cfg(all(test, feature = "unstable"))] -mod bench { - - use rand::rngs::StdRng; - use rand::{Rng, SeedableRng}; - use test::Bencher; - - use super::tests::*; - use super::*; - use crate::collector::Count; - use crate::query::QueryParser; - use crate::Index; - - fn get_index_0_to_100() -> Index { - let mut rng = StdRng::from_seed([1u8; 32]); - let num_vals = 100_000; - let docs: Vec<_> = (0..num_vals) - .map(|_i| { - let id_name = if rng.gen_bool(0.01) { - "veryfew".to_string() // 1% - } else if rng.gen_bool(0.1) { - "few".to_string() // 9% - } else { - "many".to_string() // 90% - }; - Doc { - id_name, - id: rng.gen_range(0..100), - } - }) - .collect(); - - create_index_from_docs(&docs, false) - } - - fn get_90_percent() -> RangeInclusive { - 0..=90 - } - - fn get_10_percent() -> RangeInclusive { - 0..=10 - } - - fn get_1_percent() -> RangeInclusive { - 10..=10 - } - - fn execute_query( - field: &str, - id_range: RangeInclusive, - suffix: &str, - index: &Index, - ) -> usize { - let gen_query_inclusive = |from: &u64, to: &u64| { - format!( - "{}:[{} TO {}] {}", - field, - &from.to_string(), - &to.to_string(), - suffix - ) - }; - - let query = gen_query_inclusive(id_range.start(), id_range.end()); - let query_from_text = |text: &str| { - QueryParser::for_index(index, vec![]) - .parse_query(text) - .unwrap() - }; - let query = query_from_text(&query); - let reader = index.reader().unwrap(); - let searcher = reader.searcher(); - searcher.search(&query, &(Count)).unwrap() - } - - #[bench] - fn bench_id_range_hit_90_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("id", get_90_percent(), "", &index)); - } - - #[bench] - fn bench_id_range_hit_10_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("id", get_10_percent(), "", &index)); - } - - #[bench] - fn bench_id_range_hit_1_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("id", get_1_percent(), "", &index)); - } - - #[bench] - fn bench_id_range_hit_10_percent_intersect_with_10_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("id", get_10_percent(), "AND id_name:few", &index)); - } - - #[bench] - fn bench_id_range_hit_1_percent_intersect_with_10_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("id", get_1_percent(), "AND id_name:few", &index)); - } - - #[bench] - fn bench_id_range_hit_1_percent_intersect_with_90_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("id", get_1_percent(), "AND id_name:many", &index)); - } - - #[bench] - fn bench_id_range_hit_1_percent_intersect_with_1_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("id", get_1_percent(), "AND id_name:veryfew", &index)); - } - - #[bench] - fn bench_id_range_hit_10_percent_intersect_with_90_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("id", 
get_10_percent(), "AND id_name:many", &index)); - } - - #[bench] - fn bench_id_range_hit_90_percent_intersect_with_90_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("id", get_90_percent(), "AND id_name:many", &index)); - } - - #[bench] - fn bench_id_range_hit_90_percent_intersect_with_10_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("id", get_90_percent(), "AND id_name:few", &index)); - } - - #[bench] - fn bench_id_range_hit_90_percent_intersect_with_1_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("id", get_90_percent(), "AND id_name:veryfew", &index)); - } - - #[bench] - fn bench_id_range_hit_90_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("ids", get_90_percent(), "", &index)); - } - - #[bench] - fn bench_id_range_hit_10_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("ids", get_10_percent(), "", &index)); - } - - #[bench] - fn bench_id_range_hit_1_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("ids", get_1_percent(), "", &index)); - } - - #[bench] - fn bench_id_range_hit_10_percent_intersect_with_10_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("ids", get_10_percent(), "AND id_name:few", &index)); - } - - #[bench] - fn bench_id_range_hit_1_percent_intersect_with_10_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("ids", get_1_percent(), "AND id_name:few", &index)); - } - - #[bench] - fn bench_id_range_hit_1_percent_intersect_with_90_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("ids", get_1_percent(), "AND id_name:many", &index)); - } - - #[bench] - fn bench_id_range_hit_1_percent_intersect_with_1_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("ids", get_1_percent(), "AND id_name:veryfew", &index)); - } - - #[bench] - fn bench_id_range_hit_10_percent_intersect_with_90_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("ids", get_10_percent(), "AND id_name:many", &index)); - } - - #[bench] - fn bench_id_range_hit_90_percent_intersect_with_90_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("ids", get_90_percent(), "AND id_name:many", &index)); - } - - #[bench] - fn bench_id_range_hit_90_percent_intersect_with_10_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("ids", get_90_percent(), "AND id_name:few", &index)); - } - - #[bench] - fn bench_id_range_hit_90_percent_intersect_with_1_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("ids", get_90_percent(), "AND id_name:veryfew", &index)); - } -} - -#[cfg(all(test, feature = "unstable"))] -mod bench_ip { - - use rand::rngs::StdRng; - use rand::{Rng, SeedableRng}; - use test::Bencher; - - use super::ip_range_tests::*; - use super::*; - use crate::collector::Count; - use crate::query::QueryParser; - use crate::Index; - - fn get_index_0_to_100() -> Index { - let mut rng = StdRng::from_seed([1u8; 32]); - let num_vals = 100_000; - let docs: Vec<_> = (0..num_vals) - .map(|_i| { - let id = if 
rng.gen_bool(0.01) { - "veryfew".to_string() // 1% - } else if rng.gen_bool(0.1) { - "few".to_string() // 9% - } else { - "many".to_string() // 90% - }; - Doc { - id, - // Multiply by 1000, so that we create many buckets in the compact space - // The benches depend on this range to select n-percent of elements with the - // methods below. - ip: Ipv6Addr::from_u128(rng.gen_range(0..100) * 1000), - } - }) - .collect(); - - create_index_from_ip_docs(&docs) - } - - fn get_90_percent() -> RangeInclusive { - let start = Ipv6Addr::from_u128(0); - let end = Ipv6Addr::from_u128(90 * 1000); - start..=end - } - - fn get_10_percent() -> RangeInclusive { - let start = Ipv6Addr::from_u128(0); - let end = Ipv6Addr::from_u128(10 * 1000); - start..=end - } - - fn get_1_percent() -> RangeInclusive { - let start = Ipv6Addr::from_u128(10 * 1000); - let end = Ipv6Addr::from_u128(10 * 1000); - start..=end - } - - fn execute_query( - field: &str, - ip_range: RangeInclusive, - suffix: &str, - index: &Index, - ) -> usize { - let gen_query_inclusive = |from: &Ipv6Addr, to: &Ipv6Addr| { - format!( - "{}:[{} TO {}] {}", - field, - &from.to_string(), - &to.to_string(), - suffix - ) - }; - - let query = gen_query_inclusive(ip_range.start(), ip_range.end()); - let query_from_text = |text: &str| { - QueryParser::for_index(index, vec![]) - .parse_query(text) - .unwrap() - }; - let query = query_from_text(&query); - let reader = index.reader().unwrap(); - let searcher = reader.searcher(); - searcher.search(&query, &(Count)).unwrap() - } - - #[bench] - fn bench_ip_range_hit_90_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ip", get_90_percent(), "", &index)); - } - - #[bench] - fn bench_ip_range_hit_10_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ip", get_10_percent(), "", &index)); - } - - #[bench] - fn bench_ip_range_hit_1_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ip", get_1_percent(), "", &index)); - } - - #[bench] - fn bench_ip_range_hit_10_percent_intersect_with_10_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ip", get_10_percent(), "AND id:few", &index)); - } - - #[bench] - fn bench_ip_range_hit_1_percent_intersect_with_10_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ip", get_1_percent(), "AND id:few", &index)); - } - - #[bench] - fn bench_ip_range_hit_1_percent_intersect_with_90_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ip", get_1_percent(), "AND id:many", &index)); - } - - #[bench] - fn bench_ip_range_hit_1_percent_intersect_with_1_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ip", get_1_percent(), "AND id:veryfew", &index)); - } - - #[bench] - fn bench_ip_range_hit_10_percent_intersect_with_90_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ip", get_10_percent(), "AND id:many", &index)); - } - - #[bench] - fn bench_ip_range_hit_90_percent_intersect_with_90_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ip", get_90_percent(), "AND id:many", &index)); - } - - #[bench] - fn bench_ip_range_hit_90_percent_intersect_with_10_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ip", 
get_90_percent(), "AND id:few", &index)); - } - - #[bench] - fn bench_ip_range_hit_90_percent_intersect_with_1_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ip", get_90_percent(), "AND id:veryfew", &index)); - } - - #[bench] - fn bench_ip_range_hit_90_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ips", get_90_percent(), "", &index)); - } - - #[bench] - fn bench_ip_range_hit_10_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ips", get_10_percent(), "", &index)); - } - - #[bench] - fn bench_ip_range_hit_1_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ips", get_1_percent(), "", &index)); - } - - #[bench] - fn bench_ip_range_hit_10_percent_intersect_with_10_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ips", get_10_percent(), "AND id:few", &index)); - } - - #[bench] - fn bench_ip_range_hit_1_percent_intersect_with_10_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ips", get_1_percent(), "AND id:few", &index)); - } - - #[bench] - fn bench_ip_range_hit_1_percent_intersect_with_90_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("ips", get_1_percent(), "AND id:many", &index)); - } - - #[bench] - fn bench_ip_range_hit_1_percent_intersect_with_1_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ips", get_1_percent(), "AND id:veryfew", &index)); - } - - #[bench] - fn bench_ip_range_hit_10_percent_intersect_with_90_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ips", get_10_percent(), "AND id:many", &index)); - } - - #[bench] - fn bench_ip_range_hit_90_percent_intersect_with_90_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ips", get_90_percent(), "AND id:many", &index)); - } - - #[bench] - fn bench_ip_range_hit_90_percent_intersect_with_10_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ips", get_90_percent(), "AND id:few", &index)); - } - - #[bench] - fn bench_ip_range_hit_90_percent_intersect_with_1_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ips", get_90_percent(), "AND id:veryfew", &index)); - } -} diff --git a/src/query/reqopt_scorer.rs b/src/query/reqopt_scorer.rs index be9e14692..1217eb65a 100644 --- a/src/query/reqopt_scorer.rs +++ b/src/query/reqopt_scorer.rs @@ -56,6 +56,11 @@ where self.req_scorer.seek(target) } + fn seek_exact(&mut self, target: DocId) -> bool { + self.score_cache = None; + self.req_scorer.seek_exact(target) + } + fn doc(&self) -> DocId { self.req_scorer.doc() } diff --git a/src/query/union/buffered_union.rs b/src/query/union/buffered_union.rs index 3c726b8a7..42e376ef0 100644 --- a/src/query/union/buffered_union.rs +++ b/src/query/union/buffered_union.rs @@ -217,8 +217,6 @@ where } } - // TODO Also implement `count` with deletes efficiently. - fn doc(&self) -> DocId { self.doc } @@ -231,6 +229,7 @@ where self.docsets.iter().map(|docset| docset.cost()).sum() } + // TODO Also implement `count` with deletes efficiently. 
fn count_including_deleted(&mut self) -> u32 { if self.doc == TERMINATED { return 0; diff --git a/src/query/union/simple_union.rs b/src/query/union/simple_union.rs index 61cbb94b6..b153a7f22 100644 --- a/src/query/union/simple_union.rs +++ b/src/query/union/simple_union.rs @@ -92,6 +92,7 @@ impl DocSet for SimpleUnion { } fn size_hint(&self) -> u32 { + // TODO: use estimate_union self.docsets .iter() .map(|docset| docset.size_hint())