seek_exact + cost based intersection

Adds `seek_exact` and `cost` to `DocSet` for a more efficient intersection.
Unlike `seek`, `seek_exact` does not require the DocSet to advance to the next hit, if the target does not exist.

`cost` allows to address the different DocSet types and their cost
model and is used to determine the DocSet that drives the intersection.
E.g. fast field range queries may do a full scan. Phrase queries load the positions to check if a we have a hit.
They both have a higher cost than their size_hint would suggest.

Improves `size_hint` estimation for intersection and union, by having a
estimation based on random distribution with a co-location factor.

Refactor range query benchmark.

Closes #2531

*Future Work*

Implement `seek_exact` for BufferedUnionScorer and RangeDocSet (fast field range queries)
Evaluate replacing `seek` with `seek_exact` to reduce code complexity
This commit is contained in:
Pascal Seitz
2024-11-06 14:00:45 +08:00
parent 40659d4d07
commit 2933cfd146
21 changed files with 741 additions and 521 deletions

View File

@@ -74,7 +74,7 @@ fnv = "1.0.7"
winapi = "0.3.9"
[dev-dependencies]
binggan = "0.14.0"
binggan = "0.14.2"
rand = "0.8.5"
maplit = "1.0.2"
matches = "0.1.9"
@@ -172,6 +172,10 @@ harness = false
name = "exists_json"
harness = false
[[bench]]
name = "range_query"
harness = false
[[bench]]
name = "and_or_queries"
harness = false

260
benches/range_query.rs Normal file
View File

@@ -0,0 +1,260 @@
use std::fmt::Display;
use std::net::Ipv6Addr;
use std::ops::RangeInclusive;
use binggan::plugins::PeakMemAllocPlugin;
use binggan::{black_box, BenchRunner, OutputValue, PeakMemAlloc, INSTRUMENTED_SYSTEM};
use columnar::MonotonicallyMappableToU128;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{doc, Index};
#[global_allocator]
pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;
fn main() {
bench_range_query();
}
fn bench_range_query() {
let index = get_index_0_to_100();
let mut runner = BenchRunner::new();
runner.add_plugin(PeakMemAllocPlugin::new(GLOBAL));
runner.set_name("range_query on u64");
let field_name_and_descr: Vec<_> = vec![
("id", "Single Valued Range Field"),
("ids", "Multi Valued Range Field"),
];
let range_num_hits = vec![
("90_percent", get_90_percent()),
("10_percent", get_10_percent()),
("1_percent", get_1_percent()),
];
test_range(&mut runner, &index, &field_name_and_descr, range_num_hits);
runner.set_name("range_query on ip");
let field_name_and_descr: Vec<_> = vec![
("ip", "Single Valued Range Field"),
("ips", "Multi Valued Range Field"),
];
let range_num_hits = vec![
("90_percent", get_90_percent_ip()),
("10_percent", get_10_percent_ip()),
("1_percent", get_1_percent_ip()),
];
test_range(&mut runner, &index, &field_name_and_descr, range_num_hits);
}
fn test_range<T: Display>(
runner: &mut BenchRunner,
index: &Index,
field_name_and_descr: &[(&str, &str)],
range_num_hits: Vec<(&str, RangeInclusive<T>)>,
) {
for (field, suffix) in field_name_and_descr {
let term_num_hits = vec![
("", ""),
("1_percent", "veryfew"),
("10_percent", "few"),
("90_percent", "most"),
];
let mut group = runner.new_group();
group.set_name(suffix);
// all intersect combinations
for (range_name, range) in &range_num_hits {
for (term_name, term) in &term_num_hits {
let index = &index;
let test_name = if term_name.is_empty() {
format!("id_range_hit_{}", range_name)
} else {
format!(
"id_range_hit_{}_intersect_with_term_{}",
range_name, term_name
)
};
group.register(test_name, move |_| {
let query = if term_name.is_empty() {
"".to_string()
} else {
format!("AND id_name:{}", term)
};
black_box(execute_query(field, range, &query, index));
});
}
}
group.run();
}
}
fn get_index_0_to_100() -> Index {
let mut rng = StdRng::from_seed([1u8; 32]);
let num_vals = 100_000;
let docs: Vec<_> = (0..num_vals)
.map(|_i| {
let id_name = if rng.gen_bool(0.01) {
"veryfew".to_string() // 1%
} else if rng.gen_bool(0.1) {
"few".to_string() // 9%
} else {
"most".to_string() // 90%
};
Doc {
id_name,
id: rng.gen_range(0..100),
// Multiply by 1000, so that we create most buckets in the compact space
// The benches depend on this range to select n-percent of elements with the
// methods below.
ip: Ipv6Addr::from_u128(rng.gen_range(0..100) * 1000),
}
})
.collect();
create_index_from_docs(&docs)
}
#[derive(Clone, Debug)]
pub struct Doc {
pub id_name: String,
pub id: u64,
pub ip: Ipv6Addr,
}
pub fn create_index_from_docs(docs: &[Doc]) -> Index {
let mut schema_builder = Schema::builder();
let id_u64_field = schema_builder.add_u64_field("id", INDEXED | STORED | FAST);
let ids_u64_field =
schema_builder.add_u64_field("ids", NumericOptions::default().set_fast().set_indexed());
let id_f64_field = schema_builder.add_f64_field("id_f64", INDEXED | STORED | FAST);
let ids_f64_field = schema_builder.add_f64_field(
"ids_f64",
NumericOptions::default().set_fast().set_indexed(),
);
let id_i64_field = schema_builder.add_i64_field("id_i64", INDEXED | STORED | FAST);
let ids_i64_field = schema_builder.add_i64_field(
"ids_i64",
NumericOptions::default().set_fast().set_indexed(),
);
let text_field = schema_builder.add_text_field("id_name", STRING | STORED);
let text_field2 = schema_builder.add_text_field("id_name_fast", STRING | STORED | FAST);
let ip_field = schema_builder.add_ip_addr_field("ip", FAST);
let ips_field = schema_builder.add_ip_addr_field("ips", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap();
for doc in docs.iter() {
index_writer
.add_document(doc!(
ids_i64_field => doc.id as i64,
ids_i64_field => doc.id as i64,
ids_f64_field => doc.id as f64,
ids_f64_field => doc.id as f64,
ids_u64_field => doc.id,
ids_u64_field => doc.id,
id_u64_field => doc.id,
id_f64_field => doc.id as f64,
id_i64_field => doc.id as i64,
text_field => doc.id_name.to_string(),
text_field2 => doc.id_name.to_string(),
ips_field => doc.ip,
ips_field => doc.ip,
ip_field => doc.ip,
))
.unwrap();
}
index_writer.commit().unwrap();
}
index
}
fn get_90_percent() -> RangeInclusive<u64> {
0..=90
}
fn get_10_percent() -> RangeInclusive<u64> {
0..=10
}
fn get_1_percent() -> RangeInclusive<u64> {
10..=10
}
fn get_90_percent_ip() -> RangeInclusive<Ipv6Addr> {
let start = Ipv6Addr::from_u128(0);
let end = Ipv6Addr::from_u128(90 * 1000);
start..=end
}
fn get_10_percent_ip() -> RangeInclusive<Ipv6Addr> {
let start = Ipv6Addr::from_u128(0);
let end = Ipv6Addr::from_u128(10 * 1000);
start..=end
}
fn get_1_percent_ip() -> RangeInclusive<Ipv6Addr> {
let start = Ipv6Addr::from_u128(10 * 1000);
let end = Ipv6Addr::from_u128(10 * 1000);
start..=end
}
struct NumHits {
count: usize,
}
impl OutputValue for NumHits {
fn column_title() -> &'static str {
"NumHits"
}
fn format(&self) -> Option<String> {
Some(self.count.to_string())
}
}
fn execute_query<T: Display>(
field: &str,
id_range: &RangeInclusive<T>,
suffix: &str,
index: &Index,
) -> NumHits {
let gen_query_inclusive = |from: &T, to: &T| {
format!(
"{}:[{} TO {}] {}",
field,
&from.to_string(),
&to.to_string(),
suffix
)
};
let query = gen_query_inclusive(id_range.start(), id_range.end());
execute_query_(&query, index)
}
fn execute_query_(query: &str, index: &Index) -> NumHits {
let query_from_text = |text: &str| {
QueryParser::for_index(index, vec![])
.parse_query(text)
.unwrap()
};
let query = query_from_text(query);
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let num_hits = searcher
.search(&query, &(TopDocs::with_limit(10), Count))
.unwrap()
.1;
NumHits { count: num_hits }
}

View File

@@ -49,6 +49,25 @@ pub trait DocSet: Send {
doc
}
/// Seeks to the target if possible and returns true if the target is in the DocSet.
///
/// Implementations may choose to advance past the target if target does not exist.
///
/// DocSets that already have an efficient `seek` method don't need to implement `seek_exact`.
/// All wapper DocSets should forward `seek_exact` to the underlying DocSet.
///
/// ## API Behaviour
/// If `seek_exact` is returning true, a call to `doc()` has to return target.
/// If `seek_exact` is returning false, a call to `doc()` may return the previous doc,
/// which may be lower than target.
fn seek_exact(&mut self, target: DocId) -> bool {
let current_doc = self.doc();
if current_doc < target {
self.seek(target);
}
self.doc() == target
}
/// Fills a given mutable buffer with the next doc ids from the
/// `DocSet`
///
@@ -87,6 +106,23 @@ pub trait DocSet: Send {
/// length of the docset.
fn size_hint(&self) -> u32;
/// Returns a best-effort hint of the
/// cost to drive the docset.
///
/// By default this returns `size_hint()`.
///
/// DocSets may have vastly different cost depending on their type,
/// e.g. an intersection with 10 hits is much cheaper than
/// a phrase search with 10 hits, since it needs to load positions.
///
/// ### Future Work
/// We may want to differentiate `DocSet` costs more more granular, e.g.
/// creation_cost, advance_cost, seek_cost on to get a good estimation
/// what query types to choose.
fn cost(&self) -> u64 {
self.size_hint() as u64
}
/// Returns the number documents matching.
/// Calling this method consumes the `DocSet`.
fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
@@ -126,6 +162,10 @@ impl DocSet for &mut dyn DocSet {
(**self).seek(target)
}
fn seek_exact(&mut self, target: DocId) -> bool {
(**self).seek_exact(target)
}
fn doc(&self) -> u32 {
(**self).doc()
}
@@ -134,6 +174,10 @@ impl DocSet for &mut dyn DocSet {
(**self).size_hint()
}
fn cost(&self) -> u64 {
(**self).cost()
}
fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
(**self).count(alive_bitset)
}
@@ -154,6 +198,11 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
unboxed.seek(target)
}
fn seek_exact(&mut self, target: DocId) -> bool {
let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.seek_exact(target)
}
fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize {
let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.fill_buffer(buffer)
@@ -169,6 +218,11 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
unboxed.size_hint()
}
fn cost(&self) -> u64 {
let unboxed: &TDocSet = self.borrow();
unboxed.cost()
}
fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.count(alive_bitset)

View File

@@ -667,12 +667,15 @@ mod bench {
.read_postings(&TERM_D, IndexRecordOption::Basic)
.unwrap()
.unwrap();
let mut intersection = Intersection::new(vec![
segment_postings_a,
segment_postings_b,
segment_postings_c,
segment_postings_d,
]);
let mut intersection = Intersection::new(
vec![
segment_postings_a,
segment_postings_b,
segment_postings_c,
segment_postings_d,
],
reader.searcher().num_docs() as u32,
);
while intersection.advance() != TERMINATED {}
});
}

View File

@@ -58,6 +58,15 @@ impl DocSet for AllScorer {
self.doc
}
fn seek(&mut self, target: DocId) -> DocId {
debug_assert!(target >= self.doc);
self.doc = target;
if self.doc >= self.max_doc {
self.doc = TERMINATED;
}
self.doc
}
fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize {
if self.doc() == TERMINATED {
return 0;

View File

@@ -367,10 +367,14 @@ mod tests {
checkpoints
}
fn compute_checkpoints_manual(term_scorers: Vec<TermScorer>, n: usize) -> Vec<(DocId, Score)> {
fn compute_checkpoints_manual(
term_scorers: Vec<TermScorer>,
n: usize,
max_doc: u32,
) -> Vec<(DocId, Score)> {
let mut heap: BinaryHeap<Float> = BinaryHeap::with_capacity(n);
let mut checkpoints: Vec<(DocId, Score)> = Vec::new();
let mut scorer = BufferedUnionScorer::build(term_scorers, SumCombiner::default);
let mut scorer = BufferedUnionScorer::build(term_scorers, SumCombiner::default, max_doc);
let mut limit = Score::MIN;
loop {
@@ -478,7 +482,8 @@ mod tests {
for top_k in 1..4 {
let checkpoints_for_each_pruning =
compute_checkpoints_for_each_pruning(term_scorers.clone(), top_k);
let checkpoints_manual = compute_checkpoints_manual(term_scorers.clone(), top_k);
let checkpoints_manual =
compute_checkpoints_manual(term_scorers.clone(), top_k, max_doc as u32);
assert_eq!(checkpoints_for_each_pruning.len(), checkpoints_manual.len());
for (&(left_doc, left_score), &(right_doc, right_score)) in checkpoints_for_each_pruning
.iter()

View File

@@ -42,6 +42,7 @@ where
fn scorer_union<TScoreCombiner>(
scorers: Vec<Box<dyn Scorer>>,
score_combiner_fn: impl Fn() -> TScoreCombiner,
num_docs: u32,
) -> SpecializedScorer
where
TScoreCombiner: ScoreCombiner,
@@ -68,6 +69,7 @@ where
return SpecializedScorer::Other(Box::new(BufferedUnionScorer::build(
scorers,
score_combiner_fn,
num_docs,
)));
}
}
@@ -75,16 +77,19 @@ where
SpecializedScorer::Other(Box::new(BufferedUnionScorer::build(
scorers,
score_combiner_fn,
num_docs,
)))
}
fn into_box_scorer<TScoreCombiner: ScoreCombiner>(
scorer: SpecializedScorer,
score_combiner_fn: impl Fn() -> TScoreCombiner,
num_docs: u32,
) -> Box<dyn Scorer> {
match scorer {
SpecializedScorer::TermUnion(term_scorers) => {
let union_scorer = BufferedUnionScorer::build(term_scorers, score_combiner_fn);
let union_scorer =
BufferedUnionScorer::build(term_scorers, score_combiner_fn, num_docs);
Box::new(union_scorer)
}
SpecializedScorer::Other(scorer) => scorer,
@@ -151,6 +156,7 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
boost: Score,
score_combiner_fn: impl Fn() -> TComplexScoreCombiner,
) -> crate::Result<SpecializedScorer> {
let num_docs = reader.num_docs();
let mut per_occur_scorers = self.per_occur_scorers(reader, boost)?;
// Indicate how should clauses are combined with other clauses.
enum CombinationMethod {
@@ -169,7 +175,7 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
match self.minimum_number_should_match {
0 => CombinationMethod::Optional(scorer_union(should_scorers, &score_combiner_fn)),
1 => {
let scorer_union = scorer_union(should_scorers, &score_combiner_fn);
let scorer_union = scorer_union(should_scorers, &score_combiner_fn, num_docs);
CombinationMethod::Required(scorer_union)
}
n if num_of_should_scorers == n => {
@@ -200,21 +206,21 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
};
let exclude_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers
.remove(&Occur::MustNot)
.map(|scorers| scorer_union(scorers, DoNothingCombiner::default))
.map(|scorers| scorer_union(scorers, DoNothingCombiner::default, num_docs))
.map(|specialized_scorer: SpecializedScorer| {
into_box_scorer(specialized_scorer, DoNothingCombiner::default)
into_box_scorer(specialized_scorer, DoNothingCombiner::default, num_docs)
});
let positive_scorer = match (should_opt, must_scorers) {
(CombinationMethod::Ignored, Some(must_scorers)) => {
SpecializedScorer::Other(intersect_scorers(must_scorers))
SpecializedScorer::Other(intersect_scorers(must_scorers, num_docs))
}
(CombinationMethod::Optional(should_scorer), Some(must_scorers)) => {
let must_scorer = intersect_scorers(must_scorers);
let must_scorer = intersect_scorers(must_scorers, num_docs);
if self.scoring_enabled {
SpecializedScorer::Other(Box::new(
RequiredOptionalScorer::<_, _, TScoreCombiner>::new(
must_scorer,
into_box_scorer(should_scorer, &score_combiner_fn),
into_box_scorer(should_scorer, &score_combiner_fn, num_docs),
),
))
} else {
@@ -223,7 +229,7 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
}
(CombinationMethod::Required(should_scorer), Some(mut must_scorers)) => {
must_scorers.push(into_box_scorer(should_scorer, &score_combiner_fn));
SpecializedScorer::Other(intersect_scorers(must_scorers))
SpecializedScorer::Other(intersect_scorers(must_scorers, num_docs))
}
(CombinationMethod::Ignored, None) => {
return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)))
@@ -233,7 +239,8 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
(CombinationMethod::Optional(should_scorer), None) => should_scorer,
};
if let Some(exclude_scorer) = exclude_scorer_opt {
let positive_scorer_boxed = into_box_scorer(positive_scorer, &score_combiner_fn);
let positive_scorer_boxed =
into_box_scorer(positive_scorer, &score_combiner_fn, num_docs);
Ok(SpecializedScorer::Other(Box::new(Exclude::new(
positive_scorer_boxed,
exclude_scorer,
@@ -246,6 +253,7 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombiner> {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let num_docs = reader.num_docs();
if self.weights.is_empty() {
Ok(Box::new(EmptyScorer))
} else if self.weights.len() == 1 {
@@ -258,12 +266,12 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
} else if self.scoring_enabled {
self.complex_scorer(reader, boost, &self.score_combiner_fn)
.map(|specialized_scorer| {
into_box_scorer(specialized_scorer, &self.score_combiner_fn)
into_box_scorer(specialized_scorer, &self.score_combiner_fn, num_docs)
})
} else {
self.complex_scorer(reader, boost, DoNothingCombiner::default)
.map(|specialized_scorer| {
into_box_scorer(specialized_scorer, DoNothingCombiner::default)
into_box_scorer(specialized_scorer, DoNothingCombiner::default, num_docs)
})
}
}
@@ -293,11 +301,12 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
reader: &SegmentReader,
callback: &mut dyn FnMut(DocId, Score),
) -> crate::Result<()> {
let num_docs = reader.num_docs();
let scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn)?;
match scorer {
SpecializedScorer::TermUnion(term_scorers) => {
let mut union_scorer =
BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn);
BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn, num_docs);
for_each_scorer(&mut union_scorer, callback);
}
SpecializedScorer::Other(mut scorer) => {
@@ -312,13 +321,14 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
reader: &SegmentReader,
callback: &mut dyn FnMut(&[DocId]),
) -> crate::Result<()> {
let num_docs = reader.num_docs();
let scorer = self.complex_scorer(reader, 1.0, || DoNothingCombiner)?;
let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN];
match scorer {
SpecializedScorer::TermUnion(term_scorers) => {
let mut union_scorer =
BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn);
BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn, num_docs);
for_each_docset_buffered(&mut union_scorer, &mut buffer, callback);
}
SpecializedScorer::Other(mut scorer) => {

View File

@@ -104,6 +104,9 @@ impl<S: Scorer> DocSet for BoostScorer<S> {
fn seek(&mut self, target: DocId) -> DocId {
self.underlying.seek(target)
}
fn seek_exact(&mut self, target: DocId) -> bool {
self.underlying.seek_exact(target)
}
fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize {
self.underlying.fill_buffer(buffer)
@@ -117,6 +120,10 @@ impl<S: Scorer> DocSet for BoostScorer<S> {
self.underlying.size_hint()
}
fn cost(&self) -> u64 {
self.underlying.cost()
}
fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
self.underlying.count(alive_bitset)
}

View File

@@ -130,6 +130,10 @@ impl<TDocSet: DocSet> DocSet for ConstScorer<TDocSet> {
fn size_hint(&self) -> u32 {
self.docset.size_hint()
}
fn cost(&self) -> u64 {
self.docset.cost()
}
}
impl<TDocSet: DocSet + 'static> Scorer for ConstScorer<TDocSet> {

View File

@@ -62,6 +62,16 @@ impl<T: Scorer> DocSet for ScorerWrapper<T> {
self.current_doc = doc_id;
doc_id
}
fn seek(&mut self, target: DocId) -> DocId {
let doc_id = self.scorer.seek(target);
self.current_doc = doc_id;
doc_id
}
fn seek_exact(&mut self, target: DocId) -> bool {
let found = self.scorer.seek_exact(target);
self.current_doc = self.scorer.doc();
found
}
fn doc(&self) -> DocId {
self.current_doc
@@ -70,6 +80,10 @@ impl<T: Scorer> DocSet for ScorerWrapper<T> {
fn size_hint(&self) -> u32 {
self.scorer.size_hint()
}
fn cost(&self) -> u64 {
self.scorer.cost()
}
}
impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Disjunction<TScorer, TScoreCombiner> {
@@ -146,6 +160,14 @@ impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> DocSet
.max()
.unwrap_or(0u32)
}
fn cost(&self) -> u64 {
self.chains
.iter()
.map(|docset| docset.cost())
.max()
.unwrap_or(0u64)
}
}
impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Scorer

View File

@@ -1,3 +1,4 @@
use super::size_hint::estimate_intersection;
use crate::docset::{DocSet, TERMINATED};
use crate::query::term_query::TermScorer;
use crate::query::{EmptyScorer, Scorer};
@@ -11,14 +12,14 @@ use crate::{DocId, Score};
/// For better performance, the function uses a
/// specialized implementation if the two
/// shortest scorers are `TermScorer`s.
pub fn intersect_scorers(mut scorers: Vec<Box<dyn Scorer>>) -> Box<dyn Scorer> {
pub fn intersect_scorers(mut scorers: Vec<Box<dyn Scorer>>, num_docs: u32) -> Box<dyn Scorer> {
if scorers.is_empty() {
return Box::new(EmptyScorer);
}
if scorers.len() == 1 {
return scorers.pop().unwrap();
}
scorers.sort_by_key(|scorer| scorer.size_hint());
scorers.sort_by_key(|scorer| scorer.cost());
let doc = go_to_first_doc(&mut scorers[..]);
if doc == TERMINATED {
return Box::new(EmptyScorer);
@@ -34,12 +35,14 @@ pub fn intersect_scorers(mut scorers: Vec<Box<dyn Scorer>>) -> Box<dyn Scorer> {
left: *(left.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
right: *(right.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
others: scorers,
num_docs,
});
}
Box::new(Intersection {
left,
right,
others: scorers,
num_docs,
})
}
@@ -48,6 +51,7 @@ pub struct Intersection<TDocSet: DocSet, TOtherDocSet: DocSet = Box<dyn Scorer>>
left: TDocSet,
right: TDocSet,
others: Vec<TOtherDocSet>,
num_docs: u32,
}
fn go_to_first_doc<TDocSet: DocSet>(docsets: &mut [TDocSet]) -> DocId {
@@ -66,7 +70,7 @@ fn go_to_first_doc<TDocSet: DocSet>(docsets: &mut [TDocSet]) -> DocId {
}
impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
pub(crate) fn new(mut docsets: Vec<TDocSet>) -> Intersection<TDocSet, TDocSet> {
pub(crate) fn new(mut docsets: Vec<TDocSet>, num_docs: u32) -> Intersection<TDocSet, TDocSet> {
let num_docsets = docsets.len();
assert!(num_docsets >= 2);
docsets.sort_by_key(|docset| docset.size_hint());
@@ -77,6 +81,7 @@ impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
left,
right,
others: docsets,
num_docs,
}
}
}
@@ -95,32 +100,39 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
fn advance(&mut self) -> DocId {
let (left, right) = (&mut self.left, &mut self.right);
let mut candidate = left.advance();
if candidate == TERMINATED {
return TERMINATED;
}
'outer: loop {
loop {
// In the first part we look for a document in the intersection
// of the two rarest `DocSet` in the intersection.
loop {
let right_doc = right.seek(candidate);
candidate = left.seek(right_doc);
if candidate == right_doc {
if right.seek_exact(candidate) {
break;
}
// `left.advance().max(right.doc())` yielded a regression in the search game
// benchmark It may make sense in certain scenarios though.
candidate = left.advance();
if candidate == TERMINATED {
return TERMINATED;
}
}
debug_assert_eq!(left.doc(), right.doc());
// test the remaining scorers;
for docset in self.others.iter_mut() {
let seek_doc = docset.seek(candidate);
if seek_doc > candidate {
candidate = left.seek(seek_doc);
continue 'outer;
}
// test the remaining scorers
if self
.others
.iter_mut()
.all(|docset| docset.seek_exact(candidate))
{
debug_assert_eq!(candidate, self.left.doc());
debug_assert_eq!(candidate, self.right.doc());
debug_assert!(self.others.iter().all(|docset| docset.doc() == candidate));
return candidate;
}
debug_assert_eq!(candidate, self.left.doc());
debug_assert_eq!(candidate, self.right.doc());
debug_assert!(self.others.iter().all(|docset| docset.doc() == candidate));
return candidate;
candidate = left.advance();
}
}
@@ -136,12 +148,37 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
doc
}
/// Seeks to the target if necessary and checks if the target is an exact match.
///
/// Some implementations may choose to advance past the target if beneficial for performance.
/// The return value is `true` if the target is in the docset, and `false` otherwise.
fn seek_exact(&mut self, target: DocId) -> bool {
self.left.seek_exact(target)
&& self.right.seek_exact(target)
&& self
.others
.iter_mut()
.all(|docset| docset.seek_exact(target))
}
fn doc(&self) -> DocId {
self.left.doc()
}
fn size_hint(&self) -> u32 {
self.left.size_hint()
estimate_intersection(
[self.left.size_hint(), self.right.size_hint()]
.into_iter()
.chain(self.others.iter().map(DocSet::size_hint)),
self.num_docs,
)
}
fn cost(&self) -> u64 {
// What's the best way to compute the cost of an intersection?
// For now we take the cost of the docset driver, which is the first docset.
// If there are docsets that are bad at skipping, they should also influence the cost.
self.left.cost()
}
}
@@ -159,6 +196,8 @@ where
#[cfg(test)]
mod tests {
use proptest::prelude::*;
use super::Intersection;
use crate::docset::{DocSet, TERMINATED};
use crate::postings::tests::test_skip_against_unoptimized;
@@ -169,7 +208,7 @@ mod tests {
{
let left = VecDocSet::from(vec![1, 3, 9]);
let right = VecDocSet::from(vec![3, 4, 9, 18]);
let mut intersection = Intersection::new(vec![left, right]);
let mut intersection = Intersection::new(vec![left, right], 10);
assert_eq!(intersection.doc(), 3);
assert_eq!(intersection.advance(), 9);
assert_eq!(intersection.doc(), 9);
@@ -179,7 +218,7 @@ mod tests {
let a = VecDocSet::from(vec![1, 3, 9]);
let b = VecDocSet::from(vec![3, 4, 9, 18]);
let c = VecDocSet::from(vec![1, 5, 9, 111]);
let mut intersection = Intersection::new(vec![a, b, c]);
let mut intersection = Intersection::new(vec![a, b, c], 10);
assert_eq!(intersection.doc(), 9);
assert_eq!(intersection.advance(), TERMINATED);
}
@@ -189,7 +228,7 @@ mod tests {
fn test_intersection_zero() {
let left = VecDocSet::from(vec![0]);
let right = VecDocSet::from(vec![0]);
let mut intersection = Intersection::new(vec![left, right]);
let mut intersection = Intersection::new(vec![left, right], 10);
assert_eq!(intersection.doc(), 0);
assert_eq!(intersection.advance(), TERMINATED);
}
@@ -198,7 +237,7 @@ mod tests {
fn test_intersection_skip() {
let left = VecDocSet::from(vec![0, 1, 2, 4]);
let right = VecDocSet::from(vec![2, 5]);
let mut intersection = Intersection::new(vec![left, right]);
let mut intersection = Intersection::new(vec![left, right], 10);
assert_eq!(intersection.seek(2), 2);
assert_eq!(intersection.doc(), 2);
}
@@ -209,7 +248,7 @@ mod tests {
|| {
let left = VecDocSet::from(vec![4]);
let right = VecDocSet::from(vec![2, 5]);
Box::new(Intersection::new(vec![left, right]))
Box::new(Intersection::new(vec![left, right], 10))
},
vec![0, 2, 4, 5, 6],
);
@@ -219,19 +258,22 @@ mod tests {
let mut right = VecDocSet::from(vec![2, 5, 10]);
left.advance();
right.advance();
Box::new(Intersection::new(vec![left, right]))
Box::new(Intersection::new(vec![left, right], 10))
},
vec![0, 1, 2, 3, 4, 5, 6, 7, 10, 11],
);
test_skip_against_unoptimized(
|| {
Box::new(Intersection::new(vec![
VecDocSet::from(vec![1, 4, 5, 6]),
VecDocSet::from(vec![1, 2, 5, 6]),
VecDocSet::from(vec![1, 4, 5, 6]),
VecDocSet::from(vec![1, 5, 6]),
VecDocSet::from(vec![2, 4, 5, 7, 8]),
]))
Box::new(Intersection::new(
vec![
VecDocSet::from(vec![1, 4, 5, 6]),
VecDocSet::from(vec![1, 2, 5, 6]),
VecDocSet::from(vec![1, 4, 5, 6]),
VecDocSet::from(vec![1, 5, 6]),
VecDocSet::from(vec![2, 4, 5, 7, 8]),
],
10,
))
},
vec![0, 1, 2, 3, 4, 5, 6, 7, 10, 11],
);
@@ -242,7 +284,41 @@ mod tests {
let a = VecDocSet::from(vec![1, 3]);
let b = VecDocSet::from(vec![1, 4]);
let c = VecDocSet::from(vec![3, 9]);
let intersection = Intersection::new(vec![a, b, c]);
let intersection = Intersection::new(vec![a, b, c], 10);
assert_eq!(intersection.doc(), TERMINATED);
}
// Strategy to generate sorted and deduplicated vectors of u32 document IDs
fn sorted_deduped_vec(max_val: u32, max_size: usize) -> impl Strategy<Value = Vec<u32>> {
prop::collection::vec(0..max_val, 0..max_size).prop_map(|mut vec| {
vec.sort();
vec.dedup();
vec
})
}
proptest! {
#[test]
fn prop_test_intersection_consistency(
a in sorted_deduped_vec(100, 10),
b in sorted_deduped_vec(100, 10),
num_docs in 100u32..500u32
) {
let left = VecDocSet::from(a.clone());
let right = VecDocSet::from(b.clone());
let mut intersection = Intersection::new(vec![left, right], num_docs);
let expected: Vec<u32> = a.iter()
.cloned()
.filter(|doc| b.contains(doc))
.collect();
for expected_doc in expected {
assert_eq!(intersection.doc(), expected_doc);
intersection.advance();
}
assert_eq!(intersection.doc(), TERMINATED);
}
}
}

View File

@@ -23,6 +23,7 @@ mod regex_query;
mod reqopt_scorer;
mod scorer;
mod set_query;
mod size_hint;
mod term_query;
mod union;
mod weight;

View File

@@ -193,6 +193,14 @@ impl<TPostings: Postings> DocSet for PhrasePrefixScorer<TPostings> {
self.advance()
}
fn seek_exact(&mut self, target: DocId) -> bool {
if self.phrase_scorer.seek_exact(target) {
self.matches_prefix()
} else {
false
}
}
fn doc(&self) -> DocId {
self.phrase_scorer.doc()
}
@@ -200,6 +208,10 @@ impl<TPostings: Postings> DocSet for PhrasePrefixScorer<TPostings> {
fn size_hint(&self) -> u32 {
self.phrase_scorer.size_hint()
}
fn cost(&self) -> u64 {
self.phrase_scorer.cost()
}
}
impl<TPostings: Postings> Scorer for PhrasePrefixScorer<TPostings> {

View File

@@ -368,6 +368,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
slop: u32,
offset: usize,
) -> PhraseScorer<TPostings> {
let num_docs = fieldnorm_reader.num_docs();
let max_offset = term_postings_with_offset
.iter()
.map(|&(offset, _)| offset)
@@ -381,8 +382,9 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
PostingsWithOffset::new(postings, (max_offset - offset) as u32)
})
.collect::<Vec<_>>();
let intersection_docset = Intersection::new(postings_with_offsets, num_docs);
let mut scorer = PhraseScorer {
intersection_docset: Intersection::new(postings_with_offsets),
intersection_docset,
num_terms: num_docsets,
left_positions: Vec::with_capacity(100),
right_positions: Vec::with_capacity(100),
@@ -528,12 +530,35 @@ impl<TPostings: Postings> DocSet for PhraseScorer<TPostings> {
self.advance()
}
fn seek_exact(&mut self, target: DocId) -> bool {
debug_assert!(target >= self.doc());
if self.intersection_docset.seek_exact(target) && self.phrase_match() {
return true;
}
false
}
fn doc(&self) -> DocId {
self.intersection_docset.doc()
}
fn size_hint(&self) -> u32 {
self.intersection_docset.size_hint()
// We adjust the intersection estimate, since actual phrase hits are much lower than where
// the all appear.
// The estimate should depend on average field length, e.g. if the field is really short
// a phrase hit is more likely
self.intersection_docset.size_hint() / (10 * self.num_terms as u32)
}
/// Returns a best-effort hint of the
/// cost to drive the docset.
fn cost(&self) -> u64 {
// While determing a potential hit is cheap for phrases, evaluating an actual hit is
// expensive since it requires to load positions for a doc and check if they are next to
// each other.
// So the cost estimation would be the number of times we need to check if a doc is a hit *
// 10 * self.num_terms.
self.intersection_docset.size_hint() as u64 * 10 * self.num_terms as u64
}
}

View File

@@ -81,6 +81,9 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> RangeDocSet<T> {
/// Returns true if more data could be fetched
fn fetch_block(&mut self) {
if self.next_fetch_start >= self.column.num_docs() {
return;
}
const MAX_HORIZON: u32 = 100_000;
while self.loaded_docs.is_empty() {
let finished_to_end = self.fetch_horizon(self.fetch_horizon);
@@ -105,10 +108,10 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> RangeDocSet<T> {
fn fetch_horizon(&mut self, horizon: u32) -> bool {
let mut finished_to_end = false;
let limit = self.column.num_docs();
let mut end = self.next_fetch_start + horizon;
if end >= limit {
end = limit;
let num_docs = self.column.num_docs();
let mut fetch_end = self.next_fetch_start + horizon;
if fetch_end >= num_docs {
fetch_end = num_docs;
finished_to_end = true;
}
@@ -116,7 +119,7 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> RangeDocSet<T> {
let doc_buffer: &mut Vec<DocId> = self.loaded_docs.get_cleared_data();
self.column.get_docids_for_value_range(
self.value_range.clone(),
self.next_fetch_start..end,
self.next_fetch_start..fetch_end,
doc_buffer,
);
if let Some(last_doc) = last_doc {
@@ -124,7 +127,7 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> RangeDocSet<T> {
self.loaded_docs.next();
}
}
self.next_fetch_start = end;
self.next_fetch_start = fetch_end;
finished_to_end
}
@@ -136,9 +139,6 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
if let Some(docid) = self.loaded_docs.next() {
return docid;
}
if self.next_fetch_start >= self.column.num_docs() {
return TERMINATED;
}
self.fetch_block();
self.loaded_docs.current().unwrap_or(TERMINATED)
}
@@ -174,7 +174,18 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
}
fn size_hint(&self) -> u32 {
self.column.num_docs()
// TODO: Implement a better size hint
self.column.num_docs() / 10
}
/// Returns a best-effort hint of the
/// cost to drive the docset.
fn cost(&self) -> u64 {
// Advancing the docset is pretty expensive since it scans the whole column, there is no
// index currently (will change with an kd-tree)
// Since we use SIMD to scan the fast field range query we lower the cost a little bit.
// Ideally this would take the fast field codec into account
(self.column.num_docs() as f64 * 0.8) as u64
}
}

View File

@@ -1574,449 +1574,3 @@ pub(crate) mod ip_range_tests {
Ok(())
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use test::Bencher;
use super::tests::*;
use super::*;
use crate::collector::Count;
use crate::query::QueryParser;
use crate::Index;
fn get_index_0_to_100() -> Index {
let mut rng = StdRng::from_seed([1u8; 32]);
let num_vals = 100_000;
let docs: Vec<_> = (0..num_vals)
.map(|_i| {
let id_name = if rng.gen_bool(0.01) {
"veryfew".to_string() // 1%
} else if rng.gen_bool(0.1) {
"few".to_string() // 9%
} else {
"many".to_string() // 90%
};
Doc {
id_name,
id: rng.gen_range(0..100),
}
})
.collect();
create_index_from_docs(&docs, false)
}
fn get_90_percent() -> RangeInclusive<u64> {
0..=90
}
fn get_10_percent() -> RangeInclusive<u64> {
0..=10
}
fn get_1_percent() -> RangeInclusive<u64> {
10..=10
}
fn execute_query(
field: &str,
id_range: RangeInclusive<u64>,
suffix: &str,
index: &Index,
) -> usize {
let gen_query_inclusive = |from: &u64, to: &u64| {
format!(
"{}:[{} TO {}] {}",
field,
&from.to_string(),
&to.to_string(),
suffix
)
};
let query = gen_query_inclusive(id_range.start(), id_range.end());
let query_from_text = |text: &str| {
QueryParser::for_index(index, vec![])
.parse_query(text)
.unwrap()
};
let query = query_from_text(&query);
let reader = index.reader().unwrap();
let searcher = reader.searcher();
searcher.search(&query, &(Count)).unwrap()
}
#[bench]
fn bench_id_range_hit_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("id", get_90_percent(), "", &index));
}
#[bench]
fn bench_id_range_hit_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("id", get_10_percent(), "", &index));
}
#[bench]
fn bench_id_range_hit_1_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("id", get_1_percent(), "", &index));
}
#[bench]
fn bench_id_range_hit_10_percent_intersect_with_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("id", get_10_percent(), "AND id_name:few", &index));
}
#[bench]
fn bench_id_range_hit_1_percent_intersect_with_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("id", get_1_percent(), "AND id_name:few", &index));
}
#[bench]
fn bench_id_range_hit_1_percent_intersect_with_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("id", get_1_percent(), "AND id_name:many", &index));
}
#[bench]
fn bench_id_range_hit_1_percent_intersect_with_1_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("id", get_1_percent(), "AND id_name:veryfew", &index));
}
#[bench]
fn bench_id_range_hit_10_percent_intersect_with_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("id", get_10_percent(), "AND id_name:many", &index));
}
#[bench]
fn bench_id_range_hit_90_percent_intersect_with_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("id", get_90_percent(), "AND id_name:many", &index));
}
#[bench]
fn bench_id_range_hit_90_percent_intersect_with_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("id", get_90_percent(), "AND id_name:few", &index));
}
#[bench]
fn bench_id_range_hit_90_percent_intersect_with_1_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("id", get_90_percent(), "AND id_name:veryfew", &index));
}
#[bench]
fn bench_id_range_hit_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ids", get_90_percent(), "", &index));
}
#[bench]
fn bench_id_range_hit_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ids", get_10_percent(), "", &index));
}
#[bench]
fn bench_id_range_hit_1_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ids", get_1_percent(), "", &index));
}
#[bench]
fn bench_id_range_hit_10_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ids", get_10_percent(), "AND id_name:few", &index));
}
#[bench]
fn bench_id_range_hit_1_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ids", get_1_percent(), "AND id_name:few", &index));
}
#[bench]
fn bench_id_range_hit_1_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ids", get_1_percent(), "AND id_name:many", &index));
}
#[bench]
fn bench_id_range_hit_1_percent_intersect_with_1_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ids", get_1_percent(), "AND id_name:veryfew", &index));
}
#[bench]
fn bench_id_range_hit_10_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ids", get_10_percent(), "AND id_name:many", &index));
}
#[bench]
fn bench_id_range_hit_90_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ids", get_90_percent(), "AND id_name:many", &index));
}
#[bench]
fn bench_id_range_hit_90_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ids", get_90_percent(), "AND id_name:few", &index));
}
#[bench]
fn bench_id_range_hit_90_percent_intersect_with_1_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ids", get_90_percent(), "AND id_name:veryfew", &index));
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench_ip {
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use test::Bencher;
use super::ip_range_tests::*;
use super::*;
use crate::collector::Count;
use crate::query::QueryParser;
use crate::Index;
fn get_index_0_to_100() -> Index {
let mut rng = StdRng::from_seed([1u8; 32]);
let num_vals = 100_000;
let docs: Vec<_> = (0..num_vals)
.map(|_i| {
let id = if rng.gen_bool(0.01) {
"veryfew".to_string() // 1%
} else if rng.gen_bool(0.1) {
"few".to_string() // 9%
} else {
"many".to_string() // 90%
};
Doc {
id,
// Multiply by 1000, so that we create many buckets in the compact space
// The benches depend on this range to select n-percent of elements with the
// methods below.
ip: Ipv6Addr::from_u128(rng.gen_range(0..100) * 1000),
}
})
.collect();
create_index_from_ip_docs(&docs)
}
fn get_90_percent() -> RangeInclusive<Ipv6Addr> {
let start = Ipv6Addr::from_u128(0);
let end = Ipv6Addr::from_u128(90 * 1000);
start..=end
}
fn get_10_percent() -> RangeInclusive<Ipv6Addr> {
let start = Ipv6Addr::from_u128(0);
let end = Ipv6Addr::from_u128(10 * 1000);
start..=end
}
fn get_1_percent() -> RangeInclusive<Ipv6Addr> {
let start = Ipv6Addr::from_u128(10 * 1000);
let end = Ipv6Addr::from_u128(10 * 1000);
start..=end
}
fn execute_query(
field: &str,
ip_range: RangeInclusive<Ipv6Addr>,
suffix: &str,
index: &Index,
) -> usize {
let gen_query_inclusive = |from: &Ipv6Addr, to: &Ipv6Addr| {
format!(
"{}:[{} TO {}] {}",
field,
&from.to_string(),
&to.to_string(),
suffix
)
};
let query = gen_query_inclusive(ip_range.start(), ip_range.end());
let query_from_text = |text: &str| {
QueryParser::for_index(index, vec![])
.parse_query(text)
.unwrap()
};
let query = query_from_text(&query);
let reader = index.reader().unwrap();
let searcher = reader.searcher();
searcher.search(&query, &(Count)).unwrap()
}
#[bench]
fn bench_ip_range_hit_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_90_percent(), "", &index));
}
#[bench]
fn bench_ip_range_hit_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_10_percent(), "", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_1_percent(), "", &index));
}
#[bench]
fn bench_ip_range_hit_10_percent_intersect_with_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_10_percent(), "AND id:few", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_1_percent(), "AND id:few", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_1_percent(), "AND id:many", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_1_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_1_percent(), "AND id:veryfew", &index));
}
#[bench]
fn bench_ip_range_hit_10_percent_intersect_with_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_10_percent(), "AND id:many", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_90_percent(), "AND id:many", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_90_percent(), "AND id:few", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_1_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_90_percent(), "AND id:veryfew", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_90_percent(), "", &index));
}
#[bench]
fn bench_ip_range_hit_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_10_percent(), "", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_1_percent(), "", &index));
}
#[bench]
fn bench_ip_range_hit_10_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_10_percent(), "AND id:few", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_1_percent(), "AND id:few", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_1_percent(), "AND id:many", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_1_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_1_percent(), "AND id:veryfew", &index));
}
#[bench]
fn bench_ip_range_hit_10_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_10_percent(), "AND id:many", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_90_percent(), "AND id:many", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_90_percent(), "AND id:few", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_1_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_90_percent(), "AND id:veryfew", &index));
}
}

View File

@@ -56,6 +56,11 @@ where
self.req_scorer.seek(target)
}
fn seek_exact(&mut self, target: DocId) -> bool {
self.score_cache = None;
self.req_scorer.seek_exact(target)
}
fn doc(&self) -> DocId {
self.req_scorer.doc()
}
@@ -63,6 +68,10 @@ where
fn size_hint(&self) -> u32 {
self.req_scorer.size_hint()
}
fn cost(&self) -> u64 {
self.req_scorer.cost()
}
}
impl<TReqScorer, TOptScorer, TScoreCombiner> Scorer

138
src/query/size_hint.rs Normal file
View File

@@ -0,0 +1,138 @@
/// Computes the estimated number of documents in the intersection of multiple docsets
/// given their sizes.
///
/// # Arguments
/// * `docset_sizes` - An iterator over the sizes of the docsets (number of documents in each set).
/// * `max_docs` - The maximum number of docs that can hit, usually number of documents in the
/// segment.
///
/// # Returns
/// The estimated number of documents in the intersection.
pub fn estimate_intersection<I>(mut docset_sizes: I, max_docs: u32) -> u32
where I: Iterator<Item = u32> {
// Terms tend to be not really randomly distributed.
// This factor is used to adjust the estimate.
let mut co_loc_factor: f64 = 1.3;
let mut intersection_estimate = match docset_sizes.next() {
Some(first_size) => first_size as f64,
None => return 0, // No docsets provided, so return 0.
};
let mut smallest_docset_size = intersection_estimate;
// Assuming random distribution of terms, the probability of a document being in the
// intersection
for size in docset_sizes {
// Diminish the co-location factor for each additional set, or we will overestimate.
co_loc_factor = (co_loc_factor - 0.1).max(1.0);
intersection_estimate *= (size as f64 / max_docs as f64) * co_loc_factor;
smallest_docset_size = smallest_docset_size.min(size as f64);
}
intersection_estimate.round().min(smallest_docset_size) as u32
}
/// Computes the estimated number of documents in the union of multiple docsets
/// given their sizes.
///
/// # Arguments
/// * `docset_sizes` - An iterator over the sizes of the docsets (number of documents in each set).
/// * `max_docs` - The maximum number of docs that can hit, usually number of documents in the
/// segment.
///
/// # Returns
/// The estimated number of documents in the union.
pub fn estimate_union<I>(docset_sizes: I, max_docs: u32) -> u32
where I: Iterator<Item = u32> {
// Terms tend to be not really randomly distributed.
// This factor is used to adjust the estimate.
// Unlike intersection, the co-location reduces the estimate.
let co_loc_factor = 0.8;
// The approach for union is to compute the probability of a document not being in any of the
// sets
let mut not_in_any_set_prob = 1.0;
// Assuming random distribution of terms, the probability of a document being in the
// union is the complement of the probability of it not being in any of the sets.
for size in docset_sizes {
let prob_in_set = (size as f64 / max_docs as f64) * co_loc_factor;
not_in_any_set_prob *= 1.0 - prob_in_set;
}
let union_estimate = (max_docs as f64 * (1.0 - not_in_any_set_prob)).round();
union_estimate.min(u32::MAX as f64) as u32
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_estimate_intersection_small1() {
let docset_sizes = &[500, 1000];
let n = 10_000;
let result = estimate_intersection(docset_sizes.iter().copied(), n);
assert_eq!(result, 60);
}
#[test]
fn test_estimate_intersection_small2() {
let docset_sizes = &[500, 1000, 1500];
let n = 10_000;
let result = estimate_intersection(docset_sizes.iter().copied(), n);
assert_eq!(result, 10);
}
#[test]
fn test_estimate_intersection_large_values() {
let docset_sizes = &[100_000, 50_000, 30_000];
let n = 1_000_000;
let result = estimate_intersection(docset_sizes.iter().copied(), n);
assert_eq!(result, 198);
}
#[test]
fn test_estimate_union_small() {
let docset_sizes = &[500, 1000, 1500];
let n = 10000;
let result = estimate_union(docset_sizes.iter().copied(), n);
assert_eq!(result, 2228);
}
#[test]
fn test_estimate_union_large_values() {
let docset_sizes = &[100000, 50000, 30000];
let n = 1000000;
let result = estimate_union(docset_sizes.iter().copied(), n);
assert_eq!(result, 137997);
}
#[test]
fn test_estimate_intersection_large() {
let docset_sizes: Vec<_> = (0..10).map(|_| 4_000_000).collect();
let n = 5_000_000;
let result = estimate_intersection(docset_sizes.iter().copied(), n);
// Check that it doesn't overflow and returns a reasonable result
assert_eq!(result, 708_670);
}
#[test]
fn test_estimate_intersection_overflow_safety() {
let docset_sizes: Vec<_> = (0..100).map(|_| 4_000_000).collect();
let n = 5_000_000;
let result = estimate_intersection(docset_sizes.iter().copied(), n);
// Check that it doesn't overflow and returns a reasonable result
assert_eq!(result, 0);
}
#[test]
fn test_estimate_union_overflow_safety() {
let docset_sizes: Vec<_> = (0..100).map(|_| 1_000_000).collect();
let n = 20_000_000;
let result = estimate_union(docset_sizes.iter().copied(), n);
// Check that it doesn't overflow and returns a reasonable result
assert_eq!(result, 19_662_594);
}
}

View File

@@ -2,6 +2,7 @@ use common::TinySet;
use crate::docset::{DocSet, TERMINATED};
use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
use crate::query::size_hint::estimate_union;
use crate::query::Scorer;
use crate::{DocId, Score};
@@ -50,6 +51,7 @@ pub struct BufferedUnionScorer<TScorer, TScoreCombiner = DoNothingCombiner> {
doc: DocId,
/// Combined score for current `doc` as produced by `TScoreCombiner`.
score: Score,
num_docs: u32,
}
fn refill<TScorer: Scorer, TScoreCombiner: ScoreCombiner>(
@@ -81,6 +83,7 @@ impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> BufferedUnionScorer<TScorer
pub(crate) fn build(
docsets: Vec<TScorer>,
score_combiner_fn: impl FnOnce() -> TScoreCombiner,
num_docs: u32,
) -> BufferedUnionScorer<TScorer, TScoreCombiner> {
let non_empty_docsets: Vec<TScorer> = docsets
.into_iter()
@@ -94,6 +97,7 @@ impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> BufferedUnionScorer<TScorer
window_start_doc: 0,
doc: 0,
score: 0.0,
num_docs,
};
if union.refill() {
union.advance();
@@ -211,20 +215,19 @@ where
}
}
// TODO Also implement `count` with deletes efficiently.
fn doc(&self) -> DocId {
self.doc
}
fn size_hint(&self) -> u32 {
self.docsets
.iter()
.map(|docset| docset.size_hint())
.max()
.unwrap_or(0u32)
estimate_union(self.docsets.iter().map(DocSet::size_hint), self.num_docs)
}
fn cost(&self) -> u64 {
self.docsets.iter().map(DocSet::cost).sum()
}
// TODO Also implement `count` with deletes efficiently.
fn count_including_deleted(&mut self) -> u32 {
if self.doc == TERMINATED {
return 0;

View File

@@ -27,11 +27,17 @@ mod tests {
docs_list.iter().cloned().map(VecDocSet::from)
}
fn union_from_docs_list(docs_list: &[Vec<DocId>]) -> Box<dyn DocSet> {
let max_doc = docs_list
.iter()
.flat_map(|docs| docs.iter().copied())
.max()
.unwrap_or(0);
Box::new(BufferedUnionScorer::build(
vec_doc_set_from_docs_list(docs_list)
.map(|docset| ConstScorer::new(docset, 1.0))
.collect::<Vec<ConstScorer<VecDocSet>>>(),
DoNothingCombiner::default,
max_doc,
))
}
@@ -273,6 +279,7 @@ mod bench {
.map(|docset| ConstScorer::new(docset, 1.0))
.collect::<Vec<_>>(),
DoNothingCombiner::default,
100_000,
);
while v.doc() != TERMINATED {
v.advance();
@@ -294,6 +301,7 @@ mod bench {
.map(|docset| ConstScorer::new(docset, 1.0))
.collect::<Vec<_>>(),
DoNothingCombiner::default,
100_000,
);
while v.doc() != TERMINATED {
v.advance();

View File

@@ -92,6 +92,7 @@ impl<TDocSet: DocSet> DocSet for SimpleUnion<TDocSet> {
}
fn size_hint(&self) -> u32 {
// TODO: use estimate_union
self.docsets
.iter()
.map(|docset| docset.size_hint())
@@ -99,6 +100,10 @@ impl<TDocSet: DocSet> DocSet for SimpleUnion<TDocSet> {
.unwrap_or(0u32)
}
fn cost(&self) -> u64 {
self.docsets.iter().map(|docset| docset.cost()).sum()
}
fn count_including_deleted(&mut self) -> u32 {
if self.doc == TERMINATED {
return 0u32;