mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-23 02:29:57 +00:00
seek_exact + cost based intersection
Adds `seek_exact` and `cost` to `DocSet` for a more efficient intersection. Unlike `seek`, `seek_exact` does not require the DocSet to advance to the next hit, if the target does not exist. `cost` allows to address the different DocSet types and their cost model and is used to determine the DocSet that drives the intersection. E.g. fast field range queries may do a full scan. Phrase queries load the positions to check if a we have a hit. They both have a higher cost than their size_hint would suggest. Improves `size_hint` estimation for intersection and union, by having a estimation based on random distribution with a co-location factor. Refactor range query benchmark. Closes #2531 *Future Work* Implement `seek_exact` for BufferedUnionScorer and RangeDocSet (fast field range queries) Evaluate replacing `seek` with `seek_exact` to reduce code complexity
This commit is contained in:
@@ -74,7 +74,7 @@ fnv = "1.0.7"
|
||||
winapi = "0.3.9"
|
||||
|
||||
[dev-dependencies]
|
||||
binggan = "0.14.0"
|
||||
binggan = "0.14.2"
|
||||
rand = "0.8.5"
|
||||
maplit = "1.0.2"
|
||||
matches = "0.1.9"
|
||||
@@ -172,6 +172,10 @@ harness = false
|
||||
name = "exists_json"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "range_query"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "and_or_queries"
|
||||
harness = false
|
||||
|
||||
260
benches/range_query.rs
Normal file
260
benches/range_query.rs
Normal file
@@ -0,0 +1,260 @@
|
||||
use std::fmt::Display;
|
||||
use std::net::Ipv6Addr;
|
||||
use std::ops::RangeInclusive;
|
||||
|
||||
use binggan::plugins::PeakMemAllocPlugin;
|
||||
use binggan::{black_box, BenchRunner, OutputValue, PeakMemAlloc, INSTRUMENTED_SYSTEM};
|
||||
use columnar::MonotonicallyMappableToU128;
|
||||
use rand::rngs::StdRng;
|
||||
use rand::{Rng, SeedableRng};
|
||||
use tantivy::collector::{Count, TopDocs};
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::{doc, Index};
|
||||
|
||||
#[global_allocator]
|
||||
pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;
|
||||
|
||||
fn main() {
|
||||
bench_range_query();
|
||||
}
|
||||
|
||||
fn bench_range_query() {
|
||||
let index = get_index_0_to_100();
|
||||
let mut runner = BenchRunner::new();
|
||||
runner.add_plugin(PeakMemAllocPlugin::new(GLOBAL));
|
||||
|
||||
runner.set_name("range_query on u64");
|
||||
let field_name_and_descr: Vec<_> = vec![
|
||||
("id", "Single Valued Range Field"),
|
||||
("ids", "Multi Valued Range Field"),
|
||||
];
|
||||
let range_num_hits = vec![
|
||||
("90_percent", get_90_percent()),
|
||||
("10_percent", get_10_percent()),
|
||||
("1_percent", get_1_percent()),
|
||||
];
|
||||
|
||||
test_range(&mut runner, &index, &field_name_and_descr, range_num_hits);
|
||||
|
||||
runner.set_name("range_query on ip");
|
||||
let field_name_and_descr: Vec<_> = vec![
|
||||
("ip", "Single Valued Range Field"),
|
||||
("ips", "Multi Valued Range Field"),
|
||||
];
|
||||
let range_num_hits = vec![
|
||||
("90_percent", get_90_percent_ip()),
|
||||
("10_percent", get_10_percent_ip()),
|
||||
("1_percent", get_1_percent_ip()),
|
||||
];
|
||||
|
||||
test_range(&mut runner, &index, &field_name_and_descr, range_num_hits);
|
||||
}
|
||||
|
||||
fn test_range<T: Display>(
|
||||
runner: &mut BenchRunner,
|
||||
index: &Index,
|
||||
field_name_and_descr: &[(&str, &str)],
|
||||
range_num_hits: Vec<(&str, RangeInclusive<T>)>,
|
||||
) {
|
||||
for (field, suffix) in field_name_and_descr {
|
||||
let term_num_hits = vec![
|
||||
("", ""),
|
||||
("1_percent", "veryfew"),
|
||||
("10_percent", "few"),
|
||||
("90_percent", "most"),
|
||||
];
|
||||
let mut group = runner.new_group();
|
||||
group.set_name(suffix);
|
||||
// all intersect combinations
|
||||
for (range_name, range) in &range_num_hits {
|
||||
for (term_name, term) in &term_num_hits {
|
||||
let index = &index;
|
||||
let test_name = if term_name.is_empty() {
|
||||
format!("id_range_hit_{}", range_name)
|
||||
} else {
|
||||
format!(
|
||||
"id_range_hit_{}_intersect_with_term_{}",
|
||||
range_name, term_name
|
||||
)
|
||||
};
|
||||
group.register(test_name, move |_| {
|
||||
let query = if term_name.is_empty() {
|
||||
"".to_string()
|
||||
} else {
|
||||
format!("AND id_name:{}", term)
|
||||
};
|
||||
black_box(execute_query(field, range, &query, index));
|
||||
});
|
||||
}
|
||||
}
|
||||
group.run();
|
||||
}
|
||||
}
|
||||
|
||||
fn get_index_0_to_100() -> Index {
|
||||
let mut rng = StdRng::from_seed([1u8; 32]);
|
||||
let num_vals = 100_000;
|
||||
let docs: Vec<_> = (0..num_vals)
|
||||
.map(|_i| {
|
||||
let id_name = if rng.gen_bool(0.01) {
|
||||
"veryfew".to_string() // 1%
|
||||
} else if rng.gen_bool(0.1) {
|
||||
"few".to_string() // 9%
|
||||
} else {
|
||||
"most".to_string() // 90%
|
||||
};
|
||||
Doc {
|
||||
id_name,
|
||||
id: rng.gen_range(0..100),
|
||||
// Multiply by 1000, so that we create most buckets in the compact space
|
||||
// The benches depend on this range to select n-percent of elements with the
|
||||
// methods below.
|
||||
ip: Ipv6Addr::from_u128(rng.gen_range(0..100) * 1000),
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
create_index_from_docs(&docs)
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Doc {
|
||||
pub id_name: String,
|
||||
pub id: u64,
|
||||
pub ip: Ipv6Addr,
|
||||
}
|
||||
|
||||
pub fn create_index_from_docs(docs: &[Doc]) -> Index {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let id_u64_field = schema_builder.add_u64_field("id", INDEXED | STORED | FAST);
|
||||
let ids_u64_field =
|
||||
schema_builder.add_u64_field("ids", NumericOptions::default().set_fast().set_indexed());
|
||||
|
||||
let id_f64_field = schema_builder.add_f64_field("id_f64", INDEXED | STORED | FAST);
|
||||
let ids_f64_field = schema_builder.add_f64_field(
|
||||
"ids_f64",
|
||||
NumericOptions::default().set_fast().set_indexed(),
|
||||
);
|
||||
|
||||
let id_i64_field = schema_builder.add_i64_field("id_i64", INDEXED | STORED | FAST);
|
||||
let ids_i64_field = schema_builder.add_i64_field(
|
||||
"ids_i64",
|
||||
NumericOptions::default().set_fast().set_indexed(),
|
||||
);
|
||||
|
||||
let text_field = schema_builder.add_text_field("id_name", STRING | STORED);
|
||||
let text_field2 = schema_builder.add_text_field("id_name_fast", STRING | STORED | FAST);
|
||||
|
||||
let ip_field = schema_builder.add_ip_addr_field("ip", FAST);
|
||||
let ips_field = schema_builder.add_ip_addr_field("ips", FAST);
|
||||
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap();
|
||||
for doc in docs.iter() {
|
||||
index_writer
|
||||
.add_document(doc!(
|
||||
ids_i64_field => doc.id as i64,
|
||||
ids_i64_field => doc.id as i64,
|
||||
ids_f64_field => doc.id as f64,
|
||||
ids_f64_field => doc.id as f64,
|
||||
ids_u64_field => doc.id,
|
||||
ids_u64_field => doc.id,
|
||||
id_u64_field => doc.id,
|
||||
id_f64_field => doc.id as f64,
|
||||
id_i64_field => doc.id as i64,
|
||||
text_field => doc.id_name.to_string(),
|
||||
text_field2 => doc.id_name.to_string(),
|
||||
ips_field => doc.ip,
|
||||
ips_field => doc.ip,
|
||||
ip_field => doc.ip,
|
||||
))
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
index
|
||||
}
|
||||
|
||||
fn get_90_percent() -> RangeInclusive<u64> {
|
||||
0..=90
|
||||
}
|
||||
|
||||
fn get_10_percent() -> RangeInclusive<u64> {
|
||||
0..=10
|
||||
}
|
||||
|
||||
fn get_1_percent() -> RangeInclusive<u64> {
|
||||
10..=10
|
||||
}
|
||||
|
||||
fn get_90_percent_ip() -> RangeInclusive<Ipv6Addr> {
|
||||
let start = Ipv6Addr::from_u128(0);
|
||||
let end = Ipv6Addr::from_u128(90 * 1000);
|
||||
start..=end
|
||||
}
|
||||
|
||||
fn get_10_percent_ip() -> RangeInclusive<Ipv6Addr> {
|
||||
let start = Ipv6Addr::from_u128(0);
|
||||
let end = Ipv6Addr::from_u128(10 * 1000);
|
||||
start..=end
|
||||
}
|
||||
|
||||
fn get_1_percent_ip() -> RangeInclusive<Ipv6Addr> {
|
||||
let start = Ipv6Addr::from_u128(10 * 1000);
|
||||
let end = Ipv6Addr::from_u128(10 * 1000);
|
||||
start..=end
|
||||
}
|
||||
|
||||
struct NumHits {
|
||||
count: usize,
|
||||
}
|
||||
impl OutputValue for NumHits {
|
||||
fn column_title() -> &'static str {
|
||||
"NumHits"
|
||||
}
|
||||
fn format(&self) -> Option<String> {
|
||||
Some(self.count.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
fn execute_query<T: Display>(
|
||||
field: &str,
|
||||
id_range: &RangeInclusive<T>,
|
||||
suffix: &str,
|
||||
index: &Index,
|
||||
) -> NumHits {
|
||||
let gen_query_inclusive = |from: &T, to: &T| {
|
||||
format!(
|
||||
"{}:[{} TO {}] {}",
|
||||
field,
|
||||
&from.to_string(),
|
||||
&to.to_string(),
|
||||
suffix
|
||||
)
|
||||
};
|
||||
|
||||
let query = gen_query_inclusive(id_range.start(), id_range.end());
|
||||
execute_query_(&query, index)
|
||||
}
|
||||
|
||||
fn execute_query_(query: &str, index: &Index) -> NumHits {
|
||||
let query_from_text = |text: &str| {
|
||||
QueryParser::for_index(index, vec![])
|
||||
.parse_query(text)
|
||||
.unwrap()
|
||||
};
|
||||
let query = query_from_text(query);
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let num_hits = searcher
|
||||
.search(&query, &(TopDocs::with_limit(10), Count))
|
||||
.unwrap()
|
||||
.1;
|
||||
NumHits { count: num_hits }
|
||||
}
|
||||
@@ -49,6 +49,25 @@ pub trait DocSet: Send {
|
||||
doc
|
||||
}
|
||||
|
||||
/// Seeks to the target if possible and returns true if the target is in the DocSet.
|
||||
///
|
||||
/// Implementations may choose to advance past the target if target does not exist.
|
||||
///
|
||||
/// DocSets that already have an efficient `seek` method don't need to implement `seek_exact`.
|
||||
/// All wapper DocSets should forward `seek_exact` to the underlying DocSet.
|
||||
///
|
||||
/// ## API Behaviour
|
||||
/// If `seek_exact` is returning true, a call to `doc()` has to return target.
|
||||
/// If `seek_exact` is returning false, a call to `doc()` may return the previous doc,
|
||||
/// which may be lower than target.
|
||||
fn seek_exact(&mut self, target: DocId) -> bool {
|
||||
let current_doc = self.doc();
|
||||
if current_doc < target {
|
||||
self.seek(target);
|
||||
}
|
||||
self.doc() == target
|
||||
}
|
||||
|
||||
/// Fills a given mutable buffer with the next doc ids from the
|
||||
/// `DocSet`
|
||||
///
|
||||
@@ -87,6 +106,23 @@ pub trait DocSet: Send {
|
||||
/// length of the docset.
|
||||
fn size_hint(&self) -> u32;
|
||||
|
||||
/// Returns a best-effort hint of the
|
||||
/// cost to drive the docset.
|
||||
///
|
||||
/// By default this returns `size_hint()`.
|
||||
///
|
||||
/// DocSets may have vastly different cost depending on their type,
|
||||
/// e.g. an intersection with 10 hits is much cheaper than
|
||||
/// a phrase search with 10 hits, since it needs to load positions.
|
||||
///
|
||||
/// ### Future Work
|
||||
/// We may want to differentiate `DocSet` costs more more granular, e.g.
|
||||
/// creation_cost, advance_cost, seek_cost on to get a good estimation
|
||||
/// what query types to choose.
|
||||
fn cost(&self) -> u64 {
|
||||
self.size_hint() as u64
|
||||
}
|
||||
|
||||
/// Returns the number documents matching.
|
||||
/// Calling this method consumes the `DocSet`.
|
||||
fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
|
||||
@@ -126,6 +162,10 @@ impl DocSet for &mut dyn DocSet {
|
||||
(**self).seek(target)
|
||||
}
|
||||
|
||||
fn seek_exact(&mut self, target: DocId) -> bool {
|
||||
(**self).seek_exact(target)
|
||||
}
|
||||
|
||||
fn doc(&self) -> u32 {
|
||||
(**self).doc()
|
||||
}
|
||||
@@ -134,6 +174,10 @@ impl DocSet for &mut dyn DocSet {
|
||||
(**self).size_hint()
|
||||
}
|
||||
|
||||
fn cost(&self) -> u64 {
|
||||
(**self).cost()
|
||||
}
|
||||
|
||||
fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
|
||||
(**self).count(alive_bitset)
|
||||
}
|
||||
@@ -154,6 +198,11 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
|
||||
unboxed.seek(target)
|
||||
}
|
||||
|
||||
fn seek_exact(&mut self, target: DocId) -> bool {
|
||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
||||
unboxed.seek_exact(target)
|
||||
}
|
||||
|
||||
fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize {
|
||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
||||
unboxed.fill_buffer(buffer)
|
||||
@@ -169,6 +218,11 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
|
||||
unboxed.size_hint()
|
||||
}
|
||||
|
||||
fn cost(&self) -> u64 {
|
||||
let unboxed: &TDocSet = self.borrow();
|
||||
unboxed.cost()
|
||||
}
|
||||
|
||||
fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
|
||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
||||
unboxed.count(alive_bitset)
|
||||
|
||||
@@ -667,12 +667,15 @@ mod bench {
|
||||
.read_postings(&TERM_D, IndexRecordOption::Basic)
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
let mut intersection = Intersection::new(vec![
|
||||
segment_postings_a,
|
||||
segment_postings_b,
|
||||
segment_postings_c,
|
||||
segment_postings_d,
|
||||
]);
|
||||
let mut intersection = Intersection::new(
|
||||
vec![
|
||||
segment_postings_a,
|
||||
segment_postings_b,
|
||||
segment_postings_c,
|
||||
segment_postings_d,
|
||||
],
|
||||
reader.searcher().num_docs() as u32,
|
||||
);
|
||||
while intersection.advance() != TERMINATED {}
|
||||
});
|
||||
}
|
||||
|
||||
@@ -58,6 +58,15 @@ impl DocSet for AllScorer {
|
||||
self.doc
|
||||
}
|
||||
|
||||
fn seek(&mut self, target: DocId) -> DocId {
|
||||
debug_assert!(target >= self.doc);
|
||||
self.doc = target;
|
||||
if self.doc >= self.max_doc {
|
||||
self.doc = TERMINATED;
|
||||
}
|
||||
self.doc
|
||||
}
|
||||
|
||||
fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize {
|
||||
if self.doc() == TERMINATED {
|
||||
return 0;
|
||||
|
||||
@@ -367,10 +367,14 @@ mod tests {
|
||||
checkpoints
|
||||
}
|
||||
|
||||
fn compute_checkpoints_manual(term_scorers: Vec<TermScorer>, n: usize) -> Vec<(DocId, Score)> {
|
||||
fn compute_checkpoints_manual(
|
||||
term_scorers: Vec<TermScorer>,
|
||||
n: usize,
|
||||
max_doc: u32,
|
||||
) -> Vec<(DocId, Score)> {
|
||||
let mut heap: BinaryHeap<Float> = BinaryHeap::with_capacity(n);
|
||||
let mut checkpoints: Vec<(DocId, Score)> = Vec::new();
|
||||
let mut scorer = BufferedUnionScorer::build(term_scorers, SumCombiner::default);
|
||||
let mut scorer = BufferedUnionScorer::build(term_scorers, SumCombiner::default, max_doc);
|
||||
|
||||
let mut limit = Score::MIN;
|
||||
loop {
|
||||
@@ -478,7 +482,8 @@ mod tests {
|
||||
for top_k in 1..4 {
|
||||
let checkpoints_for_each_pruning =
|
||||
compute_checkpoints_for_each_pruning(term_scorers.clone(), top_k);
|
||||
let checkpoints_manual = compute_checkpoints_manual(term_scorers.clone(), top_k);
|
||||
let checkpoints_manual =
|
||||
compute_checkpoints_manual(term_scorers.clone(), top_k, max_doc as u32);
|
||||
assert_eq!(checkpoints_for_each_pruning.len(), checkpoints_manual.len());
|
||||
for (&(left_doc, left_score), &(right_doc, right_score)) in checkpoints_for_each_pruning
|
||||
.iter()
|
||||
|
||||
@@ -42,6 +42,7 @@ where
|
||||
fn scorer_union<TScoreCombiner>(
|
||||
scorers: Vec<Box<dyn Scorer>>,
|
||||
score_combiner_fn: impl Fn() -> TScoreCombiner,
|
||||
num_docs: u32,
|
||||
) -> SpecializedScorer
|
||||
where
|
||||
TScoreCombiner: ScoreCombiner,
|
||||
@@ -68,6 +69,7 @@ where
|
||||
return SpecializedScorer::Other(Box::new(BufferedUnionScorer::build(
|
||||
scorers,
|
||||
score_combiner_fn,
|
||||
num_docs,
|
||||
)));
|
||||
}
|
||||
}
|
||||
@@ -75,16 +77,19 @@ where
|
||||
SpecializedScorer::Other(Box::new(BufferedUnionScorer::build(
|
||||
scorers,
|
||||
score_combiner_fn,
|
||||
num_docs,
|
||||
)))
|
||||
}
|
||||
|
||||
fn into_box_scorer<TScoreCombiner: ScoreCombiner>(
|
||||
scorer: SpecializedScorer,
|
||||
score_combiner_fn: impl Fn() -> TScoreCombiner,
|
||||
num_docs: u32,
|
||||
) -> Box<dyn Scorer> {
|
||||
match scorer {
|
||||
SpecializedScorer::TermUnion(term_scorers) => {
|
||||
let union_scorer = BufferedUnionScorer::build(term_scorers, score_combiner_fn);
|
||||
let union_scorer =
|
||||
BufferedUnionScorer::build(term_scorers, score_combiner_fn, num_docs);
|
||||
Box::new(union_scorer)
|
||||
}
|
||||
SpecializedScorer::Other(scorer) => scorer,
|
||||
@@ -151,6 +156,7 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
|
||||
boost: Score,
|
||||
score_combiner_fn: impl Fn() -> TComplexScoreCombiner,
|
||||
) -> crate::Result<SpecializedScorer> {
|
||||
let num_docs = reader.num_docs();
|
||||
let mut per_occur_scorers = self.per_occur_scorers(reader, boost)?;
|
||||
// Indicate how should clauses are combined with other clauses.
|
||||
enum CombinationMethod {
|
||||
@@ -169,7 +175,7 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
|
||||
match self.minimum_number_should_match {
|
||||
0 => CombinationMethod::Optional(scorer_union(should_scorers, &score_combiner_fn)),
|
||||
1 => {
|
||||
let scorer_union = scorer_union(should_scorers, &score_combiner_fn);
|
||||
let scorer_union = scorer_union(should_scorers, &score_combiner_fn, num_docs);
|
||||
CombinationMethod::Required(scorer_union)
|
||||
}
|
||||
n if num_of_should_scorers == n => {
|
||||
@@ -200,21 +206,21 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
|
||||
};
|
||||
let exclude_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers
|
||||
.remove(&Occur::MustNot)
|
||||
.map(|scorers| scorer_union(scorers, DoNothingCombiner::default))
|
||||
.map(|scorers| scorer_union(scorers, DoNothingCombiner::default, num_docs))
|
||||
.map(|specialized_scorer: SpecializedScorer| {
|
||||
into_box_scorer(specialized_scorer, DoNothingCombiner::default)
|
||||
into_box_scorer(specialized_scorer, DoNothingCombiner::default, num_docs)
|
||||
});
|
||||
let positive_scorer = match (should_opt, must_scorers) {
|
||||
(CombinationMethod::Ignored, Some(must_scorers)) => {
|
||||
SpecializedScorer::Other(intersect_scorers(must_scorers))
|
||||
SpecializedScorer::Other(intersect_scorers(must_scorers, num_docs))
|
||||
}
|
||||
(CombinationMethod::Optional(should_scorer), Some(must_scorers)) => {
|
||||
let must_scorer = intersect_scorers(must_scorers);
|
||||
let must_scorer = intersect_scorers(must_scorers, num_docs);
|
||||
if self.scoring_enabled {
|
||||
SpecializedScorer::Other(Box::new(
|
||||
RequiredOptionalScorer::<_, _, TScoreCombiner>::new(
|
||||
must_scorer,
|
||||
into_box_scorer(should_scorer, &score_combiner_fn),
|
||||
into_box_scorer(should_scorer, &score_combiner_fn, num_docs),
|
||||
),
|
||||
))
|
||||
} else {
|
||||
@@ -223,7 +229,7 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
|
||||
}
|
||||
(CombinationMethod::Required(should_scorer), Some(mut must_scorers)) => {
|
||||
must_scorers.push(into_box_scorer(should_scorer, &score_combiner_fn));
|
||||
SpecializedScorer::Other(intersect_scorers(must_scorers))
|
||||
SpecializedScorer::Other(intersect_scorers(must_scorers, num_docs))
|
||||
}
|
||||
(CombinationMethod::Ignored, None) => {
|
||||
return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)))
|
||||
@@ -233,7 +239,8 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
|
||||
(CombinationMethod::Optional(should_scorer), None) => should_scorer,
|
||||
};
|
||||
if let Some(exclude_scorer) = exclude_scorer_opt {
|
||||
let positive_scorer_boxed = into_box_scorer(positive_scorer, &score_combiner_fn);
|
||||
let positive_scorer_boxed =
|
||||
into_box_scorer(positive_scorer, &score_combiner_fn, num_docs);
|
||||
Ok(SpecializedScorer::Other(Box::new(Exclude::new(
|
||||
positive_scorer_boxed,
|
||||
exclude_scorer,
|
||||
@@ -246,6 +253,7 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
|
||||
|
||||
impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombiner> {
|
||||
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
|
||||
let num_docs = reader.num_docs();
|
||||
if self.weights.is_empty() {
|
||||
Ok(Box::new(EmptyScorer))
|
||||
} else if self.weights.len() == 1 {
|
||||
@@ -258,12 +266,12 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
|
||||
} else if self.scoring_enabled {
|
||||
self.complex_scorer(reader, boost, &self.score_combiner_fn)
|
||||
.map(|specialized_scorer| {
|
||||
into_box_scorer(specialized_scorer, &self.score_combiner_fn)
|
||||
into_box_scorer(specialized_scorer, &self.score_combiner_fn, num_docs)
|
||||
})
|
||||
} else {
|
||||
self.complex_scorer(reader, boost, DoNothingCombiner::default)
|
||||
.map(|specialized_scorer| {
|
||||
into_box_scorer(specialized_scorer, DoNothingCombiner::default)
|
||||
into_box_scorer(specialized_scorer, DoNothingCombiner::default, num_docs)
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -293,11 +301,12 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
|
||||
reader: &SegmentReader,
|
||||
callback: &mut dyn FnMut(DocId, Score),
|
||||
) -> crate::Result<()> {
|
||||
let num_docs = reader.num_docs();
|
||||
let scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn)?;
|
||||
match scorer {
|
||||
SpecializedScorer::TermUnion(term_scorers) => {
|
||||
let mut union_scorer =
|
||||
BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn);
|
||||
BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn, num_docs);
|
||||
for_each_scorer(&mut union_scorer, callback);
|
||||
}
|
||||
SpecializedScorer::Other(mut scorer) => {
|
||||
@@ -312,13 +321,14 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
|
||||
reader: &SegmentReader,
|
||||
callback: &mut dyn FnMut(&[DocId]),
|
||||
) -> crate::Result<()> {
|
||||
let num_docs = reader.num_docs();
|
||||
let scorer = self.complex_scorer(reader, 1.0, || DoNothingCombiner)?;
|
||||
let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN];
|
||||
|
||||
match scorer {
|
||||
SpecializedScorer::TermUnion(term_scorers) => {
|
||||
let mut union_scorer =
|
||||
BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn);
|
||||
BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn, num_docs);
|
||||
for_each_docset_buffered(&mut union_scorer, &mut buffer, callback);
|
||||
}
|
||||
SpecializedScorer::Other(mut scorer) => {
|
||||
|
||||
@@ -104,6 +104,9 @@ impl<S: Scorer> DocSet for BoostScorer<S> {
|
||||
fn seek(&mut self, target: DocId) -> DocId {
|
||||
self.underlying.seek(target)
|
||||
}
|
||||
fn seek_exact(&mut self, target: DocId) -> bool {
|
||||
self.underlying.seek_exact(target)
|
||||
}
|
||||
|
||||
fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize {
|
||||
self.underlying.fill_buffer(buffer)
|
||||
@@ -117,6 +120,10 @@ impl<S: Scorer> DocSet for BoostScorer<S> {
|
||||
self.underlying.size_hint()
|
||||
}
|
||||
|
||||
fn cost(&self) -> u64 {
|
||||
self.underlying.cost()
|
||||
}
|
||||
|
||||
fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
|
||||
self.underlying.count(alive_bitset)
|
||||
}
|
||||
|
||||
@@ -130,6 +130,10 @@ impl<TDocSet: DocSet> DocSet for ConstScorer<TDocSet> {
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.docset.size_hint()
|
||||
}
|
||||
|
||||
fn cost(&self) -> u64 {
|
||||
self.docset.cost()
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet + 'static> Scorer for ConstScorer<TDocSet> {
|
||||
|
||||
@@ -62,6 +62,16 @@ impl<T: Scorer> DocSet for ScorerWrapper<T> {
|
||||
self.current_doc = doc_id;
|
||||
doc_id
|
||||
}
|
||||
fn seek(&mut self, target: DocId) -> DocId {
|
||||
let doc_id = self.scorer.seek(target);
|
||||
self.current_doc = doc_id;
|
||||
doc_id
|
||||
}
|
||||
fn seek_exact(&mut self, target: DocId) -> bool {
|
||||
let found = self.scorer.seek_exact(target);
|
||||
self.current_doc = self.scorer.doc();
|
||||
found
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.current_doc
|
||||
@@ -70,6 +80,10 @@ impl<T: Scorer> DocSet for ScorerWrapper<T> {
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.scorer.size_hint()
|
||||
}
|
||||
|
||||
fn cost(&self) -> u64 {
|
||||
self.scorer.cost()
|
||||
}
|
||||
}
|
||||
|
||||
impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Disjunction<TScorer, TScoreCombiner> {
|
||||
@@ -146,6 +160,14 @@ impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> DocSet
|
||||
.max()
|
||||
.unwrap_or(0u32)
|
||||
}
|
||||
|
||||
fn cost(&self) -> u64 {
|
||||
self.chains
|
||||
.iter()
|
||||
.map(|docset| docset.cost())
|
||||
.max()
|
||||
.unwrap_or(0u64)
|
||||
}
|
||||
}
|
||||
|
||||
impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Scorer
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use super::size_hint::estimate_intersection;
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::query::term_query::TermScorer;
|
||||
use crate::query::{EmptyScorer, Scorer};
|
||||
@@ -11,14 +12,14 @@ use crate::{DocId, Score};
|
||||
/// For better performance, the function uses a
|
||||
/// specialized implementation if the two
|
||||
/// shortest scorers are `TermScorer`s.
|
||||
pub fn intersect_scorers(mut scorers: Vec<Box<dyn Scorer>>) -> Box<dyn Scorer> {
|
||||
pub fn intersect_scorers(mut scorers: Vec<Box<dyn Scorer>>, num_docs: u32) -> Box<dyn Scorer> {
|
||||
if scorers.is_empty() {
|
||||
return Box::new(EmptyScorer);
|
||||
}
|
||||
if scorers.len() == 1 {
|
||||
return scorers.pop().unwrap();
|
||||
}
|
||||
scorers.sort_by_key(|scorer| scorer.size_hint());
|
||||
scorers.sort_by_key(|scorer| scorer.cost());
|
||||
let doc = go_to_first_doc(&mut scorers[..]);
|
||||
if doc == TERMINATED {
|
||||
return Box::new(EmptyScorer);
|
||||
@@ -34,12 +35,14 @@ pub fn intersect_scorers(mut scorers: Vec<Box<dyn Scorer>>) -> Box<dyn Scorer> {
|
||||
left: *(left.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
|
||||
right: *(right.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
|
||||
others: scorers,
|
||||
num_docs,
|
||||
});
|
||||
}
|
||||
Box::new(Intersection {
|
||||
left,
|
||||
right,
|
||||
others: scorers,
|
||||
num_docs,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -48,6 +51,7 @@ pub struct Intersection<TDocSet: DocSet, TOtherDocSet: DocSet = Box<dyn Scorer>>
|
||||
left: TDocSet,
|
||||
right: TDocSet,
|
||||
others: Vec<TOtherDocSet>,
|
||||
num_docs: u32,
|
||||
}
|
||||
|
||||
fn go_to_first_doc<TDocSet: DocSet>(docsets: &mut [TDocSet]) -> DocId {
|
||||
@@ -66,7 +70,7 @@ fn go_to_first_doc<TDocSet: DocSet>(docsets: &mut [TDocSet]) -> DocId {
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
|
||||
pub(crate) fn new(mut docsets: Vec<TDocSet>) -> Intersection<TDocSet, TDocSet> {
|
||||
pub(crate) fn new(mut docsets: Vec<TDocSet>, num_docs: u32) -> Intersection<TDocSet, TDocSet> {
|
||||
let num_docsets = docsets.len();
|
||||
assert!(num_docsets >= 2);
|
||||
docsets.sort_by_key(|docset| docset.size_hint());
|
||||
@@ -77,6 +81,7 @@ impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
|
||||
left,
|
||||
right,
|
||||
others: docsets,
|
||||
num_docs,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -95,32 +100,39 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
|
||||
fn advance(&mut self) -> DocId {
|
||||
let (left, right) = (&mut self.left, &mut self.right);
|
||||
let mut candidate = left.advance();
|
||||
if candidate == TERMINATED {
|
||||
return TERMINATED;
|
||||
}
|
||||
|
||||
'outer: loop {
|
||||
loop {
|
||||
// In the first part we look for a document in the intersection
|
||||
// of the two rarest `DocSet` in the intersection.
|
||||
|
||||
loop {
|
||||
let right_doc = right.seek(candidate);
|
||||
candidate = left.seek(right_doc);
|
||||
if candidate == right_doc {
|
||||
if right.seek_exact(candidate) {
|
||||
break;
|
||||
}
|
||||
// `left.advance().max(right.doc())` yielded a regression in the search game
|
||||
// benchmark It may make sense in certain scenarios though.
|
||||
candidate = left.advance();
|
||||
if candidate == TERMINATED {
|
||||
return TERMINATED;
|
||||
}
|
||||
}
|
||||
|
||||
debug_assert_eq!(left.doc(), right.doc());
|
||||
// test the remaining scorers;
|
||||
for docset in self.others.iter_mut() {
|
||||
let seek_doc = docset.seek(candidate);
|
||||
if seek_doc > candidate {
|
||||
candidate = left.seek(seek_doc);
|
||||
continue 'outer;
|
||||
}
|
||||
// test the remaining scorers
|
||||
if self
|
||||
.others
|
||||
.iter_mut()
|
||||
.all(|docset| docset.seek_exact(candidate))
|
||||
{
|
||||
debug_assert_eq!(candidate, self.left.doc());
|
||||
debug_assert_eq!(candidate, self.right.doc());
|
||||
debug_assert!(self.others.iter().all(|docset| docset.doc() == candidate));
|
||||
return candidate;
|
||||
}
|
||||
debug_assert_eq!(candidate, self.left.doc());
|
||||
debug_assert_eq!(candidate, self.right.doc());
|
||||
debug_assert!(self.others.iter().all(|docset| docset.doc() == candidate));
|
||||
return candidate;
|
||||
candidate = left.advance();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -136,12 +148,37 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
|
||||
doc
|
||||
}
|
||||
|
||||
/// Seeks to the target if necessary and checks if the target is an exact match.
|
||||
///
|
||||
/// Some implementations may choose to advance past the target if beneficial for performance.
|
||||
/// The return value is `true` if the target is in the docset, and `false` otherwise.
|
||||
fn seek_exact(&mut self, target: DocId) -> bool {
|
||||
self.left.seek_exact(target)
|
||||
&& self.right.seek_exact(target)
|
||||
&& self
|
||||
.others
|
||||
.iter_mut()
|
||||
.all(|docset| docset.seek_exact(target))
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.left.doc()
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.left.size_hint()
|
||||
estimate_intersection(
|
||||
[self.left.size_hint(), self.right.size_hint()]
|
||||
.into_iter()
|
||||
.chain(self.others.iter().map(DocSet::size_hint)),
|
||||
self.num_docs,
|
||||
)
|
||||
}
|
||||
|
||||
fn cost(&self) -> u64 {
|
||||
// What's the best way to compute the cost of an intersection?
|
||||
// For now we take the cost of the docset driver, which is the first docset.
|
||||
// If there are docsets that are bad at skipping, they should also influence the cost.
|
||||
self.left.cost()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -159,6 +196,8 @@ where
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use proptest::prelude::*;
|
||||
|
||||
use super::Intersection;
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::postings::tests::test_skip_against_unoptimized;
|
||||
@@ -169,7 +208,7 @@ mod tests {
|
||||
{
|
||||
let left = VecDocSet::from(vec![1, 3, 9]);
|
||||
let right = VecDocSet::from(vec![3, 4, 9, 18]);
|
||||
let mut intersection = Intersection::new(vec![left, right]);
|
||||
let mut intersection = Intersection::new(vec![left, right], 10);
|
||||
assert_eq!(intersection.doc(), 3);
|
||||
assert_eq!(intersection.advance(), 9);
|
||||
assert_eq!(intersection.doc(), 9);
|
||||
@@ -179,7 +218,7 @@ mod tests {
|
||||
let a = VecDocSet::from(vec![1, 3, 9]);
|
||||
let b = VecDocSet::from(vec![3, 4, 9, 18]);
|
||||
let c = VecDocSet::from(vec![1, 5, 9, 111]);
|
||||
let mut intersection = Intersection::new(vec![a, b, c]);
|
||||
let mut intersection = Intersection::new(vec![a, b, c], 10);
|
||||
assert_eq!(intersection.doc(), 9);
|
||||
assert_eq!(intersection.advance(), TERMINATED);
|
||||
}
|
||||
@@ -189,7 +228,7 @@ mod tests {
|
||||
fn test_intersection_zero() {
|
||||
let left = VecDocSet::from(vec![0]);
|
||||
let right = VecDocSet::from(vec![0]);
|
||||
let mut intersection = Intersection::new(vec![left, right]);
|
||||
let mut intersection = Intersection::new(vec![left, right], 10);
|
||||
assert_eq!(intersection.doc(), 0);
|
||||
assert_eq!(intersection.advance(), TERMINATED);
|
||||
}
|
||||
@@ -198,7 +237,7 @@ mod tests {
|
||||
fn test_intersection_skip() {
|
||||
let left = VecDocSet::from(vec![0, 1, 2, 4]);
|
||||
let right = VecDocSet::from(vec![2, 5]);
|
||||
let mut intersection = Intersection::new(vec![left, right]);
|
||||
let mut intersection = Intersection::new(vec![left, right], 10);
|
||||
assert_eq!(intersection.seek(2), 2);
|
||||
assert_eq!(intersection.doc(), 2);
|
||||
}
|
||||
@@ -209,7 +248,7 @@ mod tests {
|
||||
|| {
|
||||
let left = VecDocSet::from(vec![4]);
|
||||
let right = VecDocSet::from(vec![2, 5]);
|
||||
Box::new(Intersection::new(vec![left, right]))
|
||||
Box::new(Intersection::new(vec![left, right], 10))
|
||||
},
|
||||
vec![0, 2, 4, 5, 6],
|
||||
);
|
||||
@@ -219,19 +258,22 @@ mod tests {
|
||||
let mut right = VecDocSet::from(vec![2, 5, 10]);
|
||||
left.advance();
|
||||
right.advance();
|
||||
Box::new(Intersection::new(vec![left, right]))
|
||||
Box::new(Intersection::new(vec![left, right], 10))
|
||||
},
|
||||
vec![0, 1, 2, 3, 4, 5, 6, 7, 10, 11],
|
||||
);
|
||||
test_skip_against_unoptimized(
|
||||
|| {
|
||||
Box::new(Intersection::new(vec![
|
||||
VecDocSet::from(vec![1, 4, 5, 6]),
|
||||
VecDocSet::from(vec![1, 2, 5, 6]),
|
||||
VecDocSet::from(vec![1, 4, 5, 6]),
|
||||
VecDocSet::from(vec![1, 5, 6]),
|
||||
VecDocSet::from(vec![2, 4, 5, 7, 8]),
|
||||
]))
|
||||
Box::new(Intersection::new(
|
||||
vec![
|
||||
VecDocSet::from(vec![1, 4, 5, 6]),
|
||||
VecDocSet::from(vec![1, 2, 5, 6]),
|
||||
VecDocSet::from(vec![1, 4, 5, 6]),
|
||||
VecDocSet::from(vec![1, 5, 6]),
|
||||
VecDocSet::from(vec![2, 4, 5, 7, 8]),
|
||||
],
|
||||
10,
|
||||
))
|
||||
},
|
||||
vec![0, 1, 2, 3, 4, 5, 6, 7, 10, 11],
|
||||
);
|
||||
@@ -242,7 +284,41 @@ mod tests {
|
||||
let a = VecDocSet::from(vec![1, 3]);
|
||||
let b = VecDocSet::from(vec![1, 4]);
|
||||
let c = VecDocSet::from(vec![3, 9]);
|
||||
let intersection = Intersection::new(vec![a, b, c]);
|
||||
let intersection = Intersection::new(vec![a, b, c], 10);
|
||||
assert_eq!(intersection.doc(), TERMINATED);
|
||||
}
|
||||
|
||||
// Strategy to generate sorted and deduplicated vectors of u32 document IDs
|
||||
fn sorted_deduped_vec(max_val: u32, max_size: usize) -> impl Strategy<Value = Vec<u32>> {
|
||||
prop::collection::vec(0..max_val, 0..max_size).prop_map(|mut vec| {
|
||||
vec.sort();
|
||||
vec.dedup();
|
||||
vec
|
||||
})
|
||||
}
|
||||
|
||||
proptest! {
|
||||
#[test]
|
||||
fn prop_test_intersection_consistency(
|
||||
a in sorted_deduped_vec(100, 10),
|
||||
b in sorted_deduped_vec(100, 10),
|
||||
num_docs in 100u32..500u32
|
||||
) {
|
||||
let left = VecDocSet::from(a.clone());
|
||||
let right = VecDocSet::from(b.clone());
|
||||
let mut intersection = Intersection::new(vec![left, right], num_docs);
|
||||
|
||||
let expected: Vec<u32> = a.iter()
|
||||
.cloned()
|
||||
.filter(|doc| b.contains(doc))
|
||||
.collect();
|
||||
|
||||
for expected_doc in expected {
|
||||
assert_eq!(intersection.doc(), expected_doc);
|
||||
intersection.advance();
|
||||
}
|
||||
assert_eq!(intersection.doc(), TERMINATED);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -23,6 +23,7 @@ mod regex_query;
|
||||
mod reqopt_scorer;
|
||||
mod scorer;
|
||||
mod set_query;
|
||||
mod size_hint;
|
||||
mod term_query;
|
||||
mod union;
|
||||
mod weight;
|
||||
|
||||
@@ -193,6 +193,14 @@ impl<TPostings: Postings> DocSet for PhrasePrefixScorer<TPostings> {
|
||||
self.advance()
|
||||
}
|
||||
|
||||
fn seek_exact(&mut self, target: DocId) -> bool {
|
||||
if self.phrase_scorer.seek_exact(target) {
|
||||
self.matches_prefix()
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.phrase_scorer.doc()
|
||||
}
|
||||
@@ -200,6 +208,10 @@ impl<TPostings: Postings> DocSet for PhrasePrefixScorer<TPostings> {
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.phrase_scorer.size_hint()
|
||||
}
|
||||
|
||||
fn cost(&self) -> u64 {
|
||||
self.phrase_scorer.cost()
|
||||
}
|
||||
}
|
||||
|
||||
impl<TPostings: Postings> Scorer for PhrasePrefixScorer<TPostings> {
|
||||
|
||||
@@ -368,6 +368,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
|
||||
slop: u32,
|
||||
offset: usize,
|
||||
) -> PhraseScorer<TPostings> {
|
||||
let num_docs = fieldnorm_reader.num_docs();
|
||||
let max_offset = term_postings_with_offset
|
||||
.iter()
|
||||
.map(|&(offset, _)| offset)
|
||||
@@ -381,8 +382,9 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
|
||||
PostingsWithOffset::new(postings, (max_offset - offset) as u32)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
let intersection_docset = Intersection::new(postings_with_offsets, num_docs);
|
||||
let mut scorer = PhraseScorer {
|
||||
intersection_docset: Intersection::new(postings_with_offsets),
|
||||
intersection_docset,
|
||||
num_terms: num_docsets,
|
||||
left_positions: Vec::with_capacity(100),
|
||||
right_positions: Vec::with_capacity(100),
|
||||
@@ -528,12 +530,35 @@ impl<TPostings: Postings> DocSet for PhraseScorer<TPostings> {
|
||||
self.advance()
|
||||
}
|
||||
|
||||
fn seek_exact(&mut self, target: DocId) -> bool {
|
||||
debug_assert!(target >= self.doc());
|
||||
if self.intersection_docset.seek_exact(target) && self.phrase_match() {
|
||||
return true;
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.intersection_docset.doc()
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.intersection_docset.size_hint()
|
||||
// We adjust the intersection estimate, since actual phrase hits are much lower than where
|
||||
// the all appear.
|
||||
// The estimate should depend on average field length, e.g. if the field is really short
|
||||
// a phrase hit is more likely
|
||||
self.intersection_docset.size_hint() / (10 * self.num_terms as u32)
|
||||
}
|
||||
|
||||
/// Returns a best-effort hint of the
|
||||
/// cost to drive the docset.
|
||||
fn cost(&self) -> u64 {
|
||||
// While determing a potential hit is cheap for phrases, evaluating an actual hit is
|
||||
// expensive since it requires to load positions for a doc and check if they are next to
|
||||
// each other.
|
||||
// So the cost estimation would be the number of times we need to check if a doc is a hit *
|
||||
// 10 * self.num_terms.
|
||||
self.intersection_docset.size_hint() as u64 * 10 * self.num_terms as u64
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -81,6 +81,9 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> RangeDocSet<T> {
|
||||
|
||||
/// Returns true if more data could be fetched
|
||||
fn fetch_block(&mut self) {
|
||||
if self.next_fetch_start >= self.column.num_docs() {
|
||||
return;
|
||||
}
|
||||
const MAX_HORIZON: u32 = 100_000;
|
||||
while self.loaded_docs.is_empty() {
|
||||
let finished_to_end = self.fetch_horizon(self.fetch_horizon);
|
||||
@@ -105,10 +108,10 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> RangeDocSet<T> {
|
||||
fn fetch_horizon(&mut self, horizon: u32) -> bool {
|
||||
let mut finished_to_end = false;
|
||||
|
||||
let limit = self.column.num_docs();
|
||||
let mut end = self.next_fetch_start + horizon;
|
||||
if end >= limit {
|
||||
end = limit;
|
||||
let num_docs = self.column.num_docs();
|
||||
let mut fetch_end = self.next_fetch_start + horizon;
|
||||
if fetch_end >= num_docs {
|
||||
fetch_end = num_docs;
|
||||
finished_to_end = true;
|
||||
}
|
||||
|
||||
@@ -116,7 +119,7 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> RangeDocSet<T> {
|
||||
let doc_buffer: &mut Vec<DocId> = self.loaded_docs.get_cleared_data();
|
||||
self.column.get_docids_for_value_range(
|
||||
self.value_range.clone(),
|
||||
self.next_fetch_start..end,
|
||||
self.next_fetch_start..fetch_end,
|
||||
doc_buffer,
|
||||
);
|
||||
if let Some(last_doc) = last_doc {
|
||||
@@ -124,7 +127,7 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> RangeDocSet<T> {
|
||||
self.loaded_docs.next();
|
||||
}
|
||||
}
|
||||
self.next_fetch_start = end;
|
||||
self.next_fetch_start = fetch_end;
|
||||
|
||||
finished_to_end
|
||||
}
|
||||
@@ -136,9 +139,6 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
|
||||
if let Some(docid) = self.loaded_docs.next() {
|
||||
return docid;
|
||||
}
|
||||
if self.next_fetch_start >= self.column.num_docs() {
|
||||
return TERMINATED;
|
||||
}
|
||||
self.fetch_block();
|
||||
self.loaded_docs.current().unwrap_or(TERMINATED)
|
||||
}
|
||||
@@ -174,7 +174,18 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.column.num_docs()
|
||||
// TODO: Implement a better size hint
|
||||
self.column.num_docs() / 10
|
||||
}
|
||||
|
||||
/// Returns a best-effort hint of the
|
||||
/// cost to drive the docset.
|
||||
fn cost(&self) -> u64 {
|
||||
// Advancing the docset is pretty expensive since it scans the whole column, there is no
|
||||
// index currently (will change with an kd-tree)
|
||||
// Since we use SIMD to scan the fast field range query we lower the cost a little bit.
|
||||
// Ideally this would take the fast field codec into account
|
||||
(self.column.num_docs() as f64 * 0.8) as u64
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1574,449 +1574,3 @@ pub(crate) mod ip_range_tests {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use rand::rngs::StdRng;
|
||||
use rand::{Rng, SeedableRng};
|
||||
use test::Bencher;
|
||||
|
||||
use super::tests::*;
|
||||
use super::*;
|
||||
use crate::collector::Count;
|
||||
use crate::query::QueryParser;
|
||||
use crate::Index;
|
||||
|
||||
fn get_index_0_to_100() -> Index {
|
||||
let mut rng = StdRng::from_seed([1u8; 32]);
|
||||
let num_vals = 100_000;
|
||||
let docs: Vec<_> = (0..num_vals)
|
||||
.map(|_i| {
|
||||
let id_name = if rng.gen_bool(0.01) {
|
||||
"veryfew".to_string() // 1%
|
||||
} else if rng.gen_bool(0.1) {
|
||||
"few".to_string() // 9%
|
||||
} else {
|
||||
"many".to_string() // 90%
|
||||
};
|
||||
Doc {
|
||||
id_name,
|
||||
id: rng.gen_range(0..100),
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
create_index_from_docs(&docs, false)
|
||||
}
|
||||
|
||||
fn get_90_percent() -> RangeInclusive<u64> {
|
||||
0..=90
|
||||
}
|
||||
|
||||
fn get_10_percent() -> RangeInclusive<u64> {
|
||||
0..=10
|
||||
}
|
||||
|
||||
fn get_1_percent() -> RangeInclusive<u64> {
|
||||
10..=10
|
||||
}
|
||||
|
||||
fn execute_query(
|
||||
field: &str,
|
||||
id_range: RangeInclusive<u64>,
|
||||
suffix: &str,
|
||||
index: &Index,
|
||||
) -> usize {
|
||||
let gen_query_inclusive = |from: &u64, to: &u64| {
|
||||
format!(
|
||||
"{}:[{} TO {}] {}",
|
||||
field,
|
||||
&from.to_string(),
|
||||
&to.to_string(),
|
||||
suffix
|
||||
)
|
||||
};
|
||||
|
||||
let query = gen_query_inclusive(id_range.start(), id_range.end());
|
||||
let query_from_text = |text: &str| {
|
||||
QueryParser::for_index(index, vec![])
|
||||
.parse_query(text)
|
||||
.unwrap()
|
||||
};
|
||||
let query = query_from_text(&query);
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&query, &(Count)).unwrap()
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_id_range_hit_90_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
bench.iter(|| execute_query("id", get_90_percent(), "", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_id_range_hit_10_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
bench.iter(|| execute_query("id", get_10_percent(), "", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_id_range_hit_1_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
bench.iter(|| execute_query("id", get_1_percent(), "", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_id_range_hit_10_percent_intersect_with_10_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
bench.iter(|| execute_query("id", get_10_percent(), "AND id_name:few", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_id_range_hit_1_percent_intersect_with_10_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
bench.iter(|| execute_query("id", get_1_percent(), "AND id_name:few", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_id_range_hit_1_percent_intersect_with_90_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
bench.iter(|| execute_query("id", get_1_percent(), "AND id_name:many", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_id_range_hit_1_percent_intersect_with_1_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
bench.iter(|| execute_query("id", get_1_percent(), "AND id_name:veryfew", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_id_range_hit_10_percent_intersect_with_90_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
bench.iter(|| execute_query("id", get_10_percent(), "AND id_name:many", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_id_range_hit_90_percent_intersect_with_90_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
bench.iter(|| execute_query("id", get_90_percent(), "AND id_name:many", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_id_range_hit_90_percent_intersect_with_10_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
bench.iter(|| execute_query("id", get_90_percent(), "AND id_name:few", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_id_range_hit_90_percent_intersect_with_1_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
bench.iter(|| execute_query("id", get_90_percent(), "AND id_name:veryfew", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_id_range_hit_90_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
bench.iter(|| execute_query("ids", get_90_percent(), "", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_id_range_hit_10_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
bench.iter(|| execute_query("ids", get_10_percent(), "", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_id_range_hit_1_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
bench.iter(|| execute_query("ids", get_1_percent(), "", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_id_range_hit_10_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
bench.iter(|| execute_query("ids", get_10_percent(), "AND id_name:few", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_id_range_hit_1_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
bench.iter(|| execute_query("ids", get_1_percent(), "AND id_name:few", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_id_range_hit_1_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
bench.iter(|| execute_query("ids", get_1_percent(), "AND id_name:many", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_id_range_hit_1_percent_intersect_with_1_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
bench.iter(|| execute_query("ids", get_1_percent(), "AND id_name:veryfew", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_id_range_hit_10_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
bench.iter(|| execute_query("ids", get_10_percent(), "AND id_name:many", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_id_range_hit_90_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
bench.iter(|| execute_query("ids", get_90_percent(), "AND id_name:many", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_id_range_hit_90_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
bench.iter(|| execute_query("ids", get_90_percent(), "AND id_name:few", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_id_range_hit_90_percent_intersect_with_1_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
bench.iter(|| execute_query("ids", get_90_percent(), "AND id_name:veryfew", &index));
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench_ip {
|
||||
|
||||
use rand::rngs::StdRng;
|
||||
use rand::{Rng, SeedableRng};
|
||||
use test::Bencher;
|
||||
|
||||
use super::ip_range_tests::*;
|
||||
use super::*;
|
||||
use crate::collector::Count;
|
||||
use crate::query::QueryParser;
|
||||
use crate::Index;
|
||||
|
||||
fn get_index_0_to_100() -> Index {
|
||||
let mut rng = StdRng::from_seed([1u8; 32]);
|
||||
let num_vals = 100_000;
|
||||
let docs: Vec<_> = (0..num_vals)
|
||||
.map(|_i| {
|
||||
let id = if rng.gen_bool(0.01) {
|
||||
"veryfew".to_string() // 1%
|
||||
} else if rng.gen_bool(0.1) {
|
||||
"few".to_string() // 9%
|
||||
} else {
|
||||
"many".to_string() // 90%
|
||||
};
|
||||
Doc {
|
||||
id,
|
||||
// Multiply by 1000, so that we create many buckets in the compact space
|
||||
// The benches depend on this range to select n-percent of elements with the
|
||||
// methods below.
|
||||
ip: Ipv6Addr::from_u128(rng.gen_range(0..100) * 1000),
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
create_index_from_ip_docs(&docs)
|
||||
}
|
||||
|
||||
fn get_90_percent() -> RangeInclusive<Ipv6Addr> {
|
||||
let start = Ipv6Addr::from_u128(0);
|
||||
let end = Ipv6Addr::from_u128(90 * 1000);
|
||||
start..=end
|
||||
}
|
||||
|
||||
fn get_10_percent() -> RangeInclusive<Ipv6Addr> {
|
||||
let start = Ipv6Addr::from_u128(0);
|
||||
let end = Ipv6Addr::from_u128(10 * 1000);
|
||||
start..=end
|
||||
}
|
||||
|
||||
fn get_1_percent() -> RangeInclusive<Ipv6Addr> {
|
||||
let start = Ipv6Addr::from_u128(10 * 1000);
|
||||
let end = Ipv6Addr::from_u128(10 * 1000);
|
||||
start..=end
|
||||
}
|
||||
|
||||
fn execute_query(
|
||||
field: &str,
|
||||
ip_range: RangeInclusive<Ipv6Addr>,
|
||||
suffix: &str,
|
||||
index: &Index,
|
||||
) -> usize {
|
||||
let gen_query_inclusive = |from: &Ipv6Addr, to: &Ipv6Addr| {
|
||||
format!(
|
||||
"{}:[{} TO {}] {}",
|
||||
field,
|
||||
&from.to_string(),
|
||||
&to.to_string(),
|
||||
suffix
|
||||
)
|
||||
};
|
||||
|
||||
let query = gen_query_inclusive(ip_range.start(), ip_range.end());
|
||||
let query_from_text = |text: &str| {
|
||||
QueryParser::for_index(index, vec![])
|
||||
.parse_query(text)
|
||||
.unwrap()
|
||||
};
|
||||
let query = query_from_text(&query);
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&query, &(Count)).unwrap()
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| execute_query("ip", get_90_percent(), "", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_10_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| execute_query("ip", get_10_percent(), "", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| execute_query("ip", get_1_percent(), "", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_10_percent_intersect_with_10_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| execute_query("ip", get_10_percent(), "AND id:few", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent_intersect_with_10_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| execute_query("ip", get_1_percent(), "AND id:few", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent_intersect_with_90_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| execute_query("ip", get_1_percent(), "AND id:many", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent_intersect_with_1_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| execute_query("ip", get_1_percent(), "AND id:veryfew", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_10_percent_intersect_with_90_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| execute_query("ip", get_10_percent(), "AND id:many", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent_intersect_with_90_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| execute_query("ip", get_90_percent(), "AND id:many", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent_intersect_with_10_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| execute_query("ip", get_90_percent(), "AND id:few", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent_intersect_with_1_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| execute_query("ip", get_90_percent(), "AND id:veryfew", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| execute_query("ips", get_90_percent(), "", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_10_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| execute_query("ips", get_10_percent(), "", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| execute_query("ips", get_1_percent(), "", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_10_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| execute_query("ips", get_10_percent(), "AND id:few", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| execute_query("ips", get_1_percent(), "AND id:few", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
bench.iter(|| execute_query("ips", get_1_percent(), "AND id:many", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent_intersect_with_1_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| execute_query("ips", get_1_percent(), "AND id:veryfew", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_10_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| execute_query("ips", get_10_percent(), "AND id:many", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| execute_query("ips", get_90_percent(), "AND id:many", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| execute_query("ips", get_90_percent(), "AND id:few", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent_intersect_with_1_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| execute_query("ips", get_90_percent(), "AND id:veryfew", &index));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -56,6 +56,11 @@ where
|
||||
self.req_scorer.seek(target)
|
||||
}
|
||||
|
||||
fn seek_exact(&mut self, target: DocId) -> bool {
|
||||
self.score_cache = None;
|
||||
self.req_scorer.seek_exact(target)
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.req_scorer.doc()
|
||||
}
|
||||
@@ -63,6 +68,10 @@ where
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.req_scorer.size_hint()
|
||||
}
|
||||
|
||||
fn cost(&self) -> u64 {
|
||||
self.req_scorer.cost()
|
||||
}
|
||||
}
|
||||
|
||||
impl<TReqScorer, TOptScorer, TScoreCombiner> Scorer
|
||||
|
||||
138
src/query/size_hint.rs
Normal file
138
src/query/size_hint.rs
Normal file
@@ -0,0 +1,138 @@
|
||||
/// Computes the estimated number of documents in the intersection of multiple docsets
|
||||
/// given their sizes.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `docset_sizes` - An iterator over the sizes of the docsets (number of documents in each set).
|
||||
/// * `max_docs` - The maximum number of docs that can hit, usually number of documents in the
|
||||
/// segment.
|
||||
///
|
||||
/// # Returns
|
||||
/// The estimated number of documents in the intersection.
|
||||
pub fn estimate_intersection<I>(mut docset_sizes: I, max_docs: u32) -> u32
|
||||
where I: Iterator<Item = u32> {
|
||||
// Terms tend to be not really randomly distributed.
|
||||
// This factor is used to adjust the estimate.
|
||||
let mut co_loc_factor: f64 = 1.3;
|
||||
|
||||
let mut intersection_estimate = match docset_sizes.next() {
|
||||
Some(first_size) => first_size as f64,
|
||||
None => return 0, // No docsets provided, so return 0.
|
||||
};
|
||||
|
||||
let mut smallest_docset_size = intersection_estimate;
|
||||
// Assuming random distribution of terms, the probability of a document being in the
|
||||
// intersection
|
||||
for size in docset_sizes {
|
||||
// Diminish the co-location factor for each additional set, or we will overestimate.
|
||||
co_loc_factor = (co_loc_factor - 0.1).max(1.0);
|
||||
intersection_estimate *= (size as f64 / max_docs as f64) * co_loc_factor;
|
||||
smallest_docset_size = smallest_docset_size.min(size as f64);
|
||||
}
|
||||
|
||||
intersection_estimate.round().min(smallest_docset_size) as u32
|
||||
}
|
||||
|
||||
/// Computes the estimated number of documents in the union of multiple docsets
|
||||
/// given their sizes.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `docset_sizes` - An iterator over the sizes of the docsets (number of documents in each set).
|
||||
/// * `max_docs` - The maximum number of docs that can hit, usually number of documents in the
|
||||
/// segment.
|
||||
///
|
||||
/// # Returns
|
||||
/// The estimated number of documents in the union.
|
||||
pub fn estimate_union<I>(docset_sizes: I, max_docs: u32) -> u32
|
||||
where I: Iterator<Item = u32> {
|
||||
// Terms tend to be not really randomly distributed.
|
||||
// This factor is used to adjust the estimate.
|
||||
// Unlike intersection, the co-location reduces the estimate.
|
||||
let co_loc_factor = 0.8;
|
||||
|
||||
// The approach for union is to compute the probability of a document not being in any of the
|
||||
// sets
|
||||
let mut not_in_any_set_prob = 1.0;
|
||||
|
||||
// Assuming random distribution of terms, the probability of a document being in the
|
||||
// union is the complement of the probability of it not being in any of the sets.
|
||||
for size in docset_sizes {
|
||||
let prob_in_set = (size as f64 / max_docs as f64) * co_loc_factor;
|
||||
not_in_any_set_prob *= 1.0 - prob_in_set;
|
||||
}
|
||||
|
||||
let union_estimate = (max_docs as f64 * (1.0 - not_in_any_set_prob)).round();
|
||||
|
||||
union_estimate.min(u32::MAX as f64) as u32
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_estimate_intersection_small1() {
|
||||
let docset_sizes = &[500, 1000];
|
||||
let n = 10_000;
|
||||
let result = estimate_intersection(docset_sizes.iter().copied(), n);
|
||||
assert_eq!(result, 60);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_estimate_intersection_small2() {
|
||||
let docset_sizes = &[500, 1000, 1500];
|
||||
let n = 10_000;
|
||||
let result = estimate_intersection(docset_sizes.iter().copied(), n);
|
||||
assert_eq!(result, 10);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_estimate_intersection_large_values() {
|
||||
let docset_sizes = &[100_000, 50_000, 30_000];
|
||||
let n = 1_000_000;
|
||||
let result = estimate_intersection(docset_sizes.iter().copied(), n);
|
||||
assert_eq!(result, 198);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_estimate_union_small() {
|
||||
let docset_sizes = &[500, 1000, 1500];
|
||||
let n = 10000;
|
||||
let result = estimate_union(docset_sizes.iter().copied(), n);
|
||||
assert_eq!(result, 2228);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_estimate_union_large_values() {
|
||||
let docset_sizes = &[100000, 50000, 30000];
|
||||
let n = 1000000;
|
||||
let result = estimate_union(docset_sizes.iter().copied(), n);
|
||||
assert_eq!(result, 137997);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_estimate_intersection_large() {
|
||||
let docset_sizes: Vec<_> = (0..10).map(|_| 4_000_000).collect();
|
||||
let n = 5_000_000;
|
||||
let result = estimate_intersection(docset_sizes.iter().copied(), n);
|
||||
// Check that it doesn't overflow and returns a reasonable result
|
||||
assert_eq!(result, 708_670);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_estimate_intersection_overflow_safety() {
|
||||
let docset_sizes: Vec<_> = (0..100).map(|_| 4_000_000).collect();
|
||||
let n = 5_000_000;
|
||||
let result = estimate_intersection(docset_sizes.iter().copied(), n);
|
||||
// Check that it doesn't overflow and returns a reasonable result
|
||||
assert_eq!(result, 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_estimate_union_overflow_safety() {
|
||||
let docset_sizes: Vec<_> = (0..100).map(|_| 1_000_000).collect();
|
||||
let n = 20_000_000;
|
||||
let result = estimate_union(docset_sizes.iter().copied(), n);
|
||||
// Check that it doesn't overflow and returns a reasonable result
|
||||
assert_eq!(result, 19_662_594);
|
||||
}
|
||||
}
|
||||
@@ -2,6 +2,7 @@ use common::TinySet;
|
||||
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
|
||||
use crate::query::size_hint::estimate_union;
|
||||
use crate::query::Scorer;
|
||||
use crate::{DocId, Score};
|
||||
|
||||
@@ -50,6 +51,7 @@ pub struct BufferedUnionScorer<TScorer, TScoreCombiner = DoNothingCombiner> {
|
||||
doc: DocId,
|
||||
/// Combined score for current `doc` as produced by `TScoreCombiner`.
|
||||
score: Score,
|
||||
num_docs: u32,
|
||||
}
|
||||
|
||||
fn refill<TScorer: Scorer, TScoreCombiner: ScoreCombiner>(
|
||||
@@ -81,6 +83,7 @@ impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> BufferedUnionScorer<TScorer
|
||||
pub(crate) fn build(
|
||||
docsets: Vec<TScorer>,
|
||||
score_combiner_fn: impl FnOnce() -> TScoreCombiner,
|
||||
num_docs: u32,
|
||||
) -> BufferedUnionScorer<TScorer, TScoreCombiner> {
|
||||
let non_empty_docsets: Vec<TScorer> = docsets
|
||||
.into_iter()
|
||||
@@ -94,6 +97,7 @@ impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> BufferedUnionScorer<TScorer
|
||||
window_start_doc: 0,
|
||||
doc: 0,
|
||||
score: 0.0,
|
||||
num_docs,
|
||||
};
|
||||
if union.refill() {
|
||||
union.advance();
|
||||
@@ -211,20 +215,19 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
// TODO Also implement `count` with deletes efficiently.
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.doc
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.docsets
|
||||
.iter()
|
||||
.map(|docset| docset.size_hint())
|
||||
.max()
|
||||
.unwrap_or(0u32)
|
||||
estimate_union(self.docsets.iter().map(DocSet::size_hint), self.num_docs)
|
||||
}
|
||||
|
||||
fn cost(&self) -> u64 {
|
||||
self.docsets.iter().map(DocSet::cost).sum()
|
||||
}
|
||||
|
||||
// TODO Also implement `count` with deletes efficiently.
|
||||
fn count_including_deleted(&mut self) -> u32 {
|
||||
if self.doc == TERMINATED {
|
||||
return 0;
|
||||
|
||||
@@ -27,11 +27,17 @@ mod tests {
|
||||
docs_list.iter().cloned().map(VecDocSet::from)
|
||||
}
|
||||
fn union_from_docs_list(docs_list: &[Vec<DocId>]) -> Box<dyn DocSet> {
|
||||
let max_doc = docs_list
|
||||
.iter()
|
||||
.flat_map(|docs| docs.iter().copied())
|
||||
.max()
|
||||
.unwrap_or(0);
|
||||
Box::new(BufferedUnionScorer::build(
|
||||
vec_doc_set_from_docs_list(docs_list)
|
||||
.map(|docset| ConstScorer::new(docset, 1.0))
|
||||
.collect::<Vec<ConstScorer<VecDocSet>>>(),
|
||||
DoNothingCombiner::default,
|
||||
max_doc,
|
||||
))
|
||||
}
|
||||
|
||||
@@ -273,6 +279,7 @@ mod bench {
|
||||
.map(|docset| ConstScorer::new(docset, 1.0))
|
||||
.collect::<Vec<_>>(),
|
||||
DoNothingCombiner::default,
|
||||
100_000,
|
||||
);
|
||||
while v.doc() != TERMINATED {
|
||||
v.advance();
|
||||
@@ -294,6 +301,7 @@ mod bench {
|
||||
.map(|docset| ConstScorer::new(docset, 1.0))
|
||||
.collect::<Vec<_>>(),
|
||||
DoNothingCombiner::default,
|
||||
100_000,
|
||||
);
|
||||
while v.doc() != TERMINATED {
|
||||
v.advance();
|
||||
|
||||
@@ -92,6 +92,7 @@ impl<TDocSet: DocSet> DocSet for SimpleUnion<TDocSet> {
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
// TODO: use estimate_union
|
||||
self.docsets
|
||||
.iter()
|
||||
.map(|docset| docset.size_hint())
|
||||
@@ -99,6 +100,10 @@ impl<TDocSet: DocSet> DocSet for SimpleUnion<TDocSet> {
|
||||
.unwrap_or(0u32)
|
||||
}
|
||||
|
||||
fn cost(&self) -> u64 {
|
||||
self.docsets.iter().map(|docset| docset.cost()).sum()
|
||||
}
|
||||
|
||||
fn count_including_deleted(&mut self) -> u32 {
|
||||
if self.doc == TERMINATED {
|
||||
return 0u32;
|
||||
|
||||
Reference in New Issue
Block a user