mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-03-26 15:10:42 +00:00
Compare commits
2 Commits
storage_ab
...
optim
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d53bad6cac | ||
|
|
d45602fb9a |
@@ -141,6 +141,13 @@ fn main() {
|
||||
0.01,
|
||||
0.15,
|
||||
),
|
||||
(
|
||||
"N=1M, p(a)=1%, p(b)=30%, p(c)=30%".to_string(),
|
||||
1_000_000,
|
||||
0.01,
|
||||
0.30,
|
||||
0.30,
|
||||
),
|
||||
];
|
||||
|
||||
let queries = &["a", "+a +b", "+a +b +c", "a OR b", "a OR b OR c"];
|
||||
|
||||
@@ -47,6 +47,9 @@ impl TinySet {
|
||||
TinySet(val)
|
||||
}
|
||||
|
||||
/// An empty `TinySet` constant.
|
||||
pub const EMPTY: TinySet = TinySet(0u64);
|
||||
|
||||
/// Returns an empty `TinySet`.
|
||||
#[inline]
|
||||
pub fn empty() -> TinySet {
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use super::Collector;
|
||||
use crate::collector::SegmentCollector;
|
||||
use crate::query::Weight;
|
||||
use crate::{DocId, Score, SegmentOrdinal, SegmentReader};
|
||||
|
||||
/// `CountCollector` collector only counts how many
|
||||
@@ -55,6 +56,15 @@ impl Collector for Count {
|
||||
fn merge_fruits(&self, segment_counts: Vec<usize>) -> crate::Result<usize> {
|
||||
Ok(segment_counts.into_iter().sum())
|
||||
}
|
||||
|
||||
fn collect_segment(
|
||||
&self,
|
||||
weight: &dyn Weight,
|
||||
_segment_ord: u32,
|
||||
reader: &SegmentReader,
|
||||
) -> crate::Result<usize> {
|
||||
Ok(weight.count(reader)? as usize)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
use std::borrow::{Borrow, BorrowMut};
|
||||
|
||||
use common::TinySet;
|
||||
|
||||
use crate::fastfield::AliveBitSet;
|
||||
use crate::DocId;
|
||||
|
||||
@@ -14,6 +16,12 @@ pub const TERMINATED: DocId = i32::MAX as u32;
|
||||
/// exactly this size as long as we can fill the buffer.
|
||||
pub const COLLECT_BLOCK_BUFFER_LEN: usize = 64;
|
||||
|
||||
/// Number of `TinySet` (64-bit) buckets in a block used by [`DocSet::fill_bitset_block`].
|
||||
pub const BLOCK_NUM_TINYBITSETS: usize = 16;
|
||||
|
||||
/// Number of doc IDs covered by one block: `BLOCK_NUM_TINYBITSETS * 64 = 1024`.
|
||||
pub const BLOCK_WINDOW: u32 = BLOCK_NUM_TINYBITSETS as u32 * 64;
|
||||
|
||||
/// Represents an iterable set of sorted doc ids.
|
||||
pub trait DocSet: Send {
|
||||
/// Goes to the next element.
|
||||
@@ -160,6 +168,31 @@ pub trait DocSet: Send {
|
||||
self.size_hint() as u64
|
||||
}
|
||||
|
||||
/// Fills a bitmask representing which documents in `[min_doc, min_doc + BLOCK_WINDOW)` are
|
||||
/// present in this docset.
|
||||
///
|
||||
/// The window is divided into `BLOCK_NUM_TINYBITSETS` buckets of 64 docs each.
|
||||
/// Returns the next doc `>= min_doc + BLOCK_WINDOW`, or `TERMINATED` if exhausted.
|
||||
fn fill_bitset_block(
|
||||
&mut self,
|
||||
min_doc: DocId,
|
||||
mask: &mut [TinySet; BLOCK_NUM_TINYBITSETS],
|
||||
) -> DocId {
|
||||
self.seek(min_doc);
|
||||
let horizon = min_doc + BLOCK_WINDOW;
|
||||
loop {
|
||||
let doc = self.doc();
|
||||
if doc >= horizon {
|
||||
return doc;
|
||||
}
|
||||
let delta = doc - min_doc;
|
||||
mask[(delta / 64) as usize].insert_mut(delta % 64);
|
||||
if self.advance() == TERMINATED {
|
||||
return TERMINATED;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the number documents matching.
|
||||
/// Calling this method consumes the `DocSet`.
|
||||
fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
|
||||
@@ -214,6 +247,18 @@ impl DocSet for &mut dyn DocSet {
|
||||
(**self).seek_danger(target)
|
||||
}
|
||||
|
||||
fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize {
|
||||
(**self).fill_buffer(buffer)
|
||||
}
|
||||
|
||||
fn fill_bitset_block(
|
||||
&mut self,
|
||||
min_doc: DocId,
|
||||
mask: &mut [TinySet; BLOCK_NUM_TINYBITSETS],
|
||||
) -> DocId {
|
||||
(**self).fill_bitset_block(min_doc, mask)
|
||||
}
|
||||
|
||||
fn doc(&self) -> u32 {
|
||||
(**self).doc()
|
||||
}
|
||||
@@ -256,6 +301,15 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
|
||||
unboxed.fill_buffer(buffer)
|
||||
}
|
||||
|
||||
fn fill_bitset_block(
|
||||
&mut self,
|
||||
min_doc: DocId,
|
||||
mask: &mut [TinySet; BLOCK_NUM_TINYBITSETS],
|
||||
) -> DocId {
|
||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
||||
unboxed.fill_bitset_block(min_doc, mask)
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
let unboxed: &TDocSet = self.borrow();
|
||||
unboxed.doc()
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
use common::TinySet;
|
||||
|
||||
use super::size_hint::estimate_intersection;
|
||||
use crate::docset::{DocSet, SeekDangerResult, TERMINATED};
|
||||
use crate::docset::{DocSet, SeekDangerResult, BLOCK_NUM_TINYBITSETS, TERMINATED};
|
||||
use crate::query::term_query::TermScorer;
|
||||
use crate::query::{EmptyScorer, Scorer};
|
||||
use crate::{DocId, Score};
|
||||
@@ -17,7 +19,7 @@ use crate::{DocId, Score};
|
||||
/// `size_hint` of the intersection.
|
||||
pub fn intersect_scorers(
|
||||
mut scorers: Vec<Box<dyn Scorer>>,
|
||||
num_docs_segment: u32,
|
||||
segment_num_docs: u32,
|
||||
) -> Box<dyn Scorer> {
|
||||
if scorers.is_empty() {
|
||||
return Box::new(EmptyScorer);
|
||||
@@ -42,14 +44,14 @@ pub fn intersect_scorers(
|
||||
left: *(left.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
|
||||
right: *(right.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
|
||||
others: scorers,
|
||||
num_docs: num_docs_segment,
|
||||
segment_num_docs,
|
||||
});
|
||||
}
|
||||
Box::new(Intersection {
|
||||
left,
|
||||
right,
|
||||
others: scorers,
|
||||
num_docs: num_docs_segment,
|
||||
segment_num_docs,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -58,7 +60,7 @@ pub struct Intersection<TDocSet: DocSet, TOtherDocSet: DocSet = Box<dyn Scorer>>
|
||||
left: TDocSet,
|
||||
right: TDocSet,
|
||||
others: Vec<TOtherDocSet>,
|
||||
num_docs: u32,
|
||||
segment_num_docs: u32,
|
||||
}
|
||||
|
||||
fn go_to_first_doc<TDocSet: DocSet>(docsets: &mut [TDocSet]) -> DocId {
|
||||
@@ -78,7 +80,10 @@ fn go_to_first_doc<TDocSet: DocSet>(docsets: &mut [TDocSet]) -> DocId {
|
||||
|
||||
impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
|
||||
/// num_docs is the number of documents in the segment.
|
||||
pub(crate) fn new(mut docsets: Vec<TDocSet>, num_docs: u32) -> Intersection<TDocSet, TDocSet> {
|
||||
pub(crate) fn new(
|
||||
mut docsets: Vec<TDocSet>,
|
||||
segment_num_docs: u32,
|
||||
) -> Intersection<TDocSet, TDocSet> {
|
||||
let num_docsets = docsets.len();
|
||||
assert!(num_docsets >= 2);
|
||||
docsets.sort_by_key(|docset| docset.cost());
|
||||
@@ -97,7 +102,7 @@ impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
|
||||
left,
|
||||
right,
|
||||
others: docsets,
|
||||
num_docs,
|
||||
segment_num_docs,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -214,7 +219,7 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
|
||||
[self.left.size_hint(), self.right.size_hint()]
|
||||
.into_iter()
|
||||
.chain(self.others.iter().map(DocSet::size_hint)),
|
||||
self.num_docs,
|
||||
self.segment_num_docs,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -224,6 +229,91 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
|
||||
// If there are docsets that are bad at skipping, they should also influence the cost.
|
||||
self.left.cost()
|
||||
}
|
||||
|
||||
fn count_including_deleted(&mut self) -> u32 {
|
||||
const DENSITY_THRESHOLD_INVERSE: u32 = 32;
|
||||
if self
|
||||
.left
|
||||
.size_hint()
|
||||
.saturating_mul(DENSITY_THRESHOLD_INVERSE)
|
||||
< self.segment_num_docs
|
||||
{
|
||||
// Sparse path: if the lead iterator covers less than ~3% of docs,
|
||||
// the block approach wastes time on mostly-empty blocks.
|
||||
self.count_including_deleted_sparse()
|
||||
} else {
|
||||
// Dense approach. We push documents into a block bitset to then
|
||||
// perform count using popcount.
|
||||
self.count_including_deleted_dense()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const EMPTY_BLOCK: [TinySet; BLOCK_NUM_TINYBITSETS] = [TinySet::EMPTY; BLOCK_NUM_TINYBITSETS];
|
||||
|
||||
/// ANDs `other` into `mask` in-place. Returns `true` if the result is all zeros.
|
||||
#[inline]
|
||||
fn and_is_empty(
|
||||
mask: &mut [TinySet; BLOCK_NUM_TINYBITSETS],
|
||||
other: &[TinySet; BLOCK_NUM_TINYBITSETS],
|
||||
) -> bool {
|
||||
let mut all_empty = true;
|
||||
for (m, t) in mask.iter_mut().zip(other.iter()) {
|
||||
*m = m.intersect(*t);
|
||||
all_empty &= m.is_empty();
|
||||
}
|
||||
all_empty
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet, TOtherDocSet: DocSet> Intersection<TDocSet, TOtherDocSet> {
|
||||
fn count_including_deleted_sparse(&mut self) -> u32 {
|
||||
let mut count = 0u32;
|
||||
let mut doc = self.doc();
|
||||
while doc != TERMINATED {
|
||||
count += 1;
|
||||
doc = self.advance();
|
||||
}
|
||||
count
|
||||
}
|
||||
|
||||
/// Dense block-wise bitmask intersection count.
|
||||
///
|
||||
/// Fills a 1024-doc window from each iterator, ANDs the bitmasks together,
|
||||
/// and popcounts the result. `fill_bitset_block` handles seeking tails forward
|
||||
/// when they lag behind the current block.
|
||||
fn count_including_deleted_dense(&mut self) -> u32 {
|
||||
let mut count = 0u32;
|
||||
let mut next_base = self.left.doc();
|
||||
|
||||
while next_base < TERMINATED {
|
||||
let base = next_base;
|
||||
|
||||
// Fill lead bitmask.
|
||||
let mut mask = EMPTY_BLOCK;
|
||||
next_base = next_base.max(self.left.fill_bitset_block(base, &mut mask));
|
||||
|
||||
let mut tail_mask = EMPTY_BLOCK;
|
||||
next_base = next_base.max(self.right.fill_bitset_block(base, &mut tail_mask));
|
||||
|
||||
if and_is_empty(&mut mask, &tail_mask) {
|
||||
continue;
|
||||
}
|
||||
// AND with each additional tail.
|
||||
for other in &mut self.others {
|
||||
let mut other_mask = EMPTY_BLOCK;
|
||||
next_base = next_base.max(other.fill_bitset_block(base, &mut other_mask));
|
||||
if and_is_empty(&mut mask, &other_mask) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
for tinyset in &mask {
|
||||
count += tinyset.len();
|
||||
}
|
||||
}
|
||||
|
||||
count
|
||||
}
|
||||
}
|
||||
|
||||
impl<TScorer, TOtherScorer> Scorer for Intersection<TScorer, TOtherScorer>
|
||||
@@ -421,6 +511,82 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
proptest! {
|
||||
#[test]
|
||||
fn prop_test_count_including_deleted_matches_default(
|
||||
a in sorted_deduped_vec(1200, 400),
|
||||
b in sorted_deduped_vec(1200, 400),
|
||||
c in sorted_deduped_vec(1200, 400),
|
||||
num_docs in 1200u32..2000u32,
|
||||
) {
|
||||
// Compute expected count via set intersection.
|
||||
let expected: u32 = a.iter()
|
||||
.filter(|doc| b.contains(doc) && c.contains(doc))
|
||||
.count() as u32;
|
||||
|
||||
// Test count_including_deleted (dense path).
|
||||
let make_intersection = || {
|
||||
Intersection::new(
|
||||
vec![
|
||||
VecDocSet::from(a.clone()),
|
||||
VecDocSet::from(b.clone()),
|
||||
VecDocSet::from(c.clone()),
|
||||
],
|
||||
num_docs,
|
||||
)
|
||||
};
|
||||
|
||||
let mut intersection = make_intersection();
|
||||
let count = intersection.count_including_deleted();
|
||||
prop_assert_eq!(count, expected,
|
||||
"count_including_deleted mismatch: a={:?}, b={:?}, c={:?}", a, b, c);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_count_including_deleted_two_way() {
|
||||
let left = VecDocSet::from(vec![1, 3, 9]);
|
||||
let right = VecDocSet::from(vec![3, 4, 9, 18]);
|
||||
let mut intersection = Intersection::new(vec![left, right], 100);
|
||||
assert_eq!(intersection.count_including_deleted(), 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_count_including_deleted_empty() {
|
||||
let a = VecDocSet::from(vec![1, 3]);
|
||||
let b = VecDocSet::from(vec![1, 4]);
|
||||
let c = VecDocSet::from(vec![3, 9]);
|
||||
let mut intersection = Intersection::new(vec![a, b, c], 100);
|
||||
assert_eq!(intersection.count_including_deleted(), 0);
|
||||
}
|
||||
|
||||
/// Test with enough documents to exercise the dense path (>= num_docs/32).
|
||||
#[test]
|
||||
fn test_count_including_deleted_dense_path() {
|
||||
// Create dense docsets: many docs relative to segment size.
|
||||
let docs_a: Vec<u32> = (0..2000).step_by(2).collect(); // even numbers 0..2000
|
||||
let docs_b: Vec<u32> = (0..2000).step_by(3).collect(); // multiples of 3
|
||||
let expected = docs_a.iter().filter(|d| *d % 3 == 0).count() as u32;
|
||||
|
||||
let a = VecDocSet::from(docs_a);
|
||||
let b = VecDocSet::from(docs_b);
|
||||
let mut intersection = Intersection::new(vec![a, b], 2000);
|
||||
assert_eq!(intersection.count_including_deleted(), expected);
|
||||
}
|
||||
|
||||
/// Test that spans multiple blocks (>1024 docs).
|
||||
#[test]
|
||||
fn test_count_including_deleted_multi_block() {
|
||||
let docs_a: Vec<u32> = (0..5000).collect();
|
||||
let docs_b: Vec<u32> = (0..5000).step_by(7).collect();
|
||||
let expected = docs_b.len() as u32; // all of b is in a
|
||||
|
||||
let a = VecDocSet::from(docs_a);
|
||||
let b = VecDocSet::from(docs_b);
|
||||
let mut intersection = Intersection::new(vec![a, b], 5000);
|
||||
assert_eq!(intersection.count_including_deleted(), expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bug_2811_intersection_candidate_should_increase() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
@@ -117,6 +117,12 @@ impl DocSet for TermScorer {
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.postings.size_hint()
|
||||
}
|
||||
|
||||
// TODO
|
||||
// It is probably possible to optimize fill_bitset_block for TermScorer,
|
||||
// working directly with the blocks, enabling vectorization.
|
||||
// I did not manage to get a performance improvement on Mac ARM,
|
||||
// and do not have access to x86 to investigate.
|
||||
}
|
||||
|
||||
impl Scorer for TermScorer {
|
||||
|
||||
Reference in New Issue
Block a user