tantivy/src/query/union/buffered_union.rs
PSeitz aebae9965d add RegexPhraseQuery (#2516)
* add RegexPhraseQuery

RegexPhraseQuery supports phrase queries whose terms are regexes,
including wildcards. E.g. a query with wildcards:
"b* b* wolf" matches "big bad wolf"
Slop is supported as well:
"b* wolf"~2 matches "big bad wolf"

Regex queries may match a lot of terms, and we still need to keep
track of which term hit in order to load its positions.
The phrase query algorithm groups terms of similar frequency together
in the union, so that whole groups can be prefiltered early.
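
As a hedged sketch of that grouping step (illustrative names, not the PR's
actual code), terms can be sorted by document frequency and chunked so that
terms of similar frequency share a union group:

    // Hypothetical helper: bucket (term_ord, doc_freq) pairs by frequency.
    fn group_terms_by_freq(mut terms: Vec<(u64, u32)>, group_size: usize) -> Vec<Vec<u64>> {
        terms.sort_unstable_by_key(|&(_, doc_freq)| doc_freq);
        terms
            .chunks(group_size)
            .map(|chunk| chunk.iter().map(|&(term_ord, _)| term_ord).collect())
            .collect()
    }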

This PR comes with some new data structures:

SimpleUnion - A union docset over a list of docsets. It doesn't do any
caching and is therefore well suited for use cases with lots of skipping
(phrase search, but intersections in general).
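
Roughly, its shape and advance logic (a hedged sketch, not the exact code):

    // A bufferless union: advance every docset sitting on the current doc,
    // then take the new minimum across all docsets.
    struct SimpleUnion<TDocSet> {
        docsets: Vec<TDocSet>,
        doc: DocId,
    }

    impl<TDocSet: DocSet> SimpleUnion<TDocSet> {
        fn advance(&mut self) -> DocId {
            for docset in &mut self.docsets {
                if docset.doc() <= self.doc {
                    docset.advance();
                }
            }
            self.doc = self.docsets.iter().map(DocSet::doc).min().unwrap_or(TERMINATED);
            self.doc
        }
    }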

LoadedPostings - Like SegmentPostings, but all docs and positions are
loaded into memory. SegmentPostings uses 1840 bytes per instance with its
caches, which is equivalent to 460 docids.
LoadedPostings is used for terms that occur in fewer than 100 docs;
it exists only to reduce memory consumption.
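
Its layout is roughly the following (hedged sketch; field names illustrative):

    // All postings of one rare term, fully decoded: a flat docid list plus
    // a flattened positions buffer with per-doc offsets into it.
    struct LoadedPostings {
        doc_ids: Box<[DocId]>,
        positions: Box<[u32]>,
        // positions[position_offsets[i]..position_offsets[i + 1]]
        // holds the positions of doc_ids[i].
        position_offsets: Box<[u32]>,
        cursor: usize,
    }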

BitSetPostingUnion - Creates a `Postings` that uses a bitset for docid
hits and the underlying docsets for positions. The BitSet is the
precalculated union of the docsets.
In RegexPhraseQuery there is a size limit of 512 docsets per PreAggregatedUnion
before a new one is created.
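
Conceptually (an illustrative sketch; `BitSetDocSet` stands in for whatever
docset wraps the precomputed bitset):

    // Docid iteration is answered by the precomputed bitset union; the
    // per-term docsets are only consulted for positions once a doc hits.
    struct BitSetPostingUnion<TPostings> {
        bitset: BitSetDocSet,
        docsets: Vec<TPostings>,
    }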

Renamed Union to BufferedUnionScorer.
Added proptests testing the different union types.

* cleanup

* use Box instead of Vec

* use RefCell instead of term_freq(&mut)

* remove wildcard mode

* move RefCell to outer

* clippy
2024-10-21 18:29:17 +08:00

use common::TinySet;
use crate::docset::{DocSet, TERMINATED};
use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
use crate::query::Scorer;
use crate::{DocId, Score};
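// The union is consumed in fixed windows ("horizons"):
// 64 TinySets of 64 bits each, i.e. 4096 docids buffered per refill.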
const HORIZON_NUM_TINYBITSETS: usize = 64;
const HORIZON: u32 = 64u32 * HORIZON_NUM_TINYBITSETS as u32;
// `drain_filter` is not stable yet.
// This function is similar, except that it is stable and
// it does not keep the original vector ordering.
//
// Also, it does not "yield" any elements.
fn unordered_drain_filter<T, P>(v: &mut Vec<T>, mut predicate: P)
where P: FnMut(&mut T) -> bool {
    let mut i = 0;
    while i < v.len() {
        if predicate(&mut v[i]) {
            v.swap_remove(i);
        } else {
            i += 1;
        }
    }
}
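// Example (illustrative): removing the even numbers keeps the odd ones,
// though not necessarily in their original order.
//   let mut v = vec![1, 2, 3, 4, 5];
//   unordered_drain_filter(&mut v, |x| *x % 2 == 0);
//   v.sort();
//   assert_eq!(v, vec![1, 3, 5]);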
/// Creates a `DocSet` that iterates through the union of two or more `DocSet`s.
pub struct BufferedUnionScorer<TScorer, TScoreCombiner = DoNothingCombiner> {
    docsets: Vec<TScorer>,
    /// One bit per doc in the current horizon window.
    bitsets: Box<[TinySet; HORIZON_NUM_TINYBITSETS]>,
    /// One score combiner per doc in the current horizon window.
    scores: Box<[TScoreCombiner; HORIZON as usize]>,
    /// Index of the TinySet currently being drained.
    cursor: usize,
    /// Docid corresponding to the first bit of the current window.
    offset: DocId,
    doc: DocId,
    score: Score,
}
fn refill<TScorer: Scorer, TScoreCombiner: ScoreCombiner>(
    scorers: &mut Vec<TScorer>,
    bitsets: &mut [TinySet; HORIZON_NUM_TINYBITSETS],
    score_combiner: &mut [TScoreCombiner; HORIZON as usize],
    min_doc: DocId,
) {
    unordered_drain_filter(scorers, |scorer| {
        let horizon = min_doc + HORIZON;
        loop {
            let doc = scorer.doc();
            if doc >= horizon {
                return false;
            }
            // add this document
            let delta = doc - min_doc;
            bitsets[(delta / 64) as usize].insert_mut(delta % 64u32);
            score_combiner[delta as usize].update(scorer);
            if scorer.advance() == TERMINATED {
                // remove the docset, it has been entirely consumed.
                return true;
            }
        }
    });
}
impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> BufferedUnionScorer<TScorer, TScoreCombiner> {
    pub(crate) fn build(
        docsets: Vec<TScorer>,
        score_combiner_fn: impl FnOnce() -> TScoreCombiner,
    ) -> BufferedUnionScorer<TScorer, TScoreCombiner> {
        let non_empty_docsets: Vec<TScorer> = docsets
            .into_iter()
            .filter(|docset| docset.doc() != TERMINATED)
            .collect();
        let mut union = BufferedUnionScorer {
            docsets: non_empty_docsets,
            bitsets: Box::new([TinySet::empty(); HORIZON_NUM_TINYBITSETS]),
            scores: Box::new([score_combiner_fn(); HORIZON as usize]),
            cursor: HORIZON_NUM_TINYBITSETS,
            offset: 0,
            doc: 0,
            score: 0.0,
        };
        if union.refill() {
            union.advance();
        } else {
            union.doc = TERMINATED;
        }
        union
    }
    fn refill(&mut self) -> bool {
        if let Some(min_doc) = self.docsets.iter().map(DocSet::doc).min() {
            self.offset = min_doc;
            self.cursor = 0;
            self.doc = min_doc;
            refill(
                &mut self.docsets,
                &mut self.bitsets,
                &mut self.scores,
                min_doc,
            );
            true
        } else {
            false
        }
    }
    fn advance_buffered(&mut self) -> bool {
        while self.cursor < HORIZON_NUM_TINYBITSETS {
            if let Some(val) = self.bitsets[self.cursor].pop_lowest() {
                let delta = val + (self.cursor as u32) * 64;
                self.doc = self.offset + delta;
                let score_combiner = &mut self.scores[delta as usize];
                self.score = score_combiner.score();
                score_combiner.clear();
                return true;
            } else {
                self.cursor += 1;
            }
        }
        false
    }
}
impl<TScorer, TScoreCombiner> DocSet for BufferedUnionScorer<TScorer, TScoreCombiner>
where
    TScorer: Scorer,
    TScoreCombiner: ScoreCombiner,
{
    fn advance(&mut self) -> DocId {
        if self.advance_buffered() {
            return self.doc;
        }
        if !self.refill() {
            self.doc = TERMINATED;
            return TERMINATED;
        }
        if !self.advance_buffered() {
            return TERMINATED;
        }
        self.doc
    }
    fn seek(&mut self, target: DocId) -> DocId {
        if self.doc >= target {
            return self.doc;
        }
        let gap = target - self.offset;
        if gap < HORIZON {
            // The target is within the buffered horizon.
            // Skip to the corresponding bucket.
            let new_cursor = gap as usize / 64;
            for obsolete_tinyset in &mut self.bitsets[self.cursor..new_cursor] {
                obsolete_tinyset.clear();
            }
            for score_combiner in &mut self.scores[self.cursor * 64..new_cursor * 64] {
                score_combiner.clear();
            }
            self.cursor = new_cursor;
            // Advance until we reach the end of the bucket
            // or a doc greater than or equal to the target.
            let mut doc = self.doc();
            while doc < target {
                doc = self.advance();
            }
            doc
        } else {
            // Clear the buffered info.
            for obsolete_tinyset in self.bitsets.iter_mut() {
                *obsolete_tinyset = TinySet::empty();
            }
            for score_combiner in self.scores.iter_mut() {
                score_combiner.clear();
            }
            // The target is outside of the buffered horizon:
            // advance all docsets to a doc >= target.
            unordered_drain_filter(&mut self.docsets, |docset| {
                if docset.doc() < target {
                    docset.seek(target);
                }
                docset.doc() == TERMINATED
            });
            // At this point all of the docsets
            // are positioned on a doc >= target.
            if !self.refill() {
                self.doc = TERMINATED;
                return TERMINATED;
            }
            self.advance()
        }
    }
    // TODO: also implement `count` efficiently in the presence of deletes.
    fn doc(&self) -> DocId {
        self.doc
    }
    fn size_hint(&self) -> u32 {
        self.docsets
            .iter()
            .map(|docset| docset.size_hint())
            .max()
            .unwrap_or(0u32)
    }
    fn count_including_deleted(&mut self) -> u32 {
        if self.doc == TERMINATED {
            return 0;
        }
        // Count what remains in the current window (+1 for the current doc),
        // then keep refilling and counting until all docsets are exhausted.
        let mut count = self.bitsets[self.cursor..HORIZON_NUM_TINYBITSETS]
            .iter()
            .map(|bitset| bitset.len())
            .sum::<u32>()
            + 1;
        for bitset in self.bitsets.iter_mut() {
            bitset.clear();
        }
        while self.refill() {
            count += self.bitsets.iter().map(|bitset| bitset.len()).sum::<u32>();
            for bitset in self.bitsets.iter_mut() {
                bitset.clear();
            }
        }
        self.cursor = HORIZON_NUM_TINYBITSETS;
        count
    }
}
impl<TScorer, TScoreCombiner> Scorer for BufferedUnionScorer<TScorer, TScoreCombiner>
where
    TScoreCombiner: ScoreCombiner,
    TScorer: Scorer,
{
    fn score(&mut self) -> Score {
        self.score
    }
}