mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-04 16:22:55 +00:00
merged
This commit is contained in:
@@ -20,10 +20,10 @@ pub struct BlockSegmentPostings {
|
||||
freq_decoder: BlockDecoder,
|
||||
freq_reading_option: FreqReadingOption,
|
||||
|
||||
doc_freq: usize,
|
||||
doc_freq: u32,
|
||||
|
||||
data: ReadOnlySource,
|
||||
skip_reader: SkipReader,
|
||||
pub(crate) skip_reader: SkipReader,
|
||||
}
|
||||
|
||||
fn decode_bitpacked_block(
|
||||
@@ -89,7 +89,6 @@ impl BlockSegmentPostings {
|
||||
None => SkipReader::new(ReadOnlySource::empty(), doc_freq, record_option),
|
||||
};
|
||||
|
||||
let doc_freq = doc_freq as usize;
|
||||
let mut block_segment_postings = BlockSegmentPostings {
|
||||
doc_decoder: BlockDecoder::with_val(TERMINATED),
|
||||
loaded_offset: std::usize::MAX,
|
||||
@@ -123,14 +122,14 @@ impl BlockSegmentPostings {
|
||||
} else {
|
||||
self.skip_reader.reset(ReadOnlySource::empty(), doc_freq);
|
||||
}
|
||||
self.doc_freq = doc_freq as usize;
|
||||
self.doc_freq = doc_freq;
|
||||
}
|
||||
|
||||
/// Returns the document frequency associated to this block postings.
|
||||
///
|
||||
/// This `doc_freq` is simply the sum of the length of all of the blocks
|
||||
/// length, and it does not take in account deleted documents.
|
||||
pub fn doc_freq(&self) -> usize {
|
||||
pub fn doc_freq(&self) -> u32 {
|
||||
self.doc_freq
|
||||
}
|
||||
|
||||
|
||||
@@ -38,8 +38,8 @@ impl SegmentPostings {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn doc_freq(&self) -> usize {
|
||||
self.block_cursor.doc_freq
|
||||
pub fn doc_freq(&self) -> u32 {
|
||||
self.block_cursor.doc_freq()
|
||||
}
|
||||
|
||||
/// Creates a segment postings object with the given documents
|
||||
@@ -143,7 +143,7 @@ impl DocSet for SegmentPostings {
|
||||
|
||||
impl HasLen for SegmentPostings {
|
||||
fn len(&self) -> usize {
|
||||
self.block_cursor.doc_freq()
|
||||
self.block_cursor.doc_freq() as usize
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@ use crate::common::BinarySerializable;
|
||||
use crate::directory::ReadOnlySource;
|
||||
use crate::postings::compression::{compressed_block_size, COMPRESSION_BLOCK_SIZE};
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::{DocId, TERMINATED};
|
||||
use crate::{DocId, Score, TERMINATED};
|
||||
use owned_read::OwnedRead;
|
||||
|
||||
pub struct SkipSerializer {
|
||||
@@ -102,8 +102,11 @@ impl SkipReader {
|
||||
self.remaining_docs = doc_freq;
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub(crate) fn last_doc_in_block(&self) -> DocId {
|
||||
pub fn block_max_score(&self) -> Score {
|
||||
unimplemented!();
|
||||
}
|
||||
|
||||
pub fn last_doc_in_block(&self) -> DocId {
|
||||
self.last_doc_in_block
|
||||
}
|
||||
|
||||
@@ -159,7 +162,7 @@ impl SkipReader {
|
||||
/// If the target is larger than all documents, the skip_reader
|
||||
/// then advance to the last Variable In block.
|
||||
pub fn seek(&mut self, target: DocId) {
|
||||
while self.last_doc_in_block < target {
|
||||
while self.last_doc_in_block() < target {
|
||||
self.advance();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,73 +1,106 @@
|
||||
use crate::{Score, DocId, TERMINATED, DocSet};
|
||||
use crate::query::term_query::TermScorer;
|
||||
use crate::postings::BlockSegmentPostings;
|
||||
use futures::AsyncSeekExt;
|
||||
use crate::query::Scorer;
|
||||
use crate::{DocId, DocSet, Score, TERMINATED};
|
||||
|
||||
|
||||
struct BlockWAND {
|
||||
term_scorers : Vec<TermScorer>,
|
||||
}
|
||||
|
||||
|
||||
fn find_pivot_doc(term_scorers: &[TermScorer], threshold: f32) -> DocId {
|
||||
/// Returns the lowest document that has a chance of exceeding the
|
||||
/// threshold score.
|
||||
///
|
||||
/// term_scorers are assumed sorted by .doc().
|
||||
fn find_pivot_doc(term_scorers: &[TermScorer], threshold: f32) -> Option<DocId> {
|
||||
let mut max_score = 0.0f32;
|
||||
for term_scorer in term_scorers.iter() {
|
||||
for term_scorer in term_scorers {
|
||||
max_score += term_scorer.max_score();
|
||||
if max_score > threshold {
|
||||
return term_scorer.doc();
|
||||
return Some(term_scorer.doc());
|
||||
}
|
||||
}
|
||||
TERMINATED
|
||||
None
|
||||
}
|
||||
|
||||
fn shallow_advance(scorers: &mut Vec<TermScorer>, pivot: DocId) -> Score {
|
||||
let mut max_block_score = 0.0f32;
|
||||
let mut i = 0;
|
||||
while i < scorers.len() {
|
||||
if scorers[i].doc() > pivot {
|
||||
let mut block_max_score_upperbound = 0.0f32;
|
||||
for scorer in scorers {
|
||||
if scorer.doc() > pivot {
|
||||
break;
|
||||
}
|
||||
while scorers[i].postings.block_cursor.skip_reader.doc() < pivot {
|
||||
if scorers[i].postings.block_cursor.skip_reader.advance() {
|
||||
max_block_score += scorers[i].postings.block_cursor.skip_reader.block_max_score();
|
||||
i += 1;
|
||||
} else {
|
||||
scorers.swap_remove(i);
|
||||
}
|
||||
}
|
||||
scorer.postings.block_cursor.seek(pivot);
|
||||
block_max_score_upperbound += scorer.postings.block_cursor.skip_reader.block_max_score();
|
||||
}
|
||||
max_block_score
|
||||
block_max_score_upperbound
|
||||
}
|
||||
|
||||
pub fn block_wand(mut scorers: Vec<TermScorer>, mut threshold: f32, callback: &mut dyn FnMut(u32, Score) -> Score) {
|
||||
fn compute_score(scorers: &mut Vec<TermScorer>, doc: DocId) -> Score {
|
||||
let mut i = 0;
|
||||
let mut score = 0.0f32;
|
||||
while i < scorers.len() {
|
||||
if scorers[i].doc() > doc {
|
||||
break;
|
||||
}
|
||||
if scorers[i].seek(doc) == TERMINATED {
|
||||
scorers.swap_remove(i);
|
||||
} else {
|
||||
score += scorers[i].score();
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
score
|
||||
}
|
||||
|
||||
fn advance_all_scorers(scorers: &mut Vec<TermScorer>) {
|
||||
let mut i = 0;
|
||||
while i < scorers.len() {
|
||||
if scorers[i].advance() == TERMINATED {
|
||||
scorers.swap_remove(i);
|
||||
} else {
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn block_wand(
|
||||
mut scorers: Vec<TermScorer>,
|
||||
mut threshold: f32,
|
||||
callback: &mut dyn FnMut(u32, Score) -> Score,
|
||||
) {
|
||||
loop {
|
||||
scorers.sort_by_key(|scorer| scorer.doc());
|
||||
let pivot_doc = find_pivot_doc(&scorers, threshold);
|
||||
if pivot_doc == TERMINATED {
|
||||
return;
|
||||
}
|
||||
if shallow_advance(&mut scorers, pivot_doc) > threshold {
|
||||
if scorers[0].doc() == pivot_doc {
|
||||
// EvaluatePartial(d , p);
|
||||
// Move all pointers from lists[0] to lists[p] by calling
|
||||
// Next(list, d + 1)
|
||||
} else {
|
||||
let scorer_id = scorers.iter_mut()
|
||||
.take_while(|term_scorer| term_scorer.doc() < pivot_doc)
|
||||
let pivot_opt = find_pivot_doc(&scorers, threshold);
|
||||
if let Some(pivot_doc) = pivot_opt {
|
||||
let block_max_score_upperbound = shallow_advance(&mut scorers, pivot_doc);
|
||||
// TODO bug: more than one scorer can point on the pivot.
|
||||
if block_max_score_upperbound <= threshold {
|
||||
// TODO choose a better candidate.
|
||||
if scorers[0].seek(pivot_doc + 1) == TERMINATED {
|
||||
scorers.swap_remove(0);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if scorers[0].doc() != pivot_doc {
|
||||
// all scorers are not aligned on pivot_doc.
|
||||
if let Some(scorer_ord) = scorers
|
||||
.iter_mut()
|
||||
.take_while(|scorer| scorer.doc() < pivot_doc)
|
||||
.enumerate()
|
||||
.min_by_key(|(scorer_id, scorer)| scorer.doc_freq())
|
||||
.map(|(scorer_id, scorer)| scorer_id)
|
||||
.unwrap();
|
||||
if scorers[scorer_id].seek(pivot_doc) == TERMINATED {
|
||||
scorers.swap_remove(scorer_id);
|
||||
.min_by_key(|(_ord, scorer)| scorer.doc_freq())
|
||||
.map(|(ord, _scorer)| ord)
|
||||
{
|
||||
// TODOD FIX seek, right now the block will never get loaded.
|
||||
if scorers[scorer_ord].seek(pivot_doc) == TERMINATED {
|
||||
scorers.swap_remove(scorer_ord);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO no need to fully score?
|
||||
let score = compute_score(&mut scorers, pivot_doc);
|
||||
if score > threshold {
|
||||
threshold = callback(pivot_doc, score);
|
||||
}
|
||||
advance_all_scorers(&mut scorers);
|
||||
} else {
|
||||
//d = GetNewCandidate();
|
||||
//Choose one list from the lists before and including lists[p]
|
||||
//with the largest IDF, move it by calling Next(list, d)
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,9 +7,9 @@ use crate::query::Exclude;
|
||||
use crate::query::Occur;
|
||||
use crate::query::RequiredOptionalScorer;
|
||||
use crate::query::Scorer;
|
||||
use crate::query::{Union, TermUnion};
|
||||
use crate::query::Weight;
|
||||
use crate::query::{intersect_scorers, Explanation};
|
||||
use crate::query::{TermUnion, Union};
|
||||
use crate::DocId;
|
||||
use std::collections::HashMap;
|
||||
|
||||
@@ -29,8 +29,7 @@ where
|
||||
.into_iter()
|
||||
.map(|scorer| *(scorer.downcast::<TermScorer>().map_err(|_| ()).unwrap()))
|
||||
.collect();
|
||||
let scorer: Box<dyn Scorer> =
|
||||
Box::new(TermUnion::<TScoreCombiner>::from(scorers));
|
||||
let scorer: Box<dyn Scorer> = Box::new(TermUnion::<TScoreCombiner>::from(scorers));
|
||||
return scorer;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
mod block_wand;
|
||||
mod boolean_query;
|
||||
mod boolean_weight;
|
||||
mod block_wand;
|
||||
|
||||
pub use self::boolean_query::BooleanQuery;
|
||||
pub(crate) use self::block_wand::block_wand;
|
||||
pub use self::boolean_query::BooleanQuery;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
@@ -27,7 +27,7 @@ mod vec_docset;
|
||||
|
||||
pub(crate) mod score_combiner;
|
||||
pub use self::intersection::Intersection;
|
||||
pub use self::union::{Union, TermUnion};
|
||||
pub use self::union::{TermUnion, Union};
|
||||
|
||||
#[cfg(test)]
|
||||
pub use self::vec_docset::VecDocSet;
|
||||
|
||||
@@ -33,8 +33,8 @@ impl TermScorer {
|
||||
self.postings.term_freq()
|
||||
}
|
||||
|
||||
pub fn doc_freq(&self,) -> usize {
|
||||
self.postings.doc_freq()
|
||||
pub fn doc_freq(&self) -> usize {
|
||||
self.postings.doc_freq() as usize
|
||||
}
|
||||
|
||||
pub fn fieldnorm_id(&self) -> u8 {
|
||||
@@ -47,7 +47,7 @@ impl TermScorer {
|
||||
self.similarity_weight.explain(fieldnorm_id, term_freq)
|
||||
}
|
||||
|
||||
pub fn max_score(&self, ) -> f32 {
|
||||
pub fn max_score(&self) -> f32 {
|
||||
unimplemented!();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
use crate::common::TinySet;
|
||||
use crate::query::boolean_query::block_wand;
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::fastfield::DeleteBitSet;
|
||||
use crate::query::boolean_query::block_wand;
|
||||
use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
|
||||
use crate::query::term_query::TermScorer;
|
||||
use crate::query::Scorer;
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
use crate::query::term_query::TermScorer;
|
||||
use crate::fastfield::DeleteBitSet;
|
||||
|
||||
const HORIZON_NUM_TINYBITSETS: usize = 64;
|
||||
const HORIZON: u32 = 64u32 * HORIZON_NUM_TINYBITSETS as u32;
|
||||
@@ -41,8 +41,6 @@ pub struct Union<TScorer, TScoreCombiner = DoNothingCombiner> {
|
||||
score: Score,
|
||||
}
|
||||
|
||||
|
||||
|
||||
impl<TScorer, TScoreCombiner> From<Vec<TScorer>> for Union<TScorer, TScoreCombiner>
|
||||
where
|
||||
TScoreCombiner: ScoreCombiner,
|
||||
@@ -208,7 +206,11 @@ where
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.docsets.iter().map(|docset| docset.size_hint()).max().unwrap_or(0u32)
|
||||
self.docsets
|
||||
.iter()
|
||||
.map(|docset| docset.size_hint())
|
||||
.max()
|
||||
.unwrap_or(0u32)
|
||||
}
|
||||
|
||||
fn count_including_deleted(&mut self) -> u32 {
|
||||
@@ -234,7 +236,6 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl<TScorer, TScoreCombiner> Scorer for Union<TScorer, TScoreCombiner>
|
||||
where
|
||||
TScoreCombiner: ScoreCombiner,
|
||||
@@ -246,13 +247,13 @@ where
|
||||
}
|
||||
|
||||
pub struct TermUnion<TScoreCombiner> {
|
||||
underlying: Union<TermScorer, TScoreCombiner>
|
||||
underlying: Union<TermScorer, TScoreCombiner>,
|
||||
}
|
||||
|
||||
impl<TScoreCombiner: ScoreCombiner> From<Vec<TermScorer>> for TermUnion<TScoreCombiner> {
|
||||
fn from(scorers: Vec<TermScorer>) -> Self {
|
||||
TermUnion {
|
||||
underlying: Union::from(scorers)
|
||||
underlying: Union::from(scorers),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -267,7 +268,7 @@ impl<TScoreCombiner: ScoreCombiner> DocSet for TermUnion<TScoreCombiner> {
|
||||
}
|
||||
|
||||
fn fill_buffer(&mut self, buffer: &mut [u32]) -> usize {
|
||||
self.underlying.fill_buffer(buffer)
|
||||
self.underlying.fill_buffer(buffer)
|
||||
}
|
||||
|
||||
fn doc(&self) -> u32 {
|
||||
@@ -275,30 +276,33 @@ impl<TScoreCombiner: ScoreCombiner> DocSet for TermUnion<TScoreCombiner> {
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.underlying.size_hint()
|
||||
self.underlying.size_hint()
|
||||
}
|
||||
|
||||
fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
|
||||
self.underlying.count(delete_bitset)
|
||||
self.underlying.count(delete_bitset)
|
||||
}
|
||||
|
||||
fn count_including_deleted(&mut self) -> u32 {
|
||||
self.underlying.count_including_deleted()
|
||||
self.underlying.count_including_deleted()
|
||||
}
|
||||
}
|
||||
|
||||
impl<TScoreCombiner: ScoreCombiner> Scorer for TermUnion<TScoreCombiner> {
|
||||
fn score(&mut self) -> f32 {
|
||||
self.underlying.score()
|
||||
self.underlying.score()
|
||||
}
|
||||
|
||||
fn for_each_pruning(&mut self, mut threshold: f32, callback: &mut dyn FnMut(DocId, Score) -> Score) {
|
||||
fn for_each_pruning(
|
||||
&mut self,
|
||||
threshold: f32,
|
||||
callback: &mut dyn FnMut(DocId, Score) -> Score,
|
||||
) {
|
||||
let term_scorers = std::mem::replace(&mut self.underlying.docsets, vec![]);
|
||||
block_wand(term_scorers, threshold, callback);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
|
||||
Reference in New Issue
Block a user