mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-09 10:32:55 +00:00
TermScorer does not handle deletes
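This commit moves delete filtering out of the postings layer and up to the callers. The `DeleteSet` trait (and its `NoDelete` implementation) is deleted, `SegmentPostings` and `TermScorer` lose their `TDeleteSet` / `TPostings` type parameters, and deleted documents are instead skipped by the merger, by the tests, and by `Scorer::collect`, which now receives an `Option<&DeleteBitSet>`.

A minimal before/after sketch of the affected signatures, assembled from the hunks below:

    // before: deletes were a type parameter of the postings
    pub struct SegmentPostings<TDeleteSet: DeleteSet> { /* ... */ }
    fn collect(&mut self, collector: &mut Collector);

    // after: postings are delete-agnostic; the caller filters
    pub struct SegmentPostings { /* ... */ }
    fn collect(&mut self, collector: &mut Collector, delete_bitset_opt: Option<&DeleteBitSet>);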
@@ -4,11 +4,9 @@ use postings::{BlockSegmentPostings, SegmentPostings};
 use postings::TermInfo;
 use schema::IndexRecordOption;
 use schema::Term;
-use fastfield::DeleteBitSet;
 use compression::CompressedIntStream;
 use postings::FreqReadingOption;
 use common::BinarySerializable;
-use postings::{DeleteSet, NoDelete};
 use schema::FieldType;
 
 /// The inverted index reader is in charge of accessing
@@ -28,7 +26,6 @@ pub struct InvertedIndexReader {
     termdict: TermDictionaryImpl,
     postings_source: ReadOnlySource,
     positions_source: ReadOnlySource,
-    delete_bitset_opt: Option<DeleteBitSet>,
     record_option: IndexRecordOption,
     total_num_tokens: u64
 }
@@ -38,7 +35,6 @@ impl InvertedIndexReader {
         termdict: TermDictionaryImpl,
         postings_source: ReadOnlySource,
         positions_source: ReadOnlySource,
-        delete_bitset_opt: Option<DeleteBitSet>,
         record_option: IndexRecordOption,
     ) -> InvertedIndexReader {
         let total_num_tokens_data = postings_source.slice(0, 8);
@@ -48,7 +44,6 @@ impl InvertedIndexReader {
             termdict,
             postings_source: postings_source.slice_from(8),
             positions_source,
-            delete_bitset_opt,
             record_option,
             total_num_tokens
         }
@@ -64,7 +59,6 @@ impl InvertedIndexReader {
             termdict: TermDictionaryImpl::empty(field_type),
             postings_source: ReadOnlySource::empty(),
             positions_source: ReadOnlySource::empty(),
-            delete_bitset_opt: None,
             record_option,
             total_num_tokens: 0u64
         }
@@ -129,15 +123,12 @@ impl InvertedIndexReader {
     /// This method is for an advanced usage only.
     ///
    /// Most user should prefer using `read_postings` instead.
-    pub fn read_postings_from_terminfo<TDeleteSet: DeleteSet>(
+    pub fn read_postings_from_terminfo(
         &self,
         term_info: &TermInfo,
         option: IndexRecordOption,
-    ) -> SegmentPostings<TDeleteSet> {
+    ) -> SegmentPostings {
         let block_postings = self.read_block_postings_from_terminfo(term_info, option);
-        let delete_set = TDeleteSet::from(self.delete_bitset_opt.iter()
-            .cloned()
-            .next());
         let position_stream = {
             if option.has_positions() {
                 let position_offset = term_info.positions_offset;
@@ -149,7 +140,7 @@ impl InvertedIndexReader {
                 None
             }
         };
-        SegmentPostings::from_block_postings(block_postings, delete_set, position_stream)
+        SegmentPostings::from_block_postings(block_postings, position_stream)
     }
 
     /// Returns the total number of tokens recorded for all documents
@@ -170,12 +161,12 @@ impl InvertedIndexReader {
     /// For instance, requesting `IndexRecordOption::Freq` for a
     /// `TextIndexingOptions` that does not index position will return a `SegmentPostings`
     /// with `DocId`s and frequencies.
-    pub fn read_postings(&self, term: &Term, option: IndexRecordOption) -> Option<SegmentPostings<DeleteBitSet>> {
+    pub fn read_postings(&self, term: &Term, option: IndexRecordOption) -> Option<SegmentPostings> {
         let term_info = get!(self.get_term_info(term));
         Some(self.read_postings_from_terminfo(&term_info, option))
     }
 
-    pub(crate) fn read_postings_no_deletes(&self, term: &Term, option: IndexRecordOption) -> Option<SegmentPostings<NoDelete>> {
+    pub(crate) fn read_postings_no_deletes(&self, term: &Term, option: IndexRecordOption) -> Option<SegmentPostings> {
         let term_info = get!(self.get_term_info(term));
         Some(self.read_postings_from_terminfo(&term_info, option))
     }
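With `delete_bitset_opt` gone from `InvertedIndexReader`, `read_postings` now returns postings over all documents, deleted or not; a caller that wants only live documents has to consult the `SegmentReader`. A hypothetical helper sketching the new calling pattern, mirroring the merger and test changes further down (the names `inverted_index`, `reader`, and `first_live_doc` are illustrative):

    fn first_live_doc(inverted_index: &InvertedIndexReader,
                      reader: &SegmentReader,
                      term: &Term) -> Option<DocId> {
        // Postings no longer skip deleted documents on their own.
        let mut postings = inverted_index.read_postings(term, IndexRecordOption::Basic)?;
        while postings.advance() {
            if !reader.is_deleted(postings.doc()) {
                return Some(postings.doc());
            }
        }
        None
    }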
@@ -26,7 +26,6 @@ use termdict::TermDictionary;
 use fastfield::{FastValue, MultiValueIntFastFieldReader};
 use schema::Cardinality;
 use fieldnorm::FieldNormReader;
-use postings::DeleteSet;
 
 /// Entry point to access all of the datastructures of the `Segment`
 ///
@@ -285,7 +284,6 @@ impl SegmentReader {
             TermDictionaryImpl::from_source(termdict_source),
             postings_source,
             positions_source,
-            self.delete_bitset_opt.clone(),
             record_option,
         ));
 
@@ -5,7 +5,6 @@ use std::io;
 use directory::ReadOnlySource;
 use DocId;
 use common::HasLen;
-use postings::DeleteSet;
 
 /// Write a delete `BitSet`
 ///
@@ -52,19 +51,7 @@ impl DeleteBitSet {
         }
     }
-
-}
-
-
-impl DeleteSet for DeleteBitSet {
-
-    fn empty() -> DeleteBitSet {
-        DeleteBitSet {
-            data: ReadOnlySource::empty(),
-            len: 0,
-        }
-    }
 
-    fn is_deleted(&self, doc: DocId) -> bool {
+    pub fn is_deleted(&self, doc: DocId) -> bool {
         if self.len == 0 {
             false
         } else {
@@ -78,16 +65,6 @@ impl DeleteSet for DeleteBitSet {
 }
 
 
-impl From<Option<DeleteBitSet>> for DeleteBitSet {
-    fn from(delete_bitset_opt: Option<DeleteBitSet>) -> Self {
-        if let Some(delete_bitset) = delete_bitset_opt {
-            delete_bitset
-        } else {
-            DeleteBitSet::empty()
-        }
-    }
-}
-
 impl HasLen for DeleteBitSet {
     fn len(&self) -> usize {
         self.len
@@ -19,7 +19,6 @@ use termdict::TermStreamer;
 use fieldnorm::FieldNormsSerializer;
 use fieldnorm::FieldNormsWriter;
 use fieldnorm::FieldNormReader;
-use postings::DeleteSet;
 use postings::Postings;
 
 
@@ -296,12 +295,13 @@ impl IndexMerger {
                     let segment_reader = &self.readers[heap_item.segment_ord];
                     let inverted_index = segment_reader.inverted_index(indexed_field);
                     let mut segment_postings = inverted_index
-                        .read_postings_from_terminfo::<DeleteBitSet>(term_info, segment_postings_option);
-                    if segment_postings.advance() {
-                        Some((segment_ord, segment_postings))
-                    } else {
-                        None
-                    }
+                        .read_postings_from_terminfo(term_info, segment_postings_option);
+                    while segment_postings.advance() {
+                        if !segment_reader.is_deleted(segment_postings.doc()) {
+                            return Some((segment_ord, segment_postings));
+                        }
+                    }
+                    None
                 })
                 .collect();
 
@@ -309,7 +309,6 @@ impl IndexMerger {
             // of all of the segments containing the given term.
             //
             // These segments are non-empty and advance has already been called.
-
             if !segment_postings.is_empty() {
                 // If not, the `term` will be entirely removed.
 
@@ -1,7 +1,7 @@
 pub mod index_writer;
 pub mod segment_serializer;
 pub mod merger;
-mod merge_policy;
+pub mod merge_policy;
 mod log_merge_policy;
 mod segment_register;
 mod segment_writer;
45 src/lib.rs
@@ -292,8 +292,7 @@ mod tests {
     use Postings;
     use rand::{Rng, SeedableRng, XorShiftRng};
     use rand::distributions::{IndependentSample, Range};
-
 
     pub fn assert_nearly_equals(expected: f32, val: f32) {
         assert!(nearly_equals(val, expected), "Got {}, expected {}.", val, expected);
     }
@@ -460,6 +459,16 @@ mod tests {
         }
     }
 
+
+    fn advance_undeleted(docset: &mut DocSet, reader: &SegmentReader) -> bool {
+        while docset.advance() {
+            if !reader.is_deleted(docset.doc()) {
+                return true;
+            }
+        }
+        false
+    }
+
     #[test]
     fn test_delete_postings1() {
         let mut schema_builder = SchemaBuilder::default();
@@ -525,19 +534,19 @@ mod tests {
             let mut postings = inverted_index
                 .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
                 .unwrap();
-            assert!(postings.advance());
+            assert!(advance_undeleted(&mut postings, reader));
             assert_eq!(postings.doc(), 5);
-            assert!(!postings.advance());
+            assert!(!advance_undeleted(&mut postings, reader));
         }
         {
             let mut postings = inverted_index
                 .read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)
                 .unwrap();
-            assert!(postings.advance());
+            assert!(advance_undeleted(&mut postings, reader));
             assert_eq!(postings.doc(), 3);
-            assert!(postings.advance());
+            assert!(advance_undeleted(&mut postings, reader));
             assert_eq!(postings.doc(), 4);
-            assert!(!postings.advance());
+            assert!(!advance_undeleted(&mut postings, reader));
         }
     }
     {
@@ -569,19 +578,19 @@ mod tests {
             let mut postings = inverted_index
                 .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
                 .unwrap();
-            assert!(postings.advance());
+            assert!(advance_undeleted(&mut postings, reader));
             assert_eq!(postings.doc(), 5);
-            assert!(!postings.advance());
+            assert!(!advance_undeleted(&mut postings, reader));
         }
         {
             let mut postings = inverted_index
                 .read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)
                 .unwrap();
-            assert!(postings.advance());
+            assert!(advance_undeleted(&mut postings, reader));
             assert_eq!(postings.doc(), 3);
-            assert!(postings.advance());
+            assert!(advance_undeleted(&mut postings, reader));
             assert_eq!(postings.doc(), 4);
-            assert!(!postings.advance());
+            assert!(!advance_undeleted(&mut postings, reader));
         }
     }
     {
@@ -612,25 +621,25 @@ mod tests {
             let mut postings = inverted_index
                 .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
                 .unwrap();
-            assert!(!postings.advance());
+            assert!(!advance_undeleted(&mut postings, reader));
         }
         {
             let mut postings = inverted_index
                 .read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)
                 .unwrap();
-            assert!(postings.advance());
+            assert!(advance_undeleted(&mut postings, reader));
             assert_eq!(postings.doc(), 3);
-            assert!(postings.advance());
+            assert!(advance_undeleted(&mut postings, reader));
             assert_eq!(postings.doc(), 4);
-            assert!(!postings.advance());
+            assert!(!advance_undeleted(&mut postings, reader));
         }
         {
             let mut postings = inverted_index
                 .read_postings(&term_c, IndexRecordOption::WithFreqsAndPositions)
                 .unwrap();
-            assert!(postings.advance());
+            assert!(advance_undeleted(&mut postings, reader));
             assert_eq!(postings.doc(), 4);
-            assert!(!postings.advance());
+            assert!(!advance_undeleted(&mut postings, reader));
         }
     }
 }
@@ -1,27 +0,0 @@
-use fastfield::DeleteBitSet;
-use DocId;
-
-pub trait DeleteSet: 'static + From<Option<DeleteBitSet>> {
-    fn is_deleted(&self, doc: DocId) -> bool;
-    fn empty() -> Self;
-}
-
-
-#[derive(Default)]
-pub struct NoDelete;
-impl DeleteSet for NoDelete {
-    #[inline(always)]
-    fn is_deleted(&self, _doc: DocId) -> bool {
-        false
-    }
-    fn empty() -> Self {
-        NoDelete
-    }
-}
-
-impl From<Option<DeleteBitSet>> for NoDelete {
-    fn from(delete_bitset_opt: Option<DeleteBitSet>) -> Self {
-        assert!(delete_bitset_opt.is_none(), "NoDelete should not be used if there are some deleted documents.");
-        NoDelete
-    }
-}
@@ -13,13 +13,11 @@ mod serializer;
 mod postings_writer;
 mod term_info;
 mod segment_postings;
-mod delete_set;
 
 use self::recorder::{NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder};
 pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
 pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
 
-pub use self::delete_set::{DeleteSet, NoDelete};
 pub use self::term_info::TermInfo;
 pub use self::postings::Postings;
@@ -402,11 +400,9 @@ pub mod tests {
         {
             let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
             index_writer.delete_term(term_0);
-
             assert!(index_writer.commit().is_ok());
         }
         index.load_searchers().unwrap();
-
         let searcher = index.searcher();
         let segment_reader = searcher.segment_reader(0);
 
@@ -418,8 +414,9 @@ pub mod tests {
                 .unwrap();
 
             if i % 2 == 0 {
-                assert_eq!(segment_postings.skip_next(i), SkipResult::OverStep);
-                assert_eq!(segment_postings.doc(), i + 1);
+                assert_eq!(segment_postings.skip_next(i), SkipResult::Reached);
+                assert_eq!(segment_postings.doc(), i);
+                assert!(segment_reader.is_deleted(i));
             } else {
                 assert_eq!(segment_postings.skip_next(i), SkipResult::Reached);
                 assert_eq!(segment_postings.doc(), i);
@@ -453,7 +450,7 @@ pub mod tests {
         // delete everything else
         {
             let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
-            index_writer.delete_term(term_1);
+            index_writer.delete_term(term_1);
 
             assert!(index_writer.commit().is_ok());
         }
@@ -469,7 +466,9 @@ pub mod tests {
             .read_postings(&term_2, IndexRecordOption::Basic)
             .unwrap();
 
-        assert_eq!(segment_postings.skip_next(0), SkipResult::End);
+        assert_eq!(segment_postings.skip_next(0), SkipResult::Reached);
+        assert_eq!(segment_postings.doc(), 0);
+        assert!(segment_reader.is_deleted(0));
 
         let mut segment_postings = segment_reader
             .inverted_index(term_2.field())
@@ -7,7 +7,6 @@ use postings::Postings;
 use docset::{DocSet, SkipResult};
 use fst::Streamer;
 use compression::compressed_block_size;
-use postings::{NoDelete, DeleteSet};
 use directory::{ReadOnlySource, SourceRead};
 use postings::FreqReadingOption;
 use postings::serializer::PostingsSerializer;
@@ -53,20 +52,18 @@ impl PositionComputer {
 ///
 /// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded.
 /// Positions on the other hand, are optionally entirely decoded upfront.
-pub struct SegmentPostings<TDeleteSet: DeleteSet> {
+pub struct SegmentPostings {
     block_cursor: BlockSegmentPostings,
     cur: usize,
-    delete_bitset: TDeleteSet,
     position_computer: Option<PositionComputer>,
 }
 
-impl SegmentPostings<NoDelete> {
+impl SegmentPostings {
     /// Returns an empty segment postings object
     pub fn empty() -> Self {
         let empty_block_cursor = BlockSegmentPostings::empty();
         SegmentPostings {
             block_cursor: empty_block_cursor,
-            delete_bitset: NoDelete,
             cur: COMPRESSION_BLOCK_SIZE,
             position_computer: None,
         }
@@ -80,7 +77,7 @@ impl SegmentPostings<NoDelete> {
     /// It serializes the doc ids using tantivy's codec
     /// and returns a `SegmentPostings` object that embeds a
     /// buffer with the serialized data.
-    pub fn create_from_docs(docs: &[u32]) -> SegmentPostings<NoDelete> {
+    pub fn create_from_docs(docs: &[u32]) -> SegmentPostings {
         let mut counting_writer = CountingWriter::wrap(Vec::new());
         {
             let mut postings_serializer = PostingsSerializer::new(&mut counting_writer, false);
@@ -96,13 +93,11 @@ impl SegmentPostings<NoDelete> {
             SourceRead::from(data),
             FreqReadingOption::NoFreq,
         );
-        SegmentPostings::from_block_postings(block_segment_postings, NoDelete, None)
+        SegmentPostings::from_block_postings(block_segment_postings, None)
     }
 }
 
-
-impl<TDeleteSet: DeleteSet> SegmentPostings<TDeleteSet> {
-
+impl SegmentPostings {
 
     /// Reads a Segment postings from an &[u8]
     ///
@@ -112,13 +107,11 @@ impl<TDeleteSet: DeleteSet> SegmentPostings<TDeleteSet> {
     /// frequencies and/or positions
     pub fn from_block_postings(
         segment_block_postings: BlockSegmentPostings,
-        delete_bitset: TDeleteSet,
         positions_stream_opt: Option<CompressedIntStream>,
-    ) -> SegmentPostings<TDeleteSet> {
+    ) -> SegmentPostings {
         SegmentPostings {
             block_cursor: segment_block_postings,
             cur: COMPRESSION_BLOCK_SIZE, // cursor within the block
-            delete_bitset,
             position_computer: positions_stream_opt.map(PositionComputer::new),
         }
     }
@@ -142,9 +135,9 @@ fn exponential_search(target: u32, mut start: usize, arr: &[u32]) -> (usize, usize) {
     }
 }
 
-impl<TDeleteSet: DeleteSet> DocSet for SegmentPostings<TDeleteSet> {
+impl DocSet for SegmentPostings {
     fn skip_next(&mut self, target: DocId) -> SkipResult {
         if !self.advance() {
             return SkipResult::End;
         }
         if self.doc() == target {
@@ -188,42 +181,33 @@ impl<TDeleteSet: DeleteSet> DocSet for SegmentPostings<TDeleteSet> {
                 break;
             }
         }
-        {
-            // we're in the right block now, start with an exponential search
-            let block_docs = self.block_cursor.docs();
-
-            let (mut start, end) = exponential_search(target, self.cur, block_docs);
-
-            start += block_docs[start..end]
-                .binary_search(&target)
-                .unwrap_or_else(|e| e);
-
-            // `doc` is now the first element >= `target`
-            let doc = block_docs[start];
-            debug_assert!(doc >= target);
-
-            if self.position_computer.is_some() {
-                let freqs_skipped = &self.block_cursor.freqs()[self.cur..start];
-                let sum_freqs: u32 = freqs_skipped.iter().sum();
-                self.position_computer.as_mut()
-                    .unwrap()
-                    .add_skip(sum_freqs as usize);
-            }
-
-            self.cur = start;
-
-            if !self.delete_bitset.is_deleted(doc) {
-                if doc == target {
-                    return SkipResult::Reached;
-                } else {
-                    return SkipResult::OverStep;
-                }
-            }
-        }
-        if self.advance() {
-            SkipResult::OverStep
-        } else {
-            SkipResult::End
-        }
+        // we're in the right block now, start with an exponential search
+        let block_docs = self.block_cursor.docs();
+
+        let (mut start, end) = exponential_search(target, self.cur, block_docs);
+
+        start += block_docs[start..end]
+            .binary_search(&target)
+            .unwrap_or_else(|e| e);
+
+        // `doc` is now the first element >= `target`
+        let doc = block_docs[start];
+        debug_assert!(doc >= target);
+
+        if self.position_computer.is_some() {
+            let freqs_skipped = &self.block_cursor.freqs()[self.cur..start];
+            let sum_freqs: u32 = freqs_skipped.iter().sum();
+            self.position_computer.as_mut()
+                .unwrap()
+                .add_skip(sum_freqs as usize);
+        }
+
+        self.cur = start;
+        if doc == target {
+            return SkipResult::Reached;
+        } else {
+            return SkipResult::OverStep;
+        }
     }
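The `skip_next` rewrite above drops only the delete check; the lookup strategy is unchanged: an exponential (galloping) search brackets the target inside the current block, then a binary search finishes inside the bracket. A sketch of that idea, matching the `exponential_search` signature visible in the hunk context (the exact bounds arithmetic in tantivy may differ):

    /// Returns a window `(lo, hi)` of `arr` containing the first element
    /// >= `target`, found by doubling the probe distance from `start`.
    fn exponential_search(target: u32, start: usize, arr: &[u32]) -> (usize, usize) {
        let mut lo = start;
        let mut jump = 1;
        loop {
            let hi = lo + jump;
            if hi >= arr.len() {
                return (lo, arr.len());
            }
            if arr[hi] >= target {
                return (lo, hi + 1);
            }
            lo = hi;
            jump *= 2;
        }
    }

As in the hunk, the caller then runs `block_docs[start..end].binary_search(&target)` over the returned window.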
@@ -232,25 +216,19 @@ impl<TDeleteSet: DeleteSet> DocSet for SegmentPostings<TDeleteSet> {
     // next needs to be called a first time to point to the correct element.
     #[inline]
     fn advance(&mut self) -> bool {
-        loop {
-            {
-                if self.position_computer.is_some() {
-                    let term_freq = self.term_freq() as usize;
-                    self.position_computer.as_mut().unwrap().add_skip(term_freq);
-                }
-            }
-            self.cur += 1;
-            if self.cur >= self.block_cursor.block_len() {
-                self.cur = 0;
-                if !self.block_cursor.advance() {
-                    self.cur = COMPRESSION_BLOCK_SIZE;
-                    return false;
-                }
-            }
-            if !self.delete_bitset.is_deleted(self.doc()) {
-                return true;
-            }
-        }
+        if self.position_computer.is_some() {
+            let term_freq = self.term_freq() as usize;
+            self.position_computer.as_mut().unwrap().add_skip(term_freq);
+        }
+        self.cur += 1;
+        if self.cur >= self.block_cursor.block_len() {
+            self.cur = 0;
+            if !self.block_cursor.advance() {
+                self.cur = COMPRESSION_BLOCK_SIZE;
+                return false;
+            }
+        }
+        true
     }
 
     fn size_hint(&self) -> u32 {
@@ -285,13 +263,13 @@ impl<TDeleteSet: DeleteSet> DocSet for SegmentPostings<TDeleteSet> {
 }
 
 
-impl<TDeleteSet: DeleteSet> HasLen for SegmentPostings<TDeleteSet> {
+impl HasLen for SegmentPostings {
     fn len(&self) -> usize {
         self.block_cursor.doc_freq()
     }
 }
 
-impl<TDeleteSet: DeleteSet> Postings for SegmentPostings<TDeleteSet> {
+impl Postings for SegmentPostings {
     fn term_freq(&self) -> u32 {
         self.block_cursor.freq(self.cur)
     }
@@ -12,7 +12,7 @@ use query::RequiredOptionalScorer;
 use query::score_combiner::{DoNothingCombiner, ScoreCombiner, SumWithCoordsCombiner};
 use Result;
 use query::intersect_scorers;
-use query::term_query::{TermScorerWithDeletes, TermScorerNoDeletes};
+use query::term_query::TermScorer;
 
 
 fn scorer_union<TScoreCombiner>(scorers: Vec<Box<Scorer>>) -> Box<Scorer>
@@ -27,32 +27,18 @@ where
     {
         let is_all_term_queries = scorers.iter().all(|scorer| {
             let scorer_ref: &Scorer = scorer.borrow();
-            Downcast::<TermScorerWithDeletes>::is_type(scorer_ref)
+            Downcast::<TermScorer>::is_type(scorer_ref)
         });
         if is_all_term_queries {
-            let scorers: Vec<TermScorerWithDeletes> = scorers
+            let scorers: Vec<TermScorer> = scorers
                 .into_iter()
-                .map(|scorer| *Downcast::<TermScorerWithDeletes>::downcast(scorer).unwrap())
+                .map(|scorer| *Downcast::<TermScorer>::downcast(scorer).unwrap())
                 .collect();
-            let scorer: Box<Scorer> = box Union::<TermScorerWithDeletes, TScoreCombiner>::from(scorers);
+            let scorer: Box<Scorer> = box Union::<TermScorer, TScoreCombiner>::from(scorers);
             return scorer;
         }
     }
-    {
-        let is_all_term_queries = scorers.iter().all(|scorer| {
-            let scorer_ref: &Scorer = scorer.borrow();
-            Downcast::<TermScorerNoDeletes>::is_type(scorer_ref)
-        });
-        if is_all_term_queries {
-            let scorers: Vec<TermScorerNoDeletes> = scorers
-                .into_iter()
-                .map(|scorer| *Downcast::<TermScorerNoDeletes>::downcast(scorer).unwrap())
-                .collect();
-            let scorer: Box<Scorer> = box Union::<TermScorerNoDeletes, TScoreCombiner>::from(scorers);
-            return scorer;
-        }
-    }
     let scorer: Box<Scorer> = box Union::<_, TScoreCombiner>::from(scorers);
     return scorer;
@@ -19,7 +19,7 @@ mod tests {
     use query::QueryParser;
     use query::RequiredOptionalScorer;
     use query::score_combiner::SumWithCoordsCombiner;
-    use query::term_query::TermScorerNoDeletes;
+    use query::term_query::TermScorer;
 
     fn aux_test_helper() -> (Index, Field) {
         let mut schema_builder = SchemaBuilder::default();
@@ -71,7 +71,7 @@ mod tests {
         let searcher = index.searcher();
         let weight = query.weight(&*searcher, true).unwrap();
         let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
-        assert!(Downcast::<TermScorerNoDeletes>::is_type(&*scorer));
+        assert!(Downcast::<TermScorer>::is_type(&*scorer));
     }
 
     #[test]
@@ -83,7 +83,7 @@ mod tests {
         let query = query_parser.parse_query("+a +b +c").unwrap();
         let weight = query.weight(&*searcher, true).unwrap();
         let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
-        assert!(Downcast::<Intersection<TermScorerNoDeletes>>::is_type(&*scorer));
+        assert!(Downcast::<Intersection<TermScorer>>::is_type(&*scorer));
     }
     {
         let query = query_parser.parse_query("+a +(b c)").unwrap();
@@ -111,7 +111,7 @@ mod tests {
         let weight = query.weight(&*searcher, false).unwrap();
         let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
         println!("{:?}", scorer.type_name());
-        assert!(Downcast::<TermScorerNoDeletes>::is_type(&*scorer));
+        assert!(Downcast::<TermScorer>::is_type(&*scorer));
     }
 }
 
@@ -5,7 +5,7 @@ use DocId;
 use downcast::Downcast;
 use std::borrow::Borrow;
 use Score;
-use query::term_query::{TermScorerNoDeletes, TermScorerWithDeletes};
+use query::term_query::TermScorer;
 
 /// Returns the intersection scorer.
 ///
@@ -28,10 +28,10 @@ pub fn intersect_scorers(mut scorers: Vec<Box<Scorer>>) -> Box<Scorer> {
         {
             if [&left, &right].into_iter().all(|scorer| {
                 let scorer_ref: &Scorer = (*scorer).borrow();
-                Downcast::<TermScorerWithDeletes>::is_type(scorer_ref)
+                Downcast::<TermScorer>::is_type(scorer_ref)
             }) {
-                let left = *Downcast::<TermScorerWithDeletes>::downcast(left).unwrap();
-                let right = *Downcast::<TermScorerWithDeletes>::downcast(right).unwrap();
+                let left = *Downcast::<TermScorer>::downcast(left).unwrap();
+                let right = *Downcast::<TermScorer>::downcast(right).unwrap();
                 return box Intersection {
                     left,
                     right,
@@ -40,29 +40,11 @@ pub fn intersect_scorers(mut scorers: Vec<Box<Scorer>>) -> Box<Scorer> {
                 }
             }
         }
-        {
-            if [&left, &right].into_iter()
-                .all(|scorer| {
-                    let scorer_ref: &Scorer = (*scorer).borrow();
-                    Downcast::<TermScorerNoDeletes>::is_type(scorer_ref)
-                }) {
-                let left = *Downcast::<TermScorerNoDeletes>::downcast(left).unwrap();
-                let right = *Downcast::<TermScorerNoDeletes>::downcast(right).unwrap();
-                return box Intersection {
-                    left,
-                    right,
-                    others: scorers,
-                    num_docsets
-                }
-            }
-        }
-        {
-            return box Intersection {
-                left,
-                right,
-                others: scorers,
-                num_docsets
-            }
-        }
+        return box Intersection {
+            left,
+            right,
+            others: scorers,
+            num_docsets
+        }
     }
     _ => { unreachable!(); }
@@ -79,11 +79,9 @@ pub trait Query: fmt::Debug {
                 let _ = segment_search_timer.open("set_segment");
                 collector.set_segment(segment_ord as SegmentLocalId, segment_reader)?;
             }
+            let _collection_timer = segment_search_timer.open("collection");
             let mut scorer = weight.scorer(segment_reader)?;
-            {
-                let _collection_timer = segment_search_timer.open("collection");
-                scorer.collect(collector);
-            }
+            scorer.collect(collector, segment_reader.delete_bitset());
         }
     }
     Ok(timer_tree)
@@ -5,6 +5,7 @@ use docset::{DocSet, SkipResult};
 use common::BitSet;
 use std::ops::DerefMut;
 use downcast;
+use fastfield::DeleteBitSet;
 
 /// Scored set of documents matching a query within a specific segment.
 ///
@@ -17,13 +18,23 @@ pub trait Scorer: downcast::Any + DocSet + 'static {
 
     /// Consumes the complete `DocSet` and
     /// push the scored documents to the collector.
-    fn collect(&mut self, collector: &mut Collector) {
-        while self.advance() {
-            collector.collect(self.doc(), self.score());
+    fn collect(&mut self, collector: &mut Collector, delete_bitset_opt: Option<&DeleteBitSet>) {
+        if let Some(delete_bitset) = delete_bitset_opt {
+            while self.advance() {
+                let doc = self.doc();
+                if !delete_bitset.is_deleted(doc) {
+                    collector.collect(doc, self.score());
+                }
+            }
+        } else {
+            while self.advance() {
+                collector.collect(self.doc(), self.score());
+            }
         }
     }
 }
 
 
 #[allow(missing_docs)]
 mod downcast_impl {
     downcast!(super::Scorer);
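The default `collect` above now decides once, up front, whether delete filtering is needed, instead of testing a `DeleteSet` on every document. The `query.rs` hunk earlier wires it up by handing over the segment's bitset (assuming, as that hunk implies, that `SegmentReader::delete_bitset()` yields the segment's `Option<&DeleteBitSet>`):

    // Deleted documents are filtered at collection time, not inside the scorer.
    scorer.collect(collector, segment_reader.delete_bitset());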
@@ -34,9 +45,9 @@ impl Scorer for Box<Scorer> {
         self.deref_mut().score()
     }
 
-    fn collect(&mut self, collector: &mut Collector) {
+    fn collect(&mut self, collector: &mut Collector, delete_bitset: Option<&DeleteBitSet>) {
         let scorer = self.deref_mut();
-        scorer.collect(collector);
+        scorer.collect(collector, delete_bitset);
     }
 }
@@ -50,6 +61,7 @@ impl DocSet for EmptyScorer {
         false
     }
 
+
     fn doc(&self) -> DocId {
         panic!(
             "You may not call .doc() on a scorer \
@@ -6,16 +6,6 @@ pub use self::term_query::TermQuery;
 pub use self::term_weight::TermWeight;
 pub use self::term_scorer::TermScorer;
 
-use postings::SegmentPostings;
-use postings::NoDelete;
-use fastfield::DeleteBitSet;
-
-pub(crate) type TermScorerWithDeletes = TermScorer<SegmentPostings<DeleteBitSet>>;
-pub(crate) type TermScorerNoDeletes = TermScorer<SegmentPostings<NoDelete>>;
-
-
-
-
 #[cfg(test)]
 mod tests {
@@ -6,18 +6,19 @@ use query::Scorer;
 use postings::Postings;
 use fieldnorm::FieldNormReader;
 use query::bm25::BM25Weight;
+use postings::SegmentPostings;
 
-pub struct TermScorer<TPostings: Postings> {
-    postings: TPostings,
+pub struct TermScorer {
+    postings: SegmentPostings,
     fieldnorm_reader: FieldNormReader,
     similarity_weight: BM25Weight,
 }
 
 
-impl<TPostings: Postings> TermScorer<TPostings> {
-    pub fn new(postings: TPostings,
+impl TermScorer {
+    pub fn new(postings: SegmentPostings,
                fieldnorm_reader: FieldNormReader,
-               similarity_weight: BM25Weight) -> TermScorer<TPostings> {
+               similarity_weight: BM25Weight) -> TermScorer {
         TermScorer {
             postings,
             fieldnorm_reader,
@@ -26,7 +27,7 @@ impl<TPostings: Postings> TermScorer<TPostings> {
     }
 }
 
-impl<TPostings: Postings> DocSet for TermScorer<TPostings> {
+impl DocSet for TermScorer {
     fn advance(&mut self) -> bool {
         self.postings.advance()
     }
@@ -44,7 +45,7 @@ impl<TPostings: Postings> DocSet for TermScorer<TPostings> {
     }
 }
 
-impl<TPostings: Postings> Scorer for TermScorer<TPostings> {
+impl Scorer for TermScorer {
     fn score(&mut self) -> Score {
         let doc = self.doc();
         let fieldnorm_id = self.fieldnorm_reader.fieldnorm_id(doc);
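With `TermScorer` now a single concrete type instead of `TermScorer<TPostings>`, the downcast-based specializations in `boolean_weight.rs` and `intersection.rs` above collapse from two probes (`TermScorerWithDeletes` / `TermScorerNoDeletes`) to one. The pattern, pieced together from those hunks (a `scorer: Box<Scorer>` binding and the `downcast` crate's `Downcast` trait are assumed):

    let scorer_ref: &Scorer = scorer.borrow();
    if Downcast::<TermScorer>::is_type(scorer_ref) {
        // Recover the concrete scorer to build a specialized combinator,
        // e.g. Union::<TermScorer, TScoreCombiner>.
        let term_scorer: TermScorer = *Downcast::<TermScorer>::downcast(scorer).unwrap();
    }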
@@ -6,8 +6,6 @@ use docset::DocSet;
 use postings::SegmentPostings;
 use schema::IndexRecordOption;
 use super::term_scorer::TermScorer;
-use fastfield::DeleteBitSet;
-use postings::NoDelete;
 use Result;
 use query::bm25::BM25Weight;
 
@@ -24,33 +22,18 @@ impl Weight for TermWeight {
         let inverted_index = reader.inverted_index(field);
         let fieldnorm_reader = reader.get_fieldnorms_reader(field);
         let similarity_weight = self.similarity_weight.clone();
-        if reader.has_deletes() {
-            let postings_opt: Option<SegmentPostings<DeleteBitSet>> =
-                inverted_index.read_postings(&self.term, self.index_record_option);
-            if let Some(segment_postings) = postings_opt {
-                Ok(box TermScorer::new(segment_postings,
-                                       fieldnorm_reader,
-                                       similarity_weight))
-            } else {
-                Ok(box TermScorer::new(
-                    SegmentPostings::<NoDelete>::empty(),
-                    fieldnorm_reader,
-                    similarity_weight))
-            }
-        } else {
-            let postings_opt: Option<SegmentPostings<NoDelete>> =
-                inverted_index.read_postings_no_deletes(&self.term, self.index_record_option);
-            if let Some(segment_postings) = postings_opt {
-                Ok(box TermScorer::new(segment_postings,
-                                       fieldnorm_reader,
-                                       similarity_weight))
-            } else {
-                Ok(box TermScorer::new(
-                    SegmentPostings::<NoDelete>::empty(),
-                    fieldnorm_reader,
-                    similarity_weight))
-            }
-        }
+        let postings_opt: Option<SegmentPostings> =
+            inverted_index.read_postings(&self.term, self.index_record_option);
+        if let Some(segment_postings) = postings_opt {
+            Ok(box TermScorer::new(segment_postings,
+                                   fieldnorm_reader,
+                                   similarity_weight))
+        } else {
+            Ok(box TermScorer::new(
+                SegmentPostings::empty(),
+                fieldnorm_reader,
+                similarity_weight))
+        }
     }
 
     fn count(&self, reader: &SegmentReader) -> Result<u32> {