mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-05 01:50:42 +00:00
Added specialized SegmentPostings when there are no DeleteSet
This commit is contained in:
@@ -7,6 +7,7 @@ use schema::Term;
|
||||
use fastfield::DeleteBitSet;
|
||||
use compression::CompressedIntStream;
|
||||
use postings::FreqReadingOption;
|
||||
use postings::{DeleteSet, NoDelete};
|
||||
|
||||
/// The inverted index reader is in charge of accessing
|
||||
/// the inverted index associated to a specific field.
|
||||
@@ -25,7 +26,7 @@ pub struct InvertedIndexReader {
|
||||
termdict: TermDictionaryImpl,
|
||||
postings_source: ReadOnlySource,
|
||||
positions_source: ReadOnlySource,
|
||||
delete_bitset: DeleteBitSet,
|
||||
delete_bitset_opt: Option<DeleteBitSet>,
|
||||
record_option: IndexRecordOption,
|
||||
}
|
||||
|
||||
@@ -34,14 +35,14 @@ impl InvertedIndexReader {
|
||||
termdict_source: ReadOnlySource,
|
||||
postings_source: ReadOnlySource,
|
||||
positions_source: ReadOnlySource,
|
||||
delete_bitset: DeleteBitSet,
|
||||
delete_bitset_opt: Option<DeleteBitSet>,
|
||||
record_option: IndexRecordOption,
|
||||
) -> InvertedIndexReader {
|
||||
InvertedIndexReader {
|
||||
termdict: TermDictionaryImpl::from_source(termdict_source),
|
||||
postings_source,
|
||||
positions_source,
|
||||
delete_bitset,
|
||||
delete_bitset_opt,
|
||||
record_option,
|
||||
}
|
||||
}
|
||||
@@ -105,13 +106,15 @@ impl InvertedIndexReader {
|
||||
/// This method is for an advanced usage only.
|
||||
///
|
||||
/// Most user should prefer using `read_postings` instead.
|
||||
pub fn read_postings_from_terminfo(
|
||||
pub fn read_postings_from_terminfo<TDeleteSet: DeleteSet>(
|
||||
&self,
|
||||
term_info: &TermInfo,
|
||||
option: IndexRecordOption,
|
||||
) -> SegmentPostings {
|
||||
) -> SegmentPostings<TDeleteSet> {
|
||||
let block_postings = self.read_block_postings_from_terminfo(term_info, option);
|
||||
let delete_bitset = self.delete_bitset.clone();
|
||||
let delete_set = TDeleteSet::from(self.delete_bitset_opt.iter()
|
||||
.cloned()
|
||||
.next());
|
||||
let position_stream = {
|
||||
if option.has_positions() {
|
||||
let position_offset = term_info.positions_offset;
|
||||
@@ -123,7 +126,7 @@ impl InvertedIndexReader {
|
||||
None
|
||||
}
|
||||
};
|
||||
SegmentPostings::from_block_postings(block_postings, delete_bitset, position_stream)
|
||||
SegmentPostings::from_block_postings(block_postings, delete_set, position_stream)
|
||||
}
|
||||
|
||||
/// Returns the segment postings associated with the term, and with the given option,
|
||||
@@ -136,11 +139,17 @@ impl InvertedIndexReader {
|
||||
/// For instance, requesting `IndexRecordOption::Freq` for a
|
||||
/// `TextIndexingOptions` that does not index position will return a `SegmentPostings`
|
||||
/// with `DocId`s and frequencies.
|
||||
pub fn read_postings(&self, term: &Term, option: IndexRecordOption) -> Option<SegmentPostings> {
|
||||
pub fn read_postings(&self, term: &Term, option: IndexRecordOption) -> Option<SegmentPostings<DeleteBitSet>> {
|
||||
let term_info = get!(self.get_term_info(term));
|
||||
Some(self.read_postings_from_terminfo(&term_info, option))
|
||||
}
|
||||
|
||||
pub(crate) fn read_postings_no_deletes(&self, term: &Term, option: IndexRecordOption) -> Option<SegmentPostings<NoDelete>> {
|
||||
let term_info = get!(self.get_term_info(term));
|
||||
Some(self.read_postings_from_terminfo(&term_info, option))
|
||||
}
|
||||
|
||||
|
||||
/// Returns the number of documents containing the term.
|
||||
pub fn doc_freq(&self, term: &Term) -> u32 {
|
||||
self.get_term_info(term)
|
||||
|
||||
@@ -26,6 +26,7 @@ use schema::Schema;
|
||||
use termdict::TermDictionary;
|
||||
use fastfield::{FastValue, MultiValueIntFastFieldReader};
|
||||
use schema::Cardinality;
|
||||
use postings::DeleteSet;
|
||||
|
||||
/// Entry point to access all of the datastructures of the `Segment`
|
||||
///
|
||||
@@ -54,7 +55,7 @@ pub struct SegmentReader {
|
||||
fieldnorms_composite: CompositeFile,
|
||||
|
||||
store_reader: StoreReader,
|
||||
delete_bitset: DeleteBitSet,
|
||||
delete_bitset_opt: Option<DeleteBitSet>,
|
||||
schema: Schema,
|
||||
}
|
||||
|
||||
@@ -79,7 +80,13 @@ impl SegmentReader {
|
||||
/// Return the number of documents that have been
|
||||
/// deleted in the segment.
|
||||
pub fn num_deleted_docs(&self) -> DocId {
|
||||
self.delete_bitset.len() as DocId
|
||||
self.delete_bitset()
|
||||
.map(|delete_set| delete_set.len() as DocId)
|
||||
.unwrap_or(0u32)
|
||||
}
|
||||
|
||||
pub fn has_deletes(&self) -> bool {
|
||||
self.num_deleted_docs() > 0
|
||||
}
|
||||
|
||||
/// Accessor to a segment's fast field reader given a field.
|
||||
@@ -194,12 +201,13 @@ impl SegmentReader {
|
||||
let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
|
||||
let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?;
|
||||
|
||||
let delete_bitset = if segment.meta().has_deletes() {
|
||||
let delete_data = segment.open_read(SegmentComponent::DELETE)?;
|
||||
DeleteBitSet::open(delete_data)
|
||||
} else {
|
||||
DeleteBitSet::empty()
|
||||
};
|
||||
let delete_bitset_opt =
|
||||
if segment.meta().has_deletes() {
|
||||
let delete_data = segment.open_read(SegmentComponent::DELETE)?;
|
||||
Some(DeleteBitSet::open(delete_data))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let schema = segment.schema();
|
||||
Ok(SegmentReader {
|
||||
@@ -211,7 +219,7 @@ impl SegmentReader {
|
||||
fieldnorms_composite,
|
||||
segment_id: segment.id(),
|
||||
store_reader,
|
||||
delete_bitset,
|
||||
delete_bitset_opt,
|
||||
positions_composite,
|
||||
schema,
|
||||
})
|
||||
@@ -253,7 +261,7 @@ impl SegmentReader {
|
||||
termdict_source,
|
||||
postings_source,
|
||||
positions_source,
|
||||
self.delete_bitset.clone(),
|
||||
self.delete_bitset_opt.clone(),
|
||||
record_option,
|
||||
));
|
||||
|
||||
@@ -282,14 +290,16 @@ impl SegmentReader {
|
||||
|
||||
/// Returns the bitset representing
|
||||
/// the documents that have been deleted.
|
||||
pub fn delete_bitset(&self) -> &DeleteBitSet {
|
||||
&self.delete_bitset
|
||||
pub fn delete_bitset(&self) -> Option<&DeleteBitSet> {
|
||||
self.delete_bitset_opt.as_ref()
|
||||
}
|
||||
|
||||
/// Returns true iff the `doc` is marked
|
||||
/// as deleted.
|
||||
pub fn is_deleted(&self, doc: DocId) -> bool {
|
||||
self.delete_bitset.is_deleted(doc)
|
||||
self.delete_bitset()
|
||||
.map(|delete_set| delete_set.is_deleted(doc))
|
||||
.unwrap_or(false)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ use std::io;
|
||||
use directory::ReadOnlySource;
|
||||
use DocId;
|
||||
use common::HasLen;
|
||||
use postings::DeleteSet;
|
||||
|
||||
/// Write a delete `BitSet`
|
||||
///
|
||||
@@ -51,22 +52,19 @@ impl DeleteBitSet {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an empty delete bit set.
|
||||
pub fn empty() -> DeleteBitSet {
|
||||
}
|
||||
|
||||
|
||||
impl DeleteSet for DeleteBitSet {
|
||||
|
||||
fn empty() -> DeleteBitSet {
|
||||
DeleteBitSet {
|
||||
data: ReadOnlySource::empty(),
|
||||
len: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true iff the segment has some deleted documents.
|
||||
pub fn has_deletes(&self) -> bool {
|
||||
self.len() > 0
|
||||
}
|
||||
|
||||
/// Returns true iff the document is deleted.
|
||||
#[inline]
|
||||
pub fn is_deleted(&self, doc: DocId) -> bool {
|
||||
fn is_deleted(&self, doc: DocId) -> bool {
|
||||
if self.len == 0 {
|
||||
false
|
||||
} else {
|
||||
@@ -76,6 +74,18 @@ impl DeleteBitSet {
|
||||
b & (1u8 << shift) != 0
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
impl From<Option<DeleteBitSet>> for DeleteBitSet {
|
||||
fn from(delete_bitset_opt: Option<DeleteBitSet>) -> Self {
|
||||
if let Some(delete_bitset) = delete_bitset_opt {
|
||||
delete_bitset
|
||||
} else {
|
||||
DeleteBitSet::empty()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl HasLen for DeleteBitSet {
|
||||
|
||||
@@ -17,6 +17,7 @@ use store::StoreWriter;
|
||||
use std::cmp::{max, min};
|
||||
use termdict::TermDictionary;
|
||||
use termdict::TermStreamer;
|
||||
use postings::DeleteSet;
|
||||
|
||||
pub struct IndexMerger {
|
||||
schema: Schema,
|
||||
@@ -27,22 +28,28 @@ pub struct IndexMerger {
|
||||
fn compute_min_max_val(
|
||||
u64_reader: &FastFieldReader<u64>,
|
||||
max_doc: DocId,
|
||||
delete_bitset: &DeleteBitSet,
|
||||
delete_bitset_opt: Option<&DeleteBitSet>,
|
||||
) -> Option<(u64, u64)> {
|
||||
if max_doc == 0 {
|
||||
None
|
||||
} else if !delete_bitset.has_deletes() {
|
||||
// no deleted documents,
|
||||
// we can use the previous min_val, max_val.
|
||||
Some((u64_reader.min_value(), u64_reader.max_value()))
|
||||
} else {
|
||||
// some deleted documents,
|
||||
// we need to recompute the max / min
|
||||
(0..max_doc)
|
||||
.filter(|doc_id| !delete_bitset.is_deleted(*doc_id))
|
||||
.map(|doc_id| u64_reader.get(doc_id))
|
||||
.minmax()
|
||||
.into_option()
|
||||
match delete_bitset_opt {
|
||||
Some(delete_bitset) => {
|
||||
// some deleted documents,
|
||||
// we need to recompute the max / min
|
||||
(0..max_doc)
|
||||
.filter(|doc_id| !delete_bitset.is_deleted(*doc_id))
|
||||
.map(|doc_id| u64_reader.get(doc_id))
|
||||
.minmax()
|
||||
.into_option()
|
||||
|
||||
}
|
||||
None => {
|
||||
// no deleted documents,
|
||||
// we can use the previous min_val, max_val.
|
||||
Some((u64_reader.min_value(), u64_reader.max_value()))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -150,7 +157,7 @@ impl IndexMerger {
|
||||
if let Some((seg_min_val, seg_max_val)) = compute_min_max_val(
|
||||
&u64_reader,
|
||||
reader.max_doc(),
|
||||
reader.delete_bitset(),
|
||||
reader.delete_bitset()
|
||||
) {
|
||||
// the segment has some non-deleted documents
|
||||
min_val = min(min_val, seg_min_val);
|
||||
@@ -181,12 +188,15 @@ impl IndexMerger {
|
||||
|
||||
let mut fast_single_field_serializer =
|
||||
fast_field_serializer.new_u64_fast_field(field, min_val, max_val)?;
|
||||
for (max_doc, u64_reader, delete_bitset) in u64_readers {
|
||||
for (max_doc, u64_reader, delete_bitset_opt) in u64_readers {
|
||||
for doc_id in 0..max_doc {
|
||||
if !delete_bitset.is_deleted(doc_id) {
|
||||
let val = u64_reader.get(doc_id);
|
||||
fast_single_field_serializer.add_val(val)?;
|
||||
if let Some(ref delete_bitset) = delete_bitset_opt {
|
||||
if delete_bitset.is_deleted(doc_id) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
let val = u64_reader.get(doc_id);
|
||||
fast_single_field_serializer.add_val(val)?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -271,7 +281,7 @@ impl IndexMerger {
|
||||
let segment_reader = &self.readers[heap_item.segment_ord];
|
||||
let inverted_index = segment_reader.inverted_index(indexed_field);
|
||||
let mut segment_postings = inverted_index
|
||||
.read_postings_from_terminfo(term_info, segment_postings_option);
|
||||
.read_postings_from_terminfo::<DeleteBitSet>(term_info, segment_postings_option);
|
||||
if segment_postings.advance() {
|
||||
Some((segment_ord, segment_postings))
|
||||
} else {
|
||||
|
||||
27
src/postings/delete_set.rs
Normal file
27
src/postings/delete_set.rs
Normal file
@@ -0,0 +1,27 @@
|
||||
use fastfield::DeleteBitSet;
|
||||
use DocId;
|
||||
|
||||
pub trait DeleteSet: 'static + From<Option<DeleteBitSet>> {
|
||||
fn is_deleted(&self, doc: DocId) -> bool;
|
||||
fn empty() -> Self;
|
||||
}
|
||||
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct NoDelete;
|
||||
impl DeleteSet for NoDelete {
|
||||
#[inline(always)]
|
||||
fn is_deleted(&self, doc: DocId) -> bool {
|
||||
false
|
||||
}
|
||||
fn empty() -> Self {
|
||||
NoDelete
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Option<DeleteBitSet>> for NoDelete {
|
||||
fn from(delete_bitset_opt: Option<DeleteBitSet>) -> Self {
|
||||
assert!(delete_bitset_opt.is_none(), "NoDelete should not be used if there are some deleted documents.");
|
||||
NoDelete
|
||||
}
|
||||
}
|
||||
@@ -13,11 +13,13 @@ mod serializer;
|
||||
mod postings_writer;
|
||||
mod term_info;
|
||||
mod segment_postings;
|
||||
mod delete_set;
|
||||
|
||||
use self::recorder::{NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder};
|
||||
pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
|
||||
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
|
||||
|
||||
pub use self::delete_set::{DeleteSet, NoDelete};
|
||||
pub use self::term_info::TermInfo;
|
||||
pub use self::postings::Postings;
|
||||
|
||||
@@ -42,7 +44,7 @@ pub mod tests {
|
||||
use DocId;
|
||||
use Score;
|
||||
use query::Intersection;
|
||||
use query::Scorer;
|
||||
use query::{Weight, Scorer};
|
||||
use schema::{Document, SchemaBuilder, Term, INT_INDEXED, STRING, TEXT};
|
||||
use core::SegmentComponent;
|
||||
use indexer::SegmentWriter;
|
||||
@@ -97,53 +99,54 @@ pub mod tests {
|
||||
index_writer.add_document(doc!(title => r#"abc be be be be abc"#));
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
|
||||
let searcher = index.searcher();
|
||||
let query = TermQuery::new(
|
||||
Term::from_field_text(title, "abc"),
|
||||
IndexRecordOption::WithFreqsAndPositions,
|
||||
);
|
||||
let weight = query.specialized_weight(&*searcher, true);
|
||||
let inverted_index = searcher.segment_reader(0u32).inverted_index(title);
|
||||
let term = Term::from_field_text(title, "abc");
|
||||
|
||||
|
||||
{
|
||||
let mut scorer = weight
|
||||
.specialized_scorer(searcher.segment_reader(0u32))
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap();
|
||||
scorer.advance();
|
||||
assert_eq!(&[0, 1, 2], scorer.postings().positions());
|
||||
scorer.advance();
|
||||
assert_eq!(&[0, 5], scorer.postings().positions());
|
||||
postings.advance();
|
||||
assert_eq!(&[0, 1, 2], postings.positions());
|
||||
postings.advance();
|
||||
assert_eq!(&[0, 5], postings.positions());
|
||||
}
|
||||
{
|
||||
let mut scorer = weight
|
||||
.specialized_scorer(searcher.segment_reader(0u32))
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap();
|
||||
scorer.advance();
|
||||
scorer.advance();
|
||||
assert_eq!(&[0, 5], scorer.postings().positions());
|
||||
postings.advance();
|
||||
postings.advance();
|
||||
assert_eq!(&[0, 5], postings.positions());
|
||||
}
|
||||
{
|
||||
let mut scorer = weight
|
||||
.specialized_scorer(searcher.segment_reader(0u32))
|
||||
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap();
|
||||
assert_eq!(scorer.skip_next(1), SkipResult::Reached);
|
||||
assert_eq!(scorer.doc(), 1);
|
||||
assert_eq!(&[0, 5], scorer.postings().positions());
|
||||
assert_eq!(postings.skip_next(1), SkipResult::Reached);
|
||||
assert_eq!(postings.doc(), 1);
|
||||
assert_eq!(&[0, 5], postings.positions());
|
||||
}
|
||||
{
|
||||
let mut scorer = weight
|
||||
.specialized_scorer(searcher.segment_reader(0u32))
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap();
|
||||
assert_eq!(scorer.skip_next(1002), SkipResult::Reached);
|
||||
assert_eq!(scorer.doc(), 1002);
|
||||
assert_eq!(&[0, 5], scorer.postings().positions());
|
||||
assert_eq!(postings.skip_next(1002), SkipResult::Reached);
|
||||
assert_eq!(postings.doc(), 1002);
|
||||
assert_eq!(&[0, 5], postings.positions());
|
||||
}
|
||||
{
|
||||
let mut scorer = weight
|
||||
.specialized_scorer(searcher.segment_reader(0u32))
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap();
|
||||
assert_eq!(scorer.skip_next(100), SkipResult::Reached);
|
||||
assert_eq!(scorer.skip_next(1002), SkipResult::Reached);
|
||||
assert_eq!(scorer.doc(), 1002);
|
||||
assert_eq!(&[0, 5], scorer.postings().positions());
|
||||
assert_eq!(postings.skip_next(100), SkipResult::Reached);
|
||||
assert_eq!(postings.skip_next(1002), SkipResult::Reached);
|
||||
assert_eq!(postings.doc(), 1002);
|
||||
assert_eq!(&[0, 5], postings.positions());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -277,18 +280,16 @@ pub mod tests {
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
let term_query = TermQuery::new(
|
||||
Term::from_field_text(text_field, "a"),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
let term_a = Term::from_field_text(text_field, "a");
|
||||
let searcher = index.searcher();
|
||||
let mut term_weight = term_query.specialized_weight(&*searcher, true);
|
||||
term_weight.index_record_option = IndexRecordOption::WithFreqsAndPositions;
|
||||
let segment_reader = &searcher.segment_readers()[0];
|
||||
let mut term_scorer = term_weight.specialized_scorer(segment_reader).unwrap();
|
||||
assert!(term_scorer.advance());
|
||||
assert_eq!(term_scorer.doc(), 1u32);
|
||||
assert_eq!(term_scorer.postings().positions(), &[1u32, 4]);
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let mut postings = segment_reader
|
||||
.inverted_index(text_field)
|
||||
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 1u32);
|
||||
assert_eq!(postings.positions(), &[1u32, 4]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -8,6 +8,7 @@ use docset::{DocSet, SkipResult};
|
||||
use std::cmp;
|
||||
use fst::Streamer;
|
||||
use compression::compressed_block_size;
|
||||
use postings::{NoDelete, DeleteSet};
|
||||
use fastfield::DeleteBitSet;
|
||||
use std::cell::UnsafeCell;
|
||||
use directory::{ReadOnlySource, SourceRead};
|
||||
@@ -66,14 +67,25 @@ impl PositionComputer {
|
||||
///
|
||||
/// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded.
|
||||
/// Positions on the other hand, are optionally entirely decoded upfront.
|
||||
pub struct SegmentPostings {
|
||||
pub struct SegmentPostings<TDeleteSet: DeleteSet> {
|
||||
block_cursor: BlockSegmentPostings,
|
||||
cur: usize,
|
||||
delete_bitset: DeleteBitSet,
|
||||
delete_bitset: TDeleteSet,
|
||||
position_computer: Option<UnsafeCell<PositionComputer>>,
|
||||
}
|
||||
|
||||
impl SegmentPostings {
|
||||
impl SegmentPostings<NoDelete> {
|
||||
/// Returns an empty segment postings object
|
||||
pub fn empty() -> Self {
|
||||
let empty_block_cursor = BlockSegmentPostings::empty();
|
||||
SegmentPostings {
|
||||
block_cursor: empty_block_cursor,
|
||||
delete_bitset: NoDelete,
|
||||
cur: COMPRESSION_BLOCK_SIZE,
|
||||
position_computer: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a segment postings object with the given documents
|
||||
/// and no frequency encoded.
|
||||
///
|
||||
@@ -82,14 +94,14 @@ impl SegmentPostings {
|
||||
/// It serializes the doc ids using tantivy's codec
|
||||
/// and returns a `SegmentPostings` object that embeds a
|
||||
/// buffer with the serialized data.
|
||||
pub fn create_from_docs(docs: &[u32]) -> SegmentPostings {
|
||||
pub fn create_from_docs(docs: &[u32]) -> SegmentPostings<NoDelete> {
|
||||
let mut buffer = Vec::new();
|
||||
{
|
||||
let mut postings_serializer = PostingsSerializer::new(&mut buffer, false);
|
||||
for &doc in docs {
|
||||
postings_serializer.write_doc(doc, 1u32).unwrap();
|
||||
}
|
||||
postings_serializer.close_term().unwrap();
|
||||
postings_serializer.close_term().expect("In memory Serialization should never fail.");
|
||||
}
|
||||
let data = ReadOnlySource::from(buffer);
|
||||
let block_segment_postings = BlockSegmentPostings::from_data(
|
||||
@@ -97,8 +109,20 @@ impl SegmentPostings {
|
||||
SourceRead::from(data),
|
||||
FreqReadingOption::NoFreq,
|
||||
);
|
||||
SegmentPostings::from_block_postings(block_segment_postings, DeleteBitSet::empty(), None)
|
||||
SegmentPostings::from_block_postings(block_segment_postings, NoDelete, None)
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDeleteSet: DeleteSet> SegmentPostings<TDeleteSet> {
|
||||
fn position_add_skip<F: FnOnce() -> usize>(&self, num_skips_fn: F) {
|
||||
if let Some(position_computer) = self.position_computer.as_ref() {
|
||||
let num_skips = num_skips_fn();
|
||||
unsafe {
|
||||
(*position_computer.get()).add_skip(num_skips);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Reads a Segment postings from an &[u8]
|
||||
///
|
||||
@@ -108,9 +132,9 @@ impl SegmentPostings {
|
||||
/// frequencies and/or positions
|
||||
pub fn from_block_postings(
|
||||
segment_block_postings: BlockSegmentPostings,
|
||||
delete_bitset: DeleteBitSet,
|
||||
delete_bitset: TDeleteSet,
|
||||
positions_stream_opt: Option<CompressedIntStream>,
|
||||
) -> SegmentPostings {
|
||||
) -> SegmentPostings<TDeleteSet> {
|
||||
let position_computer =
|
||||
positions_stream_opt.map(|stream| UnsafeCell::new(PositionComputer::new(stream)));
|
||||
SegmentPostings {
|
||||
@@ -120,29 +144,9 @@ impl SegmentPostings {
|
||||
position_computer,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an empty segment postings object
|
||||
pub fn empty() -> SegmentPostings {
|
||||
let empty_block_cursor = BlockSegmentPostings::empty();
|
||||
SegmentPostings {
|
||||
block_cursor: empty_block_cursor,
|
||||
delete_bitset: DeleteBitSet::empty(),
|
||||
cur: COMPRESSION_BLOCK_SIZE,
|
||||
position_computer: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn position_add_skip<F: FnOnce() -> usize>(&self, num_skips_fn: F) {
|
||||
if let Some(position_computer) = self.position_computer.as_ref() {
|
||||
let num_skips = num_skips_fn();
|
||||
unsafe {
|
||||
(*position_computer.get()).add_skip(num_skips);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl DocSet for SegmentPostings {
|
||||
impl<TDeleteSet: DeleteSet> DocSet for SegmentPostings<TDeleteSet> {
|
||||
// goes to the next element.
|
||||
// next needs to be called a first time to point to the correct element.
|
||||
#[inline]
|
||||
@@ -299,13 +303,14 @@ impl DocSet for SegmentPostings {
|
||||
}
|
||||
}
|
||||
|
||||
impl HasLen for SegmentPostings {
|
||||
|
||||
impl<TDeleteSet: DeleteSet> HasLen for SegmentPostings<TDeleteSet> {
|
||||
fn len(&self) -> usize {
|
||||
self.block_cursor.doc_freq()
|
||||
}
|
||||
}
|
||||
|
||||
impl Postings for SegmentPostings {
|
||||
impl<TDeleteSet: DeleteSet> Postings for SegmentPostings<TDeleteSet> {
|
||||
fn term_freq(&self) -> u32 {
|
||||
self.block_cursor.freq(self.cur)
|
||||
}
|
||||
|
||||
@@ -10,6 +10,9 @@ use std::borrow::Borrow;
|
||||
use query::Exclude;
|
||||
use query::Occur;
|
||||
use query::RequiredOptionalScorer;
|
||||
use query::IntersectionTwoTerms;
|
||||
use fastfield::DeleteBitSet;
|
||||
use postings::NoDelete;
|
||||
use query::score_combiner::{DoNothingCombiner, ScoreCombiner, SumWithCoordsCombiner};
|
||||
use Result;
|
||||
|
||||
@@ -19,24 +22,41 @@ where
|
||||
{
|
||||
assert!(!scorers.is_empty());
|
||||
if scorers.len() == 1 {
|
||||
scorers.into_iter().next().unwrap() //< we checked the size beforehands
|
||||
} else {
|
||||
return scorers.into_iter().next().unwrap(); //< we checked the size beforehands
|
||||
}
|
||||
|
||||
{
|
||||
let is_all_term_queries = scorers.iter().all(|scorer| {
|
||||
let scorer_ref: &Scorer = scorer.borrow();
|
||||
Downcast::<TermScorer>::is_type(scorer_ref)
|
||||
Downcast::<TermScorer<DeleteBitSet>>::is_type(scorer_ref)
|
||||
});
|
||||
if is_all_term_queries {
|
||||
let scorers: Vec<TermScorer> = scorers
|
||||
let scorers: Vec<TermScorer<DeleteBitSet>> = scorers
|
||||
.into_iter()
|
||||
.map(|scorer| *Downcast::<TermScorer>::downcast(scorer).unwrap())
|
||||
.map(|scorer| *Downcast::<TermScorer<DeleteBitSet>>::downcast(scorer).unwrap())
|
||||
.collect();
|
||||
let scorer: Box<Scorer> = box Union::<TermScorer, TScoreCombiner>::from(scorers);
|
||||
scorer
|
||||
} else {
|
||||
let scorer: Box<Scorer> = box Union::<_, TScoreCombiner>::from(scorers);
|
||||
scorer
|
||||
let scorer: Box<Scorer> = box Union::<TermScorer<DeleteBitSet>, TScoreCombiner>::from(scorers);
|
||||
return scorer;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
let is_all_term_queries = scorers.iter().all(|scorer| {
|
||||
let scorer_ref: &Scorer = scorer.borrow();
|
||||
Downcast::<TermScorer<NoDelete>>::is_type(scorer_ref)
|
||||
});
|
||||
if is_all_term_queries {
|
||||
let scorers: Vec<TermScorer<NoDelete>> = scorers
|
||||
.into_iter()
|
||||
.map(|scorer| *Downcast::<TermScorer<NoDelete>>::downcast(scorer).unwrap())
|
||||
.collect();
|
||||
let scorer: Box<Scorer> = box Union::<TermScorer<NoDelete>, TScoreCombiner>::from(scorers);
|
||||
return scorer;
|
||||
}
|
||||
}
|
||||
let scorer: Box<Scorer> = box Union::<_, TScoreCombiner>::from(scorers);
|
||||
return scorer;
|
||||
|
||||
}
|
||||
|
||||
pub struct BooleanWeight {
|
||||
@@ -74,26 +94,55 @@ impl BooleanWeight {
|
||||
.map(scorer_union::<TScoreCombiner>);
|
||||
|
||||
let must_scorer_opt: Option<Box<Scorer>> =
|
||||
per_occur_scorers.remove(&Occur::Must).map(|scorers| {
|
||||
per_occur_scorers.remove(&Occur::Must).map(|mut scorers| {
|
||||
if scorers.len() == 1 {
|
||||
scorers.into_iter().next().unwrap()
|
||||
} else {
|
||||
return scorers.into_iter().next().unwrap();
|
||||
}
|
||||
scorers.sort_by_key(|scorer| scorer.size_hint());
|
||||
{
|
||||
let is_all_term_queries = scorers.iter().all(|scorer| {
|
||||
let scorer_ref: &Scorer = scorer.borrow();
|
||||
Downcast::<TermScorer>::is_type(scorer_ref)
|
||||
Downcast::<TermScorer<DeleteBitSet>>::is_type(scorer_ref)
|
||||
});
|
||||
if is_all_term_queries {
|
||||
let scorers: Vec<TermScorer> = scorers
|
||||
.into_iter()
|
||||
.map(|scorer| *Downcast::<TermScorer>::downcast(scorer).unwrap())
|
||||
.collect();
|
||||
let scorer: Box<Scorer> = box Intersection::from(scorers);
|
||||
scorer
|
||||
} else {
|
||||
let scorer: Box<Scorer> = box Intersection::from(scorers);
|
||||
scorer
|
||||
if scorers.len() == 2 {
|
||||
let right = scorers.pop().unwrap();
|
||||
let left = scorers.pop().unwrap();
|
||||
return box IntersectionTwoTerms::new(left, right);
|
||||
} else {
|
||||
let mut scorers: Vec<TermScorer<DeleteBitSet>> = scorers
|
||||
.into_iter()
|
||||
.map(|scorer| *Downcast::<TermScorer<DeleteBitSet>>::downcast(scorer).unwrap())
|
||||
.collect();
|
||||
let scorer: Box<Scorer> = box Intersection::from(scorers);
|
||||
return scorer;
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
let is_all_term_queries = scorers.iter().all(|scorer| {
|
||||
let scorer_ref: &Scorer = scorer.borrow();
|
||||
Downcast::<TermScorer<NoDelete>>::is_type(scorer_ref)
|
||||
});
|
||||
if is_all_term_queries {
|
||||
if scorers.len() == 2 {
|
||||
let right = scorers.pop().unwrap();
|
||||
let left = scorers.pop().unwrap();
|
||||
return box IntersectionTwoTerms::new(left, right);
|
||||
} else {
|
||||
let mut scorers: Vec<TermScorer<NoDelete>> = scorers
|
||||
.into_iter()
|
||||
.map(|scorer| *Downcast::<TermScorer<NoDelete>>::downcast(scorer).unwrap())
|
||||
.collect();
|
||||
let scorer: Box<Scorer> = box Intersection::from(scorers);
|
||||
return scorer;
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
let scorer: Box<Scorer> = box Intersection::from(scorers);
|
||||
scorer
|
||||
}
|
||||
});
|
||||
|
||||
let positive_scorer: Box<Scorer> = match (should_scorer_opt, must_scorer_opt) {
|
||||
|
||||
@@ -85,7 +85,7 @@ mod tests {
|
||||
let query = query_parser.parse_query("+a +b +c").unwrap();
|
||||
let weight = query.weight(&*searcher, true).unwrap();
|
||||
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
|
||||
assert!(Downcast::<Intersection<TermScorer>>::is_type(&*scorer));
|
||||
assert!(Downcast::<Intersection<TermScorer<NoDelete>>>::is_type(&*scorer));
|
||||
}
|
||||
{
|
||||
let query = query_parser.parse_query("+a +(b c)").unwrap();
|
||||
@@ -95,6 +95,8 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
use postings::NoDelete;
|
||||
|
||||
#[test]
|
||||
pub fn test_boolean_reqopt() {
|
||||
let (index, text_field) = aux_test_helper();
|
||||
@@ -110,7 +112,7 @@ mod tests {
|
||||
let query = query_parser.parse_query("+a b").unwrap();
|
||||
let weight = query.weight(&*searcher, false).unwrap();
|
||||
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
|
||||
assert!(Downcast::<TermScorer>::is_type(&*scorer));
|
||||
assert!(Downcast::<TermScorer<NoDelete>>::is_type(&*scorer));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
73
src/query/intersection_two.rs
Normal file
73
src/query/intersection_two.rs
Normal file
@@ -0,0 +1,73 @@
|
||||
use docset::DocSet;
|
||||
use query::Scorer;
|
||||
use DocId;
|
||||
use Score;
|
||||
use std::mem;
|
||||
use query::term_query::TermScorer;
|
||||
use postings::DeleteSet;
|
||||
use SkipResult;
|
||||
|
||||
|
||||
/// Creates a `DocSet` that iterator through the intersection of two `DocSet`s.
|
||||
pub struct IntersectionTwoTerms<TDocSet> {
|
||||
left: TDocSet,
|
||||
right: TDocSet
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> IntersectionTwoTerms<TDocSet> {
|
||||
pub fn new(left: TDocSet, mut right: TDocSet) -> IntersectionTwoTerms<TDocSet> {
|
||||
IntersectionTwoTerms {
|
||||
left,
|
||||
right
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> DocSet for IntersectionTwoTerms<TDocSet> {
|
||||
|
||||
fn advance(&mut self) -> bool {
|
||||
let (left, right) = (&mut self.left, &mut self.right);
|
||||
if !left.advance() {
|
||||
return false;
|
||||
}
|
||||
let mut candidate = left.doc();
|
||||
loop {
|
||||
match right.skip_next(candidate) {
|
||||
SkipResult::Reached => {
|
||||
return true;
|
||||
}
|
||||
SkipResult::End => {
|
||||
return false;
|
||||
}
|
||||
SkipResult::OverStep => {
|
||||
candidate = right.doc();
|
||||
}
|
||||
}
|
||||
match left.skip_next(candidate) {
|
||||
SkipResult::Reached => {
|
||||
return true;
|
||||
}
|
||||
SkipResult::End => {
|
||||
return false;
|
||||
}
|
||||
SkipResult::OverStep => {
|
||||
candidate = left.doc();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.left.doc()
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.left.size_hint().min(self.right.size_hint())
|
||||
}
|
||||
}
|
||||
|
||||
impl<TScorer: Scorer> Scorer for IntersectionTwoTerms<TScorer> {
|
||||
fn score(&mut self) -> Score {
|
||||
self.left.score() + self.right.score()
|
||||
}
|
||||
}
|
||||
@@ -16,6 +16,7 @@ mod range_query;
|
||||
mod exclude;
|
||||
mod union;
|
||||
mod intersection;
|
||||
mod intersection_two;
|
||||
mod reqopt_scorer;
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -45,3 +46,4 @@ pub use self::weight::Weight;
|
||||
pub use self::all_query::{AllQuery, AllScorer, AllWeight};
|
||||
pub use self::range_query::RangeQuery;
|
||||
pub use self::scorer::ConstScorer;
|
||||
pub use self::intersection_two::IntersectionTwoTerms;
|
||||
|
||||
@@ -2,14 +2,15 @@ use DocId;
|
||||
use docset::{DocSet, SkipResult};
|
||||
use postings::{Postings, SegmentPostings};
|
||||
use query::{Intersection, Scorer};
|
||||
use fastfield::DeleteBitSet;
|
||||
|
||||
struct PostingsWithOffset {
|
||||
offset: u32,
|
||||
segment_postings: SegmentPostings,
|
||||
segment_postings: SegmentPostings<DeleteBitSet>,
|
||||
}
|
||||
|
||||
impl PostingsWithOffset {
|
||||
pub fn new(segment_postings: SegmentPostings, offset: u32) -> PostingsWithOffset {
|
||||
pub fn new(segment_postings: SegmentPostings<DeleteBitSet>, offset: u32) -> PostingsWithOffset {
|
||||
PostingsWithOffset {
|
||||
offset,
|
||||
segment_postings,
|
||||
@@ -50,7 +51,7 @@ pub struct PhraseScorer {
|
||||
}
|
||||
|
||||
impl PhraseScorer {
|
||||
pub fn new(term_postings: Vec<SegmentPostings>) -> PhraseScorer {
|
||||
pub fn new(term_postings: Vec<SegmentPostings<DeleteBitSet>>) -> PhraseScorer {
|
||||
let postings_with_offsets: Vec<_> = term_postings
|
||||
.into_iter()
|
||||
.enumerate()
|
||||
|
||||
@@ -5,20 +5,21 @@ use postings::SegmentPostings;
|
||||
use query::Scorer;
|
||||
use postings::Postings;
|
||||
use fastfield::FastFieldReader;
|
||||
use postings::{NoDelete, DeleteSet};
|
||||
|
||||
pub struct TermScorer {
|
||||
pub struct TermScorer<TDeleteSet: DeleteSet=NoDelete> {
|
||||
pub idf: Score,
|
||||
pub fieldnorm_reader_opt: Option<FastFieldReader<u64>>,
|
||||
pub postings: SegmentPostings,
|
||||
pub postings: SegmentPostings<TDeleteSet>,
|
||||
}
|
||||
|
||||
impl TermScorer {
|
||||
pub fn postings(&self) -> &SegmentPostings {
|
||||
impl<TDeleteSet: DeleteSet> TermScorer<TDeleteSet> {
|
||||
pub fn postings(&self) -> &SegmentPostings<TDeleteSet> {
|
||||
&self.postings
|
||||
}
|
||||
}
|
||||
|
||||
impl DocSet for TermScorer {
|
||||
impl<TDeleteSet: DeleteSet> DocSet for TermScorer<TDeleteSet> {
|
||||
fn advance(&mut self) -> bool {
|
||||
self.postings.advance()
|
||||
}
|
||||
@@ -36,7 +37,7 @@ impl DocSet for TermScorer {
|
||||
}
|
||||
}
|
||||
|
||||
impl Scorer for TermScorer {
|
||||
impl<TDeleteSet: DeleteSet> Scorer for TermScorer<TDeleteSet> {
|
||||
fn score(&mut self) -> Score {
|
||||
let doc = self.postings.doc();
|
||||
let tf = match self.fieldnorm_reader_opt {
|
||||
|
||||
@@ -6,6 +6,8 @@ use docset::DocSet;
|
||||
use postings::SegmentPostings;
|
||||
use schema::IndexRecordOption;
|
||||
use super::term_scorer::TermScorer;
|
||||
use fastfield::DeleteBitSet;
|
||||
use postings::{DeleteSet, NoDelete};
|
||||
use Result;
|
||||
|
||||
pub struct TermWeight {
|
||||
@@ -17,8 +19,46 @@ pub struct TermWeight {
|
||||
|
||||
impl Weight for TermWeight {
|
||||
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
|
||||
let specialized_scorer = self.specialized_scorer(reader)?;
|
||||
Ok(box specialized_scorer)
|
||||
let field = self.term.field();
|
||||
let inverted_index = reader.inverted_index(field);
|
||||
let fieldnorm_reader_opt = reader.get_fieldnorms_reader(field);
|
||||
let scorer: Box<Scorer>;
|
||||
if reader.has_deletes() {
|
||||
let postings_opt: Option<SegmentPostings<DeleteBitSet>> =
|
||||
inverted_index.read_postings(&self.term, self.index_record_option);
|
||||
scorer =
|
||||
if let Some(segment_postings) = postings_opt {
|
||||
box TermScorer {
|
||||
idf: self.idf(),
|
||||
fieldnorm_reader_opt,
|
||||
postings: segment_postings,
|
||||
}
|
||||
} else {
|
||||
box TermScorer {
|
||||
idf: 1f32,
|
||||
fieldnorm_reader_opt: None,
|
||||
postings: SegmentPostings::<NoDelete>::empty(),
|
||||
}
|
||||
};
|
||||
} else {
|
||||
let postings_opt: Option<SegmentPostings<NoDelete>> =
|
||||
inverted_index.read_postings_no_deletes(&self.term, self.index_record_option);
|
||||
scorer =
|
||||
if let Some(segment_postings) = postings_opt {
|
||||
box TermScorer {
|
||||
idf: self.idf(),
|
||||
fieldnorm_reader_opt,
|
||||
postings: segment_postings,
|
||||
}
|
||||
} else {
|
||||
box TermScorer {
|
||||
idf: 1f32,
|
||||
fieldnorm_reader_opt: None,
|
||||
postings: SegmentPostings::<NoDelete>::empty(),
|
||||
}
|
||||
};
|
||||
}
|
||||
Ok(scorer)
|
||||
}
|
||||
|
||||
fn count(&self, reader: &SegmentReader) -> Result<u32> {
|
||||
@@ -30,7 +70,7 @@ impl Weight for TermWeight {
|
||||
.map(|term_info| term_info.doc_freq)
|
||||
.unwrap_or(0))
|
||||
} else {
|
||||
Ok(self.specialized_scorer(reader)?.count())
|
||||
Ok(self.scorer(reader)?.count())
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -39,26 +79,4 @@ impl TermWeight {
|
||||
fn idf(&self) -> f32 {
|
||||
1.0 + (self.num_docs as f32 / (self.doc_freq as f32 + 1.0)).ln()
|
||||
}
|
||||
|
||||
/// If the field is not found, returns an empty `DocSet`.
|
||||
pub fn specialized_scorer(&self, reader: &SegmentReader) -> Result<TermScorer> {
|
||||
let field = self.term.field();
|
||||
let inverted_index = reader.inverted_index(field);
|
||||
let fieldnorm_reader_opt = reader.get_fieldnorms_reader(field);
|
||||
let postings_opt: Option<SegmentPostings> =
|
||||
inverted_index.read_postings(&self.term, self.index_record_option);
|
||||
if let Some(segment_postings) = postings_opt {
|
||||
Ok(TermScorer {
|
||||
idf: self.idf(),
|
||||
fieldnorm_reader_opt,
|
||||
postings: segment_postings,
|
||||
})
|
||||
} else {
|
||||
Ok(TermScorer {
|
||||
idf: 1f32,
|
||||
fieldnorm_reader_opt: None,
|
||||
postings: SegmentPostings::empty(),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user