diff --git a/src/core/inverted_index_reader.rs b/src/core/inverted_index_reader.rs index cea7e4e4b..3eb11165d 100644 --- a/src/core/inverted_index_reader.rs +++ b/src/core/inverted_index_reader.rs @@ -7,6 +7,7 @@ use schema::Term; use fastfield::DeleteBitSet; use compression::CompressedIntStream; use postings::FreqReadingOption; +use postings::{DeleteSet, NoDelete}; /// The inverted index reader is in charge of accessing /// the inverted index associated to a specific field. @@ -25,7 +26,7 @@ pub struct InvertedIndexReader { termdict: TermDictionaryImpl, postings_source: ReadOnlySource, positions_source: ReadOnlySource, - delete_bitset: DeleteBitSet, + delete_bitset_opt: Option, record_option: IndexRecordOption, } @@ -34,14 +35,14 @@ impl InvertedIndexReader { termdict_source: ReadOnlySource, postings_source: ReadOnlySource, positions_source: ReadOnlySource, - delete_bitset: DeleteBitSet, + delete_bitset_opt: Option, record_option: IndexRecordOption, ) -> InvertedIndexReader { InvertedIndexReader { termdict: TermDictionaryImpl::from_source(termdict_source), postings_source, positions_source, - delete_bitset, + delete_bitset_opt, record_option, } } @@ -105,13 +106,15 @@ impl InvertedIndexReader { /// This method is for an advanced usage only. /// /// Most user should prefer using `read_postings` instead. - pub fn read_postings_from_terminfo( + pub fn read_postings_from_terminfo( &self, term_info: &TermInfo, option: IndexRecordOption, - ) -> SegmentPostings { + ) -> SegmentPostings { let block_postings = self.read_block_postings_from_terminfo(term_info, option); - let delete_bitset = self.delete_bitset.clone(); + let delete_set = TDeleteSet::from(self.delete_bitset_opt.iter() + .cloned() + .next()); let position_stream = { if option.has_positions() { let position_offset = term_info.positions_offset; @@ -123,7 +126,7 @@ impl InvertedIndexReader { None } }; - SegmentPostings::from_block_postings(block_postings, delete_bitset, position_stream) + SegmentPostings::from_block_postings(block_postings, delete_set, position_stream) } /// Returns the segment postings associated with the term, and with the given option, @@ -136,11 +139,17 @@ impl InvertedIndexReader { /// For instance, requesting `IndexRecordOption::Freq` for a /// `TextIndexingOptions` that does not index position will return a `SegmentPostings` /// with `DocId`s and frequencies. - pub fn read_postings(&self, term: &Term, option: IndexRecordOption) -> Option { + pub fn read_postings(&self, term: &Term, option: IndexRecordOption) -> Option> { let term_info = get!(self.get_term_info(term)); Some(self.read_postings_from_terminfo(&term_info, option)) } + pub(crate) fn read_postings_no_deletes(&self, term: &Term, option: IndexRecordOption) -> Option> { + let term_info = get!(self.get_term_info(term)); + Some(self.read_postings_from_terminfo(&term_info, option)) + } + + /// Returns the number of documents containing the term. pub fn doc_freq(&self, term: &Term) -> u32 { self.get_term_info(term) diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 85ddbfea0..670223812 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -26,6 +26,7 @@ use schema::Schema; use termdict::TermDictionary; use fastfield::{FastValue, MultiValueIntFastFieldReader}; use schema::Cardinality; +use postings::DeleteSet; /// Entry point to access all of the datastructures of the `Segment` /// @@ -54,7 +55,7 @@ pub struct SegmentReader { fieldnorms_composite: CompositeFile, store_reader: StoreReader, - delete_bitset: DeleteBitSet, + delete_bitset_opt: Option, schema: Schema, } @@ -79,7 +80,13 @@ impl SegmentReader { /// Return the number of documents that have been /// deleted in the segment. pub fn num_deleted_docs(&self) -> DocId { - self.delete_bitset.len() as DocId + self.delete_bitset() + .map(|delete_set| delete_set.len() as DocId) + .unwrap_or(0u32) + } + + pub fn has_deletes(&self) -> bool { + self.num_deleted_docs() > 0 } /// Accessor to a segment's fast field reader given a field. @@ -194,12 +201,13 @@ impl SegmentReader { let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?; let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?; - let delete_bitset = if segment.meta().has_deletes() { - let delete_data = segment.open_read(SegmentComponent::DELETE)?; - DeleteBitSet::open(delete_data) - } else { - DeleteBitSet::empty() - }; + let delete_bitset_opt = + if segment.meta().has_deletes() { + let delete_data = segment.open_read(SegmentComponent::DELETE)?; + Some(DeleteBitSet::open(delete_data)) + } else { + None + }; let schema = segment.schema(); Ok(SegmentReader { @@ -211,7 +219,7 @@ impl SegmentReader { fieldnorms_composite, segment_id: segment.id(), store_reader, - delete_bitset, + delete_bitset_opt, positions_composite, schema, }) @@ -253,7 +261,7 @@ impl SegmentReader { termdict_source, postings_source, positions_source, - self.delete_bitset.clone(), + self.delete_bitset_opt.clone(), record_option, )); @@ -282,14 +290,16 @@ impl SegmentReader { /// Returns the bitset representing /// the documents that have been deleted. - pub fn delete_bitset(&self) -> &DeleteBitSet { - &self.delete_bitset + pub fn delete_bitset(&self) -> Option<&DeleteBitSet> { + self.delete_bitset_opt.as_ref() } /// Returns true iff the `doc` is marked /// as deleted. pub fn is_deleted(&self, doc: DocId) -> bool { - self.delete_bitset.is_deleted(doc) + self.delete_bitset() + .map(|delete_set| delete_set.is_deleted(doc)) + .unwrap_or(false) } } diff --git a/src/fastfield/delete.rs b/src/fastfield/delete.rs index 02634ed0f..3134c3fc1 100644 --- a/src/fastfield/delete.rs +++ b/src/fastfield/delete.rs @@ -5,6 +5,7 @@ use std::io; use directory::ReadOnlySource; use DocId; use common::HasLen; +use postings::DeleteSet; /// Write a delete `BitSet` /// @@ -51,22 +52,19 @@ impl DeleteBitSet { } } - /// Returns an empty delete bit set. - pub fn empty() -> DeleteBitSet { +} + + +impl DeleteSet for DeleteBitSet { + + fn empty() -> DeleteBitSet { DeleteBitSet { data: ReadOnlySource::empty(), len: 0, } } - /// Returns true iff the segment has some deleted documents. - pub fn has_deletes(&self) -> bool { - self.len() > 0 - } - - /// Returns true iff the document is deleted. - #[inline] - pub fn is_deleted(&self, doc: DocId) -> bool { + fn is_deleted(&self, doc: DocId) -> bool { if self.len == 0 { false } else { @@ -76,6 +74,18 @@ impl DeleteBitSet { b & (1u8 << shift) != 0 } } + +} + + +impl From> for DeleteBitSet { + fn from(delete_bitset_opt: Option) -> Self { + if let Some(delete_bitset) = delete_bitset_opt { + delete_bitset + } else { + DeleteBitSet::empty() + } + } } impl HasLen for DeleteBitSet { diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 7e1a0580b..57f03a80a 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -17,6 +17,7 @@ use store::StoreWriter; use std::cmp::{max, min}; use termdict::TermDictionary; use termdict::TermStreamer; +use postings::DeleteSet; pub struct IndexMerger { schema: Schema, @@ -27,22 +28,28 @@ pub struct IndexMerger { fn compute_min_max_val( u64_reader: &FastFieldReader, max_doc: DocId, - delete_bitset: &DeleteBitSet, + delete_bitset_opt: Option<&DeleteBitSet>, ) -> Option<(u64, u64)> { if max_doc == 0 { None - } else if !delete_bitset.has_deletes() { - // no deleted documents, - // we can use the previous min_val, max_val. - Some((u64_reader.min_value(), u64_reader.max_value())) } else { - // some deleted documents, - // we need to recompute the max / min - (0..max_doc) - .filter(|doc_id| !delete_bitset.is_deleted(*doc_id)) - .map(|doc_id| u64_reader.get(doc_id)) - .minmax() - .into_option() + match delete_bitset_opt { + Some(delete_bitset) => { + // some deleted documents, + // we need to recompute the max / min + (0..max_doc) + .filter(|doc_id| !delete_bitset.is_deleted(*doc_id)) + .map(|doc_id| u64_reader.get(doc_id)) + .minmax() + .into_option() + + } + None => { + // no deleted documents, + // we can use the previous min_val, max_val. + Some((u64_reader.min_value(), u64_reader.max_value())) + } + } } } @@ -150,7 +157,7 @@ impl IndexMerger { if let Some((seg_min_val, seg_max_val)) = compute_min_max_val( &u64_reader, reader.max_doc(), - reader.delete_bitset(), + reader.delete_bitset() ) { // the segment has some non-deleted documents min_val = min(min_val, seg_min_val); @@ -181,12 +188,15 @@ impl IndexMerger { let mut fast_single_field_serializer = fast_field_serializer.new_u64_fast_field(field, min_val, max_val)?; - for (max_doc, u64_reader, delete_bitset) in u64_readers { + for (max_doc, u64_reader, delete_bitset_opt) in u64_readers { for doc_id in 0..max_doc { - if !delete_bitset.is_deleted(doc_id) { - let val = u64_reader.get(doc_id); - fast_single_field_serializer.add_val(val)?; + if let Some(ref delete_bitset) = delete_bitset_opt { + if delete_bitset.is_deleted(doc_id) { + continue; + } } + let val = u64_reader.get(doc_id); + fast_single_field_serializer.add_val(val)?; } } @@ -271,7 +281,7 @@ impl IndexMerger { let segment_reader = &self.readers[heap_item.segment_ord]; let inverted_index = segment_reader.inverted_index(indexed_field); let mut segment_postings = inverted_index - .read_postings_from_terminfo(term_info, segment_postings_option); + .read_postings_from_terminfo::(term_info, segment_postings_option); if segment_postings.advance() { Some((segment_ord, segment_postings)) } else { diff --git a/src/postings/delete_set.rs b/src/postings/delete_set.rs new file mode 100644 index 000000000..569611473 --- /dev/null +++ b/src/postings/delete_set.rs @@ -0,0 +1,27 @@ +use fastfield::DeleteBitSet; +use DocId; + +pub trait DeleteSet: 'static + From> { + fn is_deleted(&self, doc: DocId) -> bool; + fn empty() -> Self; +} + + +#[derive(Default)] +pub struct NoDelete; +impl DeleteSet for NoDelete { + #[inline(always)] + fn is_deleted(&self, doc: DocId) -> bool { + false + } + fn empty() -> Self { + NoDelete + } +} + +impl From> for NoDelete { + fn from(delete_bitset_opt: Option) -> Self { + assert!(delete_bitset_opt.is_none(), "NoDelete should not be used if there are some deleted documents."); + NoDelete + } +} \ No newline at end of file diff --git a/src/postings/mod.rs b/src/postings/mod.rs index c67a9f855..31e7cc6cc 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -13,11 +13,13 @@ mod serializer; mod postings_writer; mod term_info; mod segment_postings; +mod delete_set; use self::recorder::{NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder}; pub use self::serializer::{FieldSerializer, InvertedIndexSerializer}; pub(crate) use self::postings_writer::MultiFieldPostingsWriter; +pub use self::delete_set::{DeleteSet, NoDelete}; pub use self::term_info::TermInfo; pub use self::postings::Postings; @@ -42,7 +44,7 @@ pub mod tests { use DocId; use Score; use query::Intersection; - use query::Scorer; + use query::{Weight, Scorer}; use schema::{Document, SchemaBuilder, Term, INT_INDEXED, STRING, TEXT}; use core::SegmentComponent; use indexer::SegmentWriter; @@ -97,53 +99,54 @@ pub mod tests { index_writer.add_document(doc!(title => r#"abc be be be be abc"#)); index_writer.commit().unwrap(); index.load_searchers().unwrap(); + let searcher = index.searcher(); - let query = TermQuery::new( - Term::from_field_text(title, "abc"), - IndexRecordOption::WithFreqsAndPositions, - ); - let weight = query.specialized_weight(&*searcher, true); + let inverted_index = searcher.segment_reader(0u32).inverted_index(title); + let term = Term::from_field_text(title, "abc"); + + { - let mut scorer = weight - .specialized_scorer(searcher.segment_reader(0u32)) + let mut postings = inverted_index + .read_postings(&term, IndexRecordOption::WithFreqsAndPositions) .unwrap(); - scorer.advance(); - assert_eq!(&[0, 1, 2], scorer.postings().positions()); - scorer.advance(); - assert_eq!(&[0, 5], scorer.postings().positions()); + postings.advance(); + assert_eq!(&[0, 1, 2], postings.positions()); + postings.advance(); + assert_eq!(&[0, 5], postings.positions()); } { - let mut scorer = weight - .specialized_scorer(searcher.segment_reader(0u32)) + let mut postings = inverted_index + .read_postings(&term, IndexRecordOption::WithFreqsAndPositions) .unwrap(); - scorer.advance(); - scorer.advance(); - assert_eq!(&[0, 5], scorer.postings().positions()); + postings.advance(); + postings.advance(); + assert_eq!(&[0, 5], postings.positions()); } { - let mut scorer = weight - .specialized_scorer(searcher.segment_reader(0u32)) + + let mut postings = inverted_index + .read_postings(&term, IndexRecordOption::WithFreqsAndPositions) .unwrap(); - assert_eq!(scorer.skip_next(1), SkipResult::Reached); - assert_eq!(scorer.doc(), 1); - assert_eq!(&[0, 5], scorer.postings().positions()); + assert_eq!(postings.skip_next(1), SkipResult::Reached); + assert_eq!(postings.doc(), 1); + assert_eq!(&[0, 5], postings.positions()); } { - let mut scorer = weight - .specialized_scorer(searcher.segment_reader(0u32)) + let mut postings = inverted_index + .read_postings(&term, IndexRecordOption::WithFreqsAndPositions) .unwrap(); - assert_eq!(scorer.skip_next(1002), SkipResult::Reached); - assert_eq!(scorer.doc(), 1002); - assert_eq!(&[0, 5], scorer.postings().positions()); + assert_eq!(postings.skip_next(1002), SkipResult::Reached); + assert_eq!(postings.doc(), 1002); + assert_eq!(&[0, 5], postings.positions()); } { - let mut scorer = weight - .specialized_scorer(searcher.segment_reader(0u32)) + let mut postings = inverted_index + .read_postings(&term, IndexRecordOption::WithFreqsAndPositions) .unwrap(); - assert_eq!(scorer.skip_next(100), SkipResult::Reached); - assert_eq!(scorer.skip_next(1002), SkipResult::Reached); - assert_eq!(scorer.doc(), 1002); - assert_eq!(&[0, 5], scorer.postings().positions()); + assert_eq!(postings.skip_next(100), SkipResult::Reached); + assert_eq!(postings.skip_next(1002), SkipResult::Reached); + assert_eq!(postings.doc(), 1002); + assert_eq!(&[0, 5], postings.positions()); } } @@ -277,18 +280,16 @@ pub mod tests { assert!(index_writer.commit().is_ok()); } index.load_searchers().unwrap(); - let term_query = TermQuery::new( - Term::from_field_text(text_field, "a"), - IndexRecordOption::Basic, - ); + let term_a = Term::from_field_text(text_field, "a"); let searcher = index.searcher(); - let mut term_weight = term_query.specialized_weight(&*searcher, true); - term_weight.index_record_option = IndexRecordOption::WithFreqsAndPositions; - let segment_reader = &searcher.segment_readers()[0]; - let mut term_scorer = term_weight.specialized_scorer(segment_reader).unwrap(); - assert!(term_scorer.advance()); - assert_eq!(term_scorer.doc(), 1u32); - assert_eq!(term_scorer.postings().positions(), &[1u32, 4]); + let segment_reader = searcher.segment_reader(0); + let mut postings = segment_reader + .inverted_index(text_field) + .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions) + .unwrap(); + assert!(postings.advance()); + assert_eq!(postings.doc(), 1u32); + assert_eq!(postings.positions(), &[1u32, 4]); } #[test] diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 239e8c306..bdeddbecf 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -8,6 +8,7 @@ use docset::{DocSet, SkipResult}; use std::cmp; use fst::Streamer; use compression::compressed_block_size; +use postings::{NoDelete, DeleteSet}; use fastfield::DeleteBitSet; use std::cell::UnsafeCell; use directory::{ReadOnlySource, SourceRead}; @@ -66,14 +67,25 @@ impl PositionComputer { /// /// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded. /// Positions on the other hand, are optionally entirely decoded upfront. -pub struct SegmentPostings { +pub struct SegmentPostings { block_cursor: BlockSegmentPostings, cur: usize, - delete_bitset: DeleteBitSet, + delete_bitset: TDeleteSet, position_computer: Option>, } -impl SegmentPostings { +impl SegmentPostings { + /// Returns an empty segment postings object + pub fn empty() -> Self { + let empty_block_cursor = BlockSegmentPostings::empty(); + SegmentPostings { + block_cursor: empty_block_cursor, + delete_bitset: NoDelete, + cur: COMPRESSION_BLOCK_SIZE, + position_computer: None, + } + } + /// Creates a segment postings object with the given documents /// and no frequency encoded. /// @@ -82,14 +94,14 @@ impl SegmentPostings { /// It serializes the doc ids using tantivy's codec /// and returns a `SegmentPostings` object that embeds a /// buffer with the serialized data. - pub fn create_from_docs(docs: &[u32]) -> SegmentPostings { + pub fn create_from_docs(docs: &[u32]) -> SegmentPostings { let mut buffer = Vec::new(); { let mut postings_serializer = PostingsSerializer::new(&mut buffer, false); for &doc in docs { postings_serializer.write_doc(doc, 1u32).unwrap(); } - postings_serializer.close_term().unwrap(); + postings_serializer.close_term().expect("In memory Serialization should never fail."); } let data = ReadOnlySource::from(buffer); let block_segment_postings = BlockSegmentPostings::from_data( @@ -97,8 +109,20 @@ impl SegmentPostings { SourceRead::from(data), FreqReadingOption::NoFreq, ); - SegmentPostings::from_block_postings(block_segment_postings, DeleteBitSet::empty(), None) + SegmentPostings::from_block_postings(block_segment_postings, NoDelete, None) } +} + +impl SegmentPostings { + fn position_add_skip usize>(&self, num_skips_fn: F) { + if let Some(position_computer) = self.position_computer.as_ref() { + let num_skips = num_skips_fn(); + unsafe { + (*position_computer.get()).add_skip(num_skips); + } + } + } + /// Reads a Segment postings from an &[u8] /// @@ -108,9 +132,9 @@ impl SegmentPostings { /// frequencies and/or positions pub fn from_block_postings( segment_block_postings: BlockSegmentPostings, - delete_bitset: DeleteBitSet, + delete_bitset: TDeleteSet, positions_stream_opt: Option, - ) -> SegmentPostings { + ) -> SegmentPostings { let position_computer = positions_stream_opt.map(|stream| UnsafeCell::new(PositionComputer::new(stream))); SegmentPostings { @@ -120,29 +144,9 @@ impl SegmentPostings { position_computer, } } - - /// Returns an empty segment postings object - pub fn empty() -> SegmentPostings { - let empty_block_cursor = BlockSegmentPostings::empty(); - SegmentPostings { - block_cursor: empty_block_cursor, - delete_bitset: DeleteBitSet::empty(), - cur: COMPRESSION_BLOCK_SIZE, - position_computer: None, - } - } - - fn position_add_skip usize>(&self, num_skips_fn: F) { - if let Some(position_computer) = self.position_computer.as_ref() { - let num_skips = num_skips_fn(); - unsafe { - (*position_computer.get()).add_skip(num_skips); - } - } - } } -impl DocSet for SegmentPostings { +impl DocSet for SegmentPostings { // goes to the next element. // next needs to be called a first time to point to the correct element. #[inline] @@ -299,13 +303,14 @@ impl DocSet for SegmentPostings { } } -impl HasLen for SegmentPostings { + +impl HasLen for SegmentPostings { fn len(&self) -> usize { self.block_cursor.doc_freq() } } -impl Postings for SegmentPostings { +impl Postings for SegmentPostings { fn term_freq(&self) -> u32 { self.block_cursor.freq(self.cur) } diff --git a/src/query/boolean_query/boolean_weight.rs b/src/query/boolean_query/boolean_weight.rs index 25f56693b..f2fa7c18f 100644 --- a/src/query/boolean_query/boolean_weight.rs +++ b/src/query/boolean_query/boolean_weight.rs @@ -10,6 +10,9 @@ use std::borrow::Borrow; use query::Exclude; use query::Occur; use query::RequiredOptionalScorer; +use query::IntersectionTwoTerms; +use fastfield::DeleteBitSet; +use postings::NoDelete; use query::score_combiner::{DoNothingCombiner, ScoreCombiner, SumWithCoordsCombiner}; use Result; @@ -19,24 +22,41 @@ where { assert!(!scorers.is_empty()); if scorers.len() == 1 { - scorers.into_iter().next().unwrap() //< we checked the size beforehands - } else { + return scorers.into_iter().next().unwrap(); //< we checked the size beforehands + } + + { let is_all_term_queries = scorers.iter().all(|scorer| { let scorer_ref: &Scorer = scorer.borrow(); - Downcast::::is_type(scorer_ref) + Downcast::>::is_type(scorer_ref) }); if is_all_term_queries { - let scorers: Vec = scorers + let scorers: Vec> = scorers .into_iter() - .map(|scorer| *Downcast::::downcast(scorer).unwrap()) + .map(|scorer| *Downcast::>::downcast(scorer).unwrap()) .collect(); - let scorer: Box = box Union::::from(scorers); - scorer - } else { - let scorer: Box = box Union::<_, TScoreCombiner>::from(scorers); - scorer + let scorer: Box = box Union::, TScoreCombiner>::from(scorers); + return scorer; } } + + { + let is_all_term_queries = scorers.iter().all(|scorer| { + let scorer_ref: &Scorer = scorer.borrow(); + Downcast::>::is_type(scorer_ref) + }); + if is_all_term_queries { + let scorers: Vec> = scorers + .into_iter() + .map(|scorer| *Downcast::>::downcast(scorer).unwrap()) + .collect(); + let scorer: Box = box Union::, TScoreCombiner>::from(scorers); + return scorer; + } + } + let scorer: Box = box Union::<_, TScoreCombiner>::from(scorers); + return scorer; + } pub struct BooleanWeight { @@ -74,26 +94,55 @@ impl BooleanWeight { .map(scorer_union::); let must_scorer_opt: Option> = - per_occur_scorers.remove(&Occur::Must).map(|scorers| { + per_occur_scorers.remove(&Occur::Must).map(|mut scorers| { if scorers.len() == 1 { - scorers.into_iter().next().unwrap() - } else { + return scorers.into_iter().next().unwrap(); + } + scorers.sort_by_key(|scorer| scorer.size_hint()); + { let is_all_term_queries = scorers.iter().all(|scorer| { let scorer_ref: &Scorer = scorer.borrow(); - Downcast::::is_type(scorer_ref) + Downcast::>::is_type(scorer_ref) }); if is_all_term_queries { - let scorers: Vec = scorers - .into_iter() - .map(|scorer| *Downcast::::downcast(scorer).unwrap()) - .collect(); - let scorer: Box = box Intersection::from(scorers); - scorer - } else { - let scorer: Box = box Intersection::from(scorers); - scorer + if scorers.len() == 2 { + let right = scorers.pop().unwrap(); + let left = scorers.pop().unwrap(); + return box IntersectionTwoTerms::new(left, right); + } else { + let mut scorers: Vec> = scorers + .into_iter() + .map(|scorer| *Downcast::>::downcast(scorer).unwrap()) + .collect(); + let scorer: Box = box Intersection::from(scorers); + return scorer; + } } } + { + let is_all_term_queries = scorers.iter().all(|scorer| { + let scorer_ref: &Scorer = scorer.borrow(); + Downcast::>::is_type(scorer_ref) + }); + if is_all_term_queries { + if scorers.len() == 2 { + let right = scorers.pop().unwrap(); + let left = scorers.pop().unwrap(); + return box IntersectionTwoTerms::new(left, right); + } else { + let mut scorers: Vec> = scorers + .into_iter() + .map(|scorer| *Downcast::>::downcast(scorer).unwrap()) + .collect(); + let scorer: Box = box Intersection::from(scorers); + return scorer; + } + } + } + { + let scorer: Box = box Intersection::from(scorers); + scorer + } }); let positive_scorer: Box = match (should_scorer_opt, must_scorer_opt) { diff --git a/src/query/boolean_query/mod.rs b/src/query/boolean_query/mod.rs index fd7393050..ca78939ad 100644 --- a/src/query/boolean_query/mod.rs +++ b/src/query/boolean_query/mod.rs @@ -85,7 +85,7 @@ mod tests { let query = query_parser.parse_query("+a +b +c").unwrap(); let weight = query.weight(&*searcher, true).unwrap(); let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap(); - assert!(Downcast::>::is_type(&*scorer)); + assert!(Downcast::>>::is_type(&*scorer)); } { let query = query_parser.parse_query("+a +(b c)").unwrap(); @@ -95,6 +95,8 @@ mod tests { } } + use postings::NoDelete; + #[test] pub fn test_boolean_reqopt() { let (index, text_field) = aux_test_helper(); @@ -110,7 +112,7 @@ mod tests { let query = query_parser.parse_query("+a b").unwrap(); let weight = query.weight(&*searcher, false).unwrap(); let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap(); - assert!(Downcast::::is_type(&*scorer)); + assert!(Downcast::>::is_type(&*scorer)); } } diff --git a/src/query/intersection_two.rs b/src/query/intersection_two.rs new file mode 100644 index 000000000..59a7a2366 --- /dev/null +++ b/src/query/intersection_two.rs @@ -0,0 +1,73 @@ +use docset::DocSet; +use query::Scorer; +use DocId; +use Score; +use std::mem; +use query::term_query::TermScorer; +use postings::DeleteSet; +use SkipResult; + + +/// Creates a `DocSet` that iterator through the intersection of two `DocSet`s. +pub struct IntersectionTwoTerms { + left: TDocSet, + right: TDocSet +} + +impl IntersectionTwoTerms { + pub fn new(left: TDocSet, mut right: TDocSet) -> IntersectionTwoTerms { + IntersectionTwoTerms { + left, + right + } + } +} + +impl DocSet for IntersectionTwoTerms { + + fn advance(&mut self) -> bool { + let (left, right) = (&mut self.left, &mut self.right); + if !left.advance() { + return false; + } + let mut candidate = left.doc(); + loop { + match right.skip_next(candidate) { + SkipResult::Reached => { + return true; + } + SkipResult::End => { + return false; + } + SkipResult::OverStep => { + candidate = right.doc(); + } + } + match left.skip_next(candidate) { + SkipResult::Reached => { + return true; + } + SkipResult::End => { + return false; + } + SkipResult::OverStep => { + candidate = left.doc(); + } + } + } + } + + fn doc(&self) -> DocId { + self.left.doc() + } + + fn size_hint(&self) -> u32 { + self.left.size_hint().min(self.right.size_hint()) + } +} + +impl Scorer for IntersectionTwoTerms { + fn score(&mut self) -> Score { + self.left.score() + self.right.score() + } +} diff --git a/src/query/mod.rs b/src/query/mod.rs index 1872372a9..df68004c2 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -16,6 +16,7 @@ mod range_query; mod exclude; mod union; mod intersection; +mod intersection_two; mod reqopt_scorer; #[cfg(test)] @@ -45,3 +46,4 @@ pub use self::weight::Weight; pub use self::all_query::{AllQuery, AllScorer, AllWeight}; pub use self::range_query::RangeQuery; pub use self::scorer::ConstScorer; +pub use self::intersection_two::IntersectionTwoTerms; diff --git a/src/query/phrase_query/phrase_scorer.rs b/src/query/phrase_query/phrase_scorer.rs index 47ae7d64e..a58dda6a5 100644 --- a/src/query/phrase_query/phrase_scorer.rs +++ b/src/query/phrase_query/phrase_scorer.rs @@ -2,14 +2,15 @@ use DocId; use docset::{DocSet, SkipResult}; use postings::{Postings, SegmentPostings}; use query::{Intersection, Scorer}; +use fastfield::DeleteBitSet; struct PostingsWithOffset { offset: u32, - segment_postings: SegmentPostings, + segment_postings: SegmentPostings, } impl PostingsWithOffset { - pub fn new(segment_postings: SegmentPostings, offset: u32) -> PostingsWithOffset { + pub fn new(segment_postings: SegmentPostings, offset: u32) -> PostingsWithOffset { PostingsWithOffset { offset, segment_postings, @@ -50,7 +51,7 @@ pub struct PhraseScorer { } impl PhraseScorer { - pub fn new(term_postings: Vec) -> PhraseScorer { + pub fn new(term_postings: Vec>) -> PhraseScorer { let postings_with_offsets: Vec<_> = term_postings .into_iter() .enumerate() diff --git a/src/query/term_query/term_scorer.rs b/src/query/term_query/term_scorer.rs index d8352780c..9bb0763e3 100644 --- a/src/query/term_query/term_scorer.rs +++ b/src/query/term_query/term_scorer.rs @@ -5,20 +5,21 @@ use postings::SegmentPostings; use query::Scorer; use postings::Postings; use fastfield::FastFieldReader; +use postings::{NoDelete, DeleteSet}; -pub struct TermScorer { +pub struct TermScorer { pub idf: Score, pub fieldnorm_reader_opt: Option>, - pub postings: SegmentPostings, + pub postings: SegmentPostings, } -impl TermScorer { - pub fn postings(&self) -> &SegmentPostings { +impl TermScorer { + pub fn postings(&self) -> &SegmentPostings { &self.postings } } -impl DocSet for TermScorer { +impl DocSet for TermScorer { fn advance(&mut self) -> bool { self.postings.advance() } @@ -36,7 +37,7 @@ impl DocSet for TermScorer { } } -impl Scorer for TermScorer { +impl Scorer for TermScorer { fn score(&mut self) -> Score { let doc = self.postings.doc(); let tf = match self.fieldnorm_reader_opt { diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index 224718ab1..2b2da02df 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -6,6 +6,8 @@ use docset::DocSet; use postings::SegmentPostings; use schema::IndexRecordOption; use super::term_scorer::TermScorer; +use fastfield::DeleteBitSet; +use postings::{DeleteSet, NoDelete}; use Result; pub struct TermWeight { @@ -17,8 +19,46 @@ pub struct TermWeight { impl Weight for TermWeight { fn scorer(&self, reader: &SegmentReader) -> Result> { - let specialized_scorer = self.specialized_scorer(reader)?; - Ok(box specialized_scorer) + let field = self.term.field(); + let inverted_index = reader.inverted_index(field); + let fieldnorm_reader_opt = reader.get_fieldnorms_reader(field); + let scorer: Box; + if reader.has_deletes() { + let postings_opt: Option> = + inverted_index.read_postings(&self.term, self.index_record_option); + scorer = + if let Some(segment_postings) = postings_opt { + box TermScorer { + idf: self.idf(), + fieldnorm_reader_opt, + postings: segment_postings, + } + } else { + box TermScorer { + idf: 1f32, + fieldnorm_reader_opt: None, + postings: SegmentPostings::::empty(), + } + }; + } else { + let postings_opt: Option> = + inverted_index.read_postings_no_deletes(&self.term, self.index_record_option); + scorer = + if let Some(segment_postings) = postings_opt { + box TermScorer { + idf: self.idf(), + fieldnorm_reader_opt, + postings: segment_postings, + } + } else { + box TermScorer { + idf: 1f32, + fieldnorm_reader_opt: None, + postings: SegmentPostings::::empty(), + } + }; + } + Ok(scorer) } fn count(&self, reader: &SegmentReader) -> Result { @@ -30,7 +70,7 @@ impl Weight for TermWeight { .map(|term_info| term_info.doc_freq) .unwrap_or(0)) } else { - Ok(self.specialized_scorer(reader)?.count()) + Ok(self.scorer(reader)?.count()) } } } @@ -39,26 +79,4 @@ impl TermWeight { fn idf(&self) -> f32 { 1.0 + (self.num_docs as f32 / (self.doc_freq as f32 + 1.0)).ln() } - - /// If the field is not found, returns an empty `DocSet`. - pub fn specialized_scorer(&self, reader: &SegmentReader) -> Result { - let field = self.term.field(); - let inverted_index = reader.inverted_index(field); - let fieldnorm_reader_opt = reader.get_fieldnorms_reader(field); - let postings_opt: Option = - inverted_index.read_postings(&self.term, self.index_record_option); - if let Some(segment_postings) = postings_opt { - Ok(TermScorer { - idf: self.idf(), - fieldnorm_reader_opt, - postings: segment_postings, - }) - } else { - Ok(TermScorer { - idf: 1f32, - fieldnorm_reader_opt: None, - postings: SegmentPostings::empty(), - }) - } - } }