Compare commits

..

7 Commits

Author SHA1 Message Date
Paul Masurel
bc79969cb7 Updated CHANGELOG 2020-05-20 22:23:37 +09:00
Rob Young
1b39a48247 Changes required by rebase on e25284
- Pass Collector into TweakedScoreTopCollector and
  CustomScoreTopCollector.
- Add std:: qualifier to f32, i32 etc. Not sure why this was not failing
  already.
- Add unit tests for TopDocs with offset including for tweaked and
  custom score collectors.

In order to convert a TopCollector<Score> to a TopCollector<TScore> I
had to add an `into_tscore` method to `TopCollector`. This is a hack but
I don't know how to avoid it.
2020-05-16 15:14:13 +01:00
Rob Young
25b1fdf8d2 Address review comments
- Make Debug formatting of TopDocs clearer.
- Add unit tests for limit and offset on TopCollector.
- Change API for using offset to a fluent interface.
- Add some context to the docstring to clarify what limit and offset are
  equivalent to in other projects.
2020-05-16 11:16:09 +01:00
Rob Young
3f2cd73ecb Add offset to TopDocsCollector
Add an offset to TopDocsCollector and TopDocs to make it clearer how to
handle pagination.

Closes #822
2020-05-16 11:16:09 +01:00
Paul Masurel
e25284bafe Major change in the DocSet/Scorer API (#824)
- Change in the DocSet and Scorer API. (@fulmicoton). 
A freshly created DocSet points directly to its first doc. A sentinel value called TERMINATED marks the end of a DocSet.
`.advance()` returns the new DocId. `Scorer::skip(target)` has been replaced by `Scorer::seek(target)` and returns the resulting DocId.
As a result, iterating through DocSet now looks as follows
```rust
let mut doc = docset.doc();
while doc != TERMINATED {
   // ...
   doc = docset.advance();
}
```
The change made it possible to greatly simplify a lot of the docset's code.
- Misc internal optimization and introduction of the `Scorer::for_each_pruning` function. (@fulmicoton)
2020-05-16 16:33:36 +09:00
Fisher Darling
8b67877cd5 Made field methods const fns (#823) 2020-05-16 10:59:50 +09:00
Rob Young
9de1360538 Minor doc and test improvements around fuzzy querying (#825) 2020-05-16 10:59:24 +09:00
51 changed files with 1253 additions and 2313 deletions

View File

@@ -3,6 +3,21 @@ Tantivy 0.13.0
- Bugfix in `FuzzyTermQuery` not matching terms by prefix when it should (@Peachball)
- Relaxed constraints on the custom/tweak score functions. At the segment level, they can be mut, and they are not required to be Sync + Send.
- `MMapDirectory::open` does not return a `Result` anymore.
- Change in the DocSet and Scorer API. (@fulmicoton).
A freshly created DocSet points directly to its first doc. A sentinel value called TERMINATED marks the end of a DocSet.
`.advance()` returns the new DocId. `Scorer::skip(target)` has been replaced by `Scorer::seek(target)` and returns the resulting DocId.
As a result, iterating through DocSet now looks as follows
```rust
let mut doc = docset.doc();
while doc != TERMINATED {
// ...
doc = docset.advance();
}
```
The change made it possible to greatly simplify a lot of the docset's code.
- Misc internal optimization and introduction of the `Scorer::for_each_pruning` function. (@fulmicoton)
- Added an offset option to the Top(.*)Collectors. (@robyoung)
Tantivy 0.12.0
======================

View File

@@ -50,7 +50,6 @@ murmurhash32 = "0.2"
chrono = "0.4"
smallvec = "1.0"
rayon = "1"
# ordered-float = "1"
[target.'cfg(windows)'.dependencies]
winapi = "0.3"
@@ -59,8 +58,6 @@ winapi = "0.3"
rand = "0.7"
maplit = "1"
matches = "0.1.8"
proptest = "0.9"
float-cmp = "0.6"
[dev-dependencies.fail]
version = "0.4"

View File

@@ -10,7 +10,7 @@
// ---
// Importing tantivy...
use tantivy::schema::*;
use tantivy::{doc, DocId, DocSet, Index, Postings};
use tantivy::{doc, DocSet, Index, Postings, TERMINATED};
fn main() -> tantivy::Result<()> {
// We first create a schema for the sake of the
@@ -62,12 +62,11 @@ fn main() -> tantivy::Result<()> {
{
// this buffer will be used to request for positions
let mut positions: Vec<u32> = Vec::with_capacity(100);
while segment_postings.advance() {
// the number of time the term appears in the document.
let doc_id: DocId = segment_postings.doc(); //< do not try to access this before calling advance once.
let mut doc_id = segment_postings.doc();
while doc_id != TERMINATED {
// This MAY contain deleted documents as well.
if segment_reader.is_deleted(doc_id) {
doc_id = segment_postings.advance();
continue;
}
@@ -86,6 +85,7 @@ fn main() -> tantivy::Result<()> {
// Doc 2: TermFreq 1: [0]
// ```
println!("Doc {}: TermFreq {}: {:?}", doc_id, term_freq, positions);
doc_id = segment_postings.advance();
}
}
}

View File

@@ -11,13 +11,13 @@ impl<TCustomScorer, TScore> CustomScoreTopCollector<TCustomScorer, TScore>
where
TScore: Clone + PartialOrd,
{
pub fn new(
pub(crate) fn new(
custom_scorer: TCustomScorer,
limit: usize,
collector: TopCollector<TScore>,
) -> CustomScoreTopCollector<TCustomScorer, TScore> {
CustomScoreTopCollector {
custom_scorer,
collector: TopCollector::with_limit(limit),
collector,
}
}
}

View File

@@ -1,6 +1,5 @@
use crate::collector::Collector;
use crate::collector::SegmentCollector;
use crate::docset::SkipResult;
use crate::fastfield::FacetReader;
use crate::schema::Facet;
use crate::schema::Field;
@@ -188,6 +187,11 @@ pub struct FacetSegmentCollector {
collapse_facet_ords: Vec<u64>,
}
enum SkipResult {
Found,
NotFound,
}
fn skip<'a, I: Iterator<Item = &'a Facet>>(
target: &[u8],
collapse_it: &mut Peekable<I>,
@@ -197,14 +201,14 @@ fn skip<'a, I: Iterator<Item = &'a Facet>>(
Some(facet_bytes) => match facet_bytes.encoded_str().as_bytes().cmp(target) {
Ordering::Less => {}
Ordering::Greater => {
return SkipResult::OverStep;
return SkipResult::NotFound;
}
Ordering::Equal => {
return SkipResult::Reached;
return SkipResult::Found;
}
},
None => {
return SkipResult::End;
return SkipResult::NotFound;
}
}
collapse_it.next();
@@ -281,7 +285,7 @@ impl Collector for FacetCollector {
// is positioned on a term that has not been processed yet.
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
match skip_result {
SkipResult::Reached => {
SkipResult::Found => {
// we reach a facet we decided to collapse.
let collapse_depth = facet_depth(facet_streamer.key());
let mut collapsed_id = 0;
@@ -301,7 +305,7 @@ impl Collector for FacetCollector {
}
break;
}
SkipResult::End | SkipResult::OverStep => {
SkipResult::NotFound => {
collapse_mapping.push(0);
if !facet_streamer.advance() {
break;

View File

@@ -84,7 +84,7 @@ See the `custom_collector` example.
*/
use crate::{DocId, Searcher, Executor};
use crate::DocId;
use crate::Score;
use crate::SegmentLocalId;
use crate::SegmentReader;
@@ -100,9 +100,6 @@ mod top_collector;
mod top_score_collector;
pub use self::top_score_collector::TopDocs;
#[cfg(test)]
pub(crate) use self::top_score_collector::TopScoreSegmentCollector;
mod custom_score_top_collector;
pub use self::custom_score_top_collector::{CustomScorer, CustomSegmentScorer};
@@ -112,9 +109,7 @@ pub use self::tweak_score_top_collector::{ScoreSegmentTweaker, ScoreTweaker};
mod facet_collector;
pub use self::facet_collector::FacetCollector;
use crate::fastfield::DeleteBitSet;
use crate::query::{Scorer, Weight};
use std::borrow::BorrowMut;
use crate::query::Scorer;
/// `Fruit` is the type for the result of our collection.
/// e.g. `usize` for the `Count` collector.
@@ -122,8 +117,6 @@ pub trait Fruit: Send + downcast_rs::Downcast {}
impl<T> Fruit for T where T: Send + downcast_rs::Downcast {}
/// Collectors are in charge of collecting and retaining relevant
/// information from the document found and scored by the query.
///
@@ -163,18 +156,26 @@ pub trait Collector: Sync {
/// into one fruit.
fn merge_fruits(&self, segment_fruits: Vec<Self::Fruit>) -> crate::Result<Self::Fruit>;
fn collect_weight(&self, searcher: &Searcher, weight: &dyn Weight, executor: &Executor) -> crate::Result<Self::Fruit> {
let segment_readers = searcher.segment_readers();
let fruits = executor.map(
|(segment_ord, segment_reader)| {
let mut scorer = weight.scorer(segment_reader, 1.0f32)?;
let segment_collector =
self.for_segment(segment_ord as u32, segment_reader)?;
Ok(segment_collector.collect_scorer(scorer.borrow_mut(), segment_reader.delete_bitset()))
},
segment_readers.iter().enumerate(),
)?;
self.merge_fruits(fruits)
/// Creates a segment collector, feeds it every non-deleted document matched by the scorer,
/// and returns the harvested fruit.
fn collect_segment(
&self,
scorer: &mut dyn Scorer,
segment_ord: u32,
segment_reader: &SegmentReader,
) -> crate::Result<<Self::Child as SegmentCollector>::Fruit> {
let mut segment_collector = self.for_segment(segment_ord as u32, segment_reader)?;
if let Some(delete_bitset) = segment_reader.delete_bitset() {
scorer.for_each(&mut |doc, score| {
if delete_bitset.is_alive(doc) {
segment_collector.collect(doc, score);
}
});
} else {
scorer.for_each(&mut |doc, score| {
segment_collector.collect(doc, score);
})
}
Ok(segment_collector.harvest())
}
}
@@ -183,7 +184,7 @@ pub trait Collector: Sync {
///
/// `.collect(doc, score)` will be called for every document
/// matching the query.
pub trait SegmentCollector: 'static + Sized {
pub trait SegmentCollector: 'static {
/// `Fruit` is the type for the result of our collection.
/// e.g. `usize` for the `Count` collector.
type Fruit: Fruit;
@@ -193,19 +194,6 @@ pub trait SegmentCollector: 'static + Sized {
/// Extract the fruit of the collection from the `SegmentCollector`.
fn harvest(self) -> Self::Fruit;
fn collect_scorer(mut self, scorer: &mut dyn Scorer, delete_bitset: Option<&DeleteBitSet>) -> Self::Fruit {
if let Some(delete_bitset) = delete_bitset {
scorer.for_each(&mut |doc, score| {
if delete_bitset.is_alive(doc) {
self.collect(doc, score);
}
});
} else {
scorer.for_each(&mut |doc, score| self.collect(doc, score));
}
self.harvest()
}
}
// -----------------------------------------------

View File

@@ -18,9 +18,9 @@ use std::collections::BinaryHeap;
/// Two elements are equal if their feature is equal, and regardless of whether `doc`
/// is equal. This should be perfectly fine for this usage, but let's make sure this
/// struct is never public.
struct ComparableDoc<T, D> {
feature: T,
doc: D,
pub(crate) struct ComparableDoc<T, D> {
pub feature: T,
pub doc: D,
}
impl<T: PartialOrd, D: PartialOrd> PartialOrd for ComparableDoc<T, D> {
@@ -57,6 +57,7 @@ impl<T: PartialOrd, D: PartialOrd> Eq for ComparableDoc<T, D> {}
pub(crate) struct TopCollector<T> {
pub limit: usize,
pub offset: usize,
_marker: PhantomData<T>,
}
@@ -69,15 +70,23 @@ where
/// # Panics
/// The method panics if limit is 0
pub fn with_limit(limit: usize) -> TopCollector<T> {
assert!(limit > 0, "Limit must be strictly greater than 0.");
TopCollector {
if limit < 1 {
panic!("Limit must be strictly greater than 0.");
}
Self {
limit,
offset: 0,
_marker: PhantomData,
}
}
pub fn limit(&self) -> usize {
self.limit
/// Skip the first "offset" documents when collecting.
///
/// This is equivalent to `OFFSET` in MySQL or PostgreSQL and `start` in
/// Lucene's TopDocsCollector.
pub fn and_offset(mut self, offset: usize) -> TopCollector<T> {
self.offset = offset;
self
}
pub fn merge_fruits(
@@ -90,7 +99,7 @@ where
let mut top_collector = BinaryHeap::new();
for child_fruit in children {
for (feature, doc) in child_fruit {
if top_collector.len() < self.limit {
if top_collector.len() < (self.limit + self.offset) {
top_collector.push(ComparableDoc { feature, doc });
} else if let Some(mut head) = top_collector.peek_mut() {
if head.feature < feature {
@@ -102,6 +111,7 @@ where
Ok(top_collector
.into_sorted_vec()
.into_iter()
.skip(self.offset)
.map(|cdoc| (cdoc.feature, cdoc.doc))
.collect())
}
@@ -111,7 +121,23 @@ where
segment_id: SegmentLocalId,
_: &SegmentReader,
) -> crate::Result<TopSegmentCollector<F>> {
Ok(TopSegmentCollector::new(segment_id, self.limit))
Ok(TopSegmentCollector::new(
segment_id,
self.limit + self.offset,
))
}
/// Create a new TopCollector with the same limit and offset.
///
/// Ideally we would use Into but the blanket implementation seems to cause the Scorer traits
/// to fail.
#[doc(hidden)]
pub(crate) fn into_tscore<TScore: PartialOrd + Clone>(self) -> TopCollector<TScore> {
TopCollector {
limit: self.limit,
offset: self.offset,
_marker: PhantomData,
}
}
}
@@ -122,13 +148,13 @@ where
/// The theoretical complexity for collecting the top `K` out of `n` documents
/// is `O(n log K)`.
pub(crate) struct TopSegmentCollector<T> {
pub limit: usize,
limit: usize,
heap: BinaryHeap<ComparableDoc<T, DocId>>,
segment_id: u32,
}
impl<T: PartialOrd> TopSegmentCollector<T> {
pub fn new(segment_id: SegmentLocalId, limit: usize) -> TopSegmentCollector<T> {
fn new(segment_id: SegmentLocalId, limit: usize) -> TopSegmentCollector<T> {
TopSegmentCollector {
limit,
heap: BinaryHeap::with_capacity(limit),
@@ -159,10 +185,6 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
self.heap.len() >= self.limit
}
pub fn pruning_score(&self) -> Option<T> {
self.heap.peek().map(|head| head.feature.clone())
}
/// Collects a document scored by the given feature
///
/// It collects documents until it has reached the max capacity. Once it reaches capacity, it
@@ -189,7 +211,7 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
#[cfg(test)]
mod tests {
use super::TopSegmentCollector;
use super::{TopCollector, TopSegmentCollector};
use crate::DocAddress;
#[test]
@@ -250,6 +272,48 @@ mod tests {
top_collector_limit_3.harvest()[..2].to_vec(),
);
}
#[test]
fn test_top_collector_with_limit_and_offset() {
let collector = TopCollector::with_limit(2).and_offset(1);
let results = collector
.merge_fruits(vec![vec![
(0.9, DocAddress(0, 1)),
(0.8, DocAddress(0, 2)),
(0.7, DocAddress(0, 3)),
(0.6, DocAddress(0, 4)),
(0.5, DocAddress(0, 5)),
]])
.unwrap();
assert_eq!(
results,
vec![(0.8, DocAddress(0, 2)), (0.7, DocAddress(0, 3)),]
);
}
#[test]
fn test_top_collector_with_limit_larger_than_set_and_offset() {
let collector = TopCollector::with_limit(2).and_offset(1);
let results = collector
.merge_fruits(vec![vec![(0.9, DocAddress(0, 1)), (0.8, DocAddress(0, 2))]])
.unwrap();
assert_eq!(results, vec![(0.8, DocAddress(0, 2)),]);
}
#[test]
fn test_top_collector_with_limit_and_offset_larger_than_set() {
let collector = TopCollector::with_limit(2).and_offset(20);
let results = collector
.merge_fruits(vec![vec![(0.9, DocAddress(0, 1)), (0.8, DocAddress(0, 2))]])
.unwrap();
assert_eq!(results, vec![]);
}
}
#[cfg(all(test, feature = "unstable"))]

View File

@@ -1,20 +1,22 @@
use super::Collector;
use crate::collector::custom_score_top_collector::CustomScoreTopCollector;
use crate::collector::top_collector::TopCollector;
use crate::collector::top_collector::TopSegmentCollector;
use crate::collector::top_collector::{ComparableDoc, TopCollector};
use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector;
use crate::collector::{
CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector,
};
use crate::docset::TERMINATED;
use crate::fastfield::FastFieldReader;
use crate::query::Scorer;
use crate::schema::Field;
use crate::{DocAddress, Executor, Searcher};
use crate::DocAddress;
use crate::DocId;
use crate::Score;
use crate::SegmentLocalId;
use crate::SegmentReader;
use std::collections::BinaryHeap;
use std::fmt;
use crate::query::{Weight, PruningScorerIfPossible};
/// The `TopDocs` collector keeps track of the top `K` documents
/// sorted by their score.
@@ -58,7 +60,11 @@ pub struct TopDocs(TopCollector<Score>);
impl fmt::Debug for TopDocs {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "TopDocs({})", self.0.limit())
write!(
f,
"TopDocs(limit={}, offset={})",
self.0.limit, self.0.offset
)
}
}
@@ -102,6 +108,45 @@ impl TopDocs {
TopDocs(TopCollector::with_limit(limit))
}
/// Skip the first "offset" documents when collecting.
///
/// This is equivalent to `OFFSET` in MySQL or PostgreSQL and `start` in
/// Lucene's TopDocsCollector.
///
/// ```rust
/// use tantivy::collector::TopDocs;
/// use tantivy::query::QueryParser;
/// use tantivy::schema::{Schema, TEXT};
/// use tantivy::{doc, DocAddress, Index};
///
/// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
///
/// let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
/// index_writer.add_document(doc!(title => "The Name of the Wind"));
/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
/// index_writer.add_document(doc!(title => "A Dairy Cow"));
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
/// index_writer.add_document(doc!(title => "The Diary of Lena Mukhina"));
/// assert!(index_writer.commit().is_ok());
///
/// let reader = index.reader().unwrap();
/// let searcher = reader.searcher();
///
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary").unwrap();
/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2).and_offset(1)).unwrap();
///
/// assert_eq!(top_docs.len(), 2);
/// assert_eq!(&top_docs[0], &(0.5204813, DocAddress(0, 4)));
/// assert_eq!(&top_docs[1], &(0.4793185, DocAddress(0, 3)));
/// ```
pub fn and_offset(self, offset: usize) -> TopDocs {
TopDocs(self.0.and_offset(offset))
}
/// Set top-K to rank documents by a given fast field.
///
/// ```rust
@@ -282,7 +327,7 @@ impl TopDocs {
TScoreSegmentTweaker: ScoreSegmentTweaker<TScore> + 'static,
TScoreTweaker: ScoreTweaker<TScore, Child = TScoreSegmentTweaker>,
{
TweakedScoreTopCollector::new(score_tweaker, self.0.limit())
TweakedScoreTopCollector::new(score_tweaker, self.0.into_tscore())
}
/// Ranks the documents using a custom score.
@@ -396,7 +441,7 @@ impl TopDocs {
TCustomSegmentScorer: CustomSegmentScorer<TScore> + 'static,
TCustomScorer: CustomScorer<TScore, Child = TCustomSegmentScorer>,
{
CustomScoreTopCollector::new(custom_score, self.0.limit())
CustomScoreTopCollector::new(custom_score, self.0.into_tscore())
}
}
@@ -418,64 +463,74 @@ impl Collector for TopDocs {
true
}
fn collect_weight(&self, searcher: &Searcher, weight: &dyn Weight, executor: &Executor) -> crate::Result<Self::Fruit> {
let segment_readers = searcher.segment_readers();
let fruits = executor.map(
|(segment_ord, segment_reader)| {
match weight.pruning_scorer(segment_reader, 1.0f32)? {
PruningScorerIfPossible::NonPruning(mut scorer) => {
let segment_collector =
self.for_segment(segment_ord as u32, segment_reader)?;
let fruit =
segment_collector.collect_scorer(scorer.as_mut(), segment_reader.delete_bitset());
Ok(fruit)
}
PruningScorerIfPossible::Pruning(mut pruning_scorer) => {
let limit = self.0.limit;
let mut segment_collector =
self.for_segment(segment_ord as u32, segment_reader)?;
for _ in 0..limit {
if !pruning_scorer.advance() {
return Ok(segment_collector.harvest());
}
segment_collector.collect(pruning_scorer.doc(), pruning_scorer.score());
}
let mut pruning_score = segment_collector.0.pruning_score().unwrap_or(0.0f32);
while pruning_scorer.advance_with_pruning(pruning_score) {
segment_collector.0.collect(pruning_scorer.doc(), pruning_scorer.score());
pruning_score = segment_collector.0.pruning_score().unwrap_or(0.0f32);
}
Ok(segment_collector.harvest())
}
}
},
segment_readers.iter().enumerate(),
)?;
self.merge_fruits(fruits)
}
fn merge_fruits(
&self,
child_fruits: Vec<Vec<(Score, DocAddress)>>,
) -> crate::Result<Self::Fruit> {
self.0.merge_fruits(child_fruits)
}
fn collect_segment(
&self,
scorer: &mut dyn Scorer,
segment_ord: u32,
segment_reader: &SegmentReader,
) -> crate::Result<<Self::Child as SegmentCollector>::Fruit> {
let mut heap: BinaryHeap<ComparableDoc<Score, DocId>> =
BinaryHeap::with_capacity(self.0.limit + self.0.offset);
// first we fill the heap with the first `limit + offset` elements.
let mut doc = scorer.doc();
while doc != TERMINATED && heap.len() < (self.0.limit + self.0.offset) {
if !segment_reader.is_deleted(doc) {
let score = scorer.score();
heap.push(ComparableDoc {
feature: score,
doc,
});
}
doc = scorer.advance();
}
let threshold = heap.peek().map(|el| el.feature).unwrap_or(std::f32::MIN);
if let Some(delete_bitset) = segment_reader.delete_bitset() {
scorer.for_each_pruning(threshold, &mut |doc, score| {
if delete_bitset.is_alive(doc) {
*heap.peek_mut().unwrap() = ComparableDoc {
feature: score,
doc,
};
}
heap.peek().map(|el| el.feature).unwrap_or(std::f32::MIN)
});
} else {
scorer.for_each_pruning(threshold, &mut |doc, score| {
*heap.peek_mut().unwrap() = ComparableDoc {
feature: score,
doc,
};
heap.peek().map(|el| el.feature).unwrap_or(std::f32::MIN)
});
}
let fruit = heap
.into_sorted_vec()
.into_iter()
.map(|cid| (cid.feature, DocAddress(segment_ord, cid.doc)))
.collect();
Ok(fruit)
}
}
/// Segment Collector associated to `TopDocs`.
pub struct TopScoreSegmentCollector(TopSegmentCollector<Score>);
impl TopScoreSegmentCollector {
pub fn new(segment_id: SegmentLocalId, limit: usize) -> Self {
TopScoreSegmentCollector(TopSegmentCollector::new(segment_id, limit))
}
}
impl SegmentCollector for TopScoreSegmentCollector {
type Fruit = Vec<(Score, DocAddress)>;
fn collect(&mut self, doc: DocId, score: Score) {
self.0.collect(doc, score)
self.0.collect(doc, score);
}
fn harvest(self) -> Vec<(Score, DocAddress)> {
@@ -489,10 +544,10 @@ mod tests {
use crate::collector::Collector;
use crate::query::{AllQuery, Query, QueryParser};
use crate::schema::{Field, Schema, FAST, STORED, TEXT};
use crate::DocAddress;
use crate::Index;
use crate::IndexWriter;
use crate::Score;
use crate::{DocAddress, DocId, SegmentReader};
fn make_index() -> Index {
let mut schema_builder = Schema::builder();
@@ -532,6 +587,21 @@ mod tests {
);
}
#[test]
fn test_top_collector_not_at_capacity_with_offset() {
let index = make_index();
let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap();
let score_docs: Vec<(Score, DocAddress)> = index
.reader()
.unwrap()
.searcher()
.search(&text_query, &TopDocs::with_limit(4).and_offset(2))
.unwrap();
assert_eq!(score_docs, vec![(0.48527452, DocAddress(0, 0))]);
}
#[test]
fn test_top_collector_at_capacity() {
let index = make_index();
@@ -553,6 +623,27 @@ mod tests {
);
}
#[test]
fn test_top_collector_at_capacity_with_offset() {
let index = make_index();
let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap();
let score_docs: Vec<(Score, DocAddress)> = index
.reader()
.unwrap()
.searcher()
.search(&text_query, &TopDocs::with_limit(2).and_offset(1))
.unwrap();
assert_eq!(
score_docs,
vec![
(0.5376842, DocAddress(0u32, 2)),
(0.48527452, DocAddress(0, 0))
]
);
}
#[test]
fn test_top_collector_stable_sorting() {
let index = make_index();
@@ -666,6 +757,50 @@ mod tests {
}
}
#[test]
fn test_tweak_score_top_collector_with_offset() {
let index = make_index();
let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap();
let collector = TopDocs::with_limit(2).and_offset(1).tweak_score(
move |_segment_reader: &SegmentReader| move |doc: DocId, _original_score: Score| doc,
);
let score_docs: Vec<(u32, DocAddress)> = index
.reader()
.unwrap()
.searcher()
.search(&text_query, &collector)
.unwrap();
assert_eq!(
score_docs,
vec![(1, DocAddress(0, 1)), (0, DocAddress(0, 0)),]
);
}
#[test]
fn test_custom_score_top_collector_with_offset() {
let index = make_index();
let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap();
let collector = TopDocs::with_limit(2)
.and_offset(1)
.custom_score(move |_segment_reader: &SegmentReader| move |doc: DocId| doc);
let score_docs: Vec<(u32, DocAddress)> = index
.reader()
.unwrap()
.searcher()
.search(&text_query, &collector)
.unwrap();
assert_eq!(
score_docs,
vec![(1, DocAddress(0, 1)), (0, DocAddress(0, 0)),]
);
}
fn index(
query: &str,
query_field: Field,

View File

@@ -14,11 +14,11 @@ where
{
pub fn new(
score_tweaker: TScoreTweaker,
limit: usize,
collector: TopCollector<TScore>,
) -> TweakedScoreTopCollector<TScoreTweaker, TScore> {
TweakedScoreTopCollector {
score_tweaker,
collector: TopCollector::with_limit(limit),
collector,
}
}
}

View File

@@ -33,6 +33,10 @@ impl TinySet {
TinySet(0u64)
}
pub fn clear(&mut self) {
self.0 = 0u64;
}
/// Returns the complement of the set in `[0, 64[`.
fn complement(self) -> TinySet {
TinySet(!self.0)
@@ -43,6 +47,11 @@ impl TinySet {
!self.intersect(TinySet::singleton(el)).is_empty()
}
/// Returns the number of elements in the TinySet.
pub fn len(self) -> u32 {
self.0.count_ones()
}
/// Returns the intersection of `self` and `other`
pub fn intersect(self, other: TinySet) -> TinySet {
TinySet(self.0 & other.0)
@@ -109,22 +118,12 @@ impl TinySet {
pub fn range_greater_or_equal(from_included: u32) -> TinySet {
TinySet::range_lower(from_included).complement()
}
pub fn clear(&mut self) {
self.0 = 0u64;
}
pub fn len(self) -> u32 {
self.0.count_ones()
}
}
#[derive(Clone)]
pub struct BitSet {
tinysets: Box<[TinySet]>,
len: usize, //< Technically it should be u32, but we
// count multiple inserts.
// `usize` guards us from overflow.
len: usize,
max_value: u32,
}
@@ -204,7 +203,7 @@ mod tests {
use super::BitSet;
use super::TinySet;
use crate::docset::DocSet;
use crate::docset::{DocSet, TERMINATED};
use crate::query::BitSetDocSet;
use crate::tests;
use crate::tests::generate_nonunique_unsorted;
@@ -278,11 +277,13 @@ mod tests {
}
assert_eq!(btreeset.len(), bitset.len());
let mut bitset_docset = BitSetDocSet::from(bitset);
let mut remaining = true;
for el in btreeset.into_iter() {
bitset_docset.advance();
assert!(remaining);
assert_eq!(bitset_docset.doc(), el);
remaining = bitset_docset.advance() != TERMINATED;
}
assert!(!bitset_docset.advance());
assert!(!remaining);
}
#[test]

View File

@@ -1,10 +1,8 @@
use crate::collector::Collector;
use crate::collector::SegmentCollector;
use crate::core::Executor;
use crate::core::InvertedIndexReader;
use crate::core::SegmentReader;
use crate::query::Query;
use crate::query::Weight;
use crate::schema::Document;
use crate::schema::Schema;
use crate::schema::{Field, Term};
@@ -16,18 +14,6 @@ use crate::Index;
use std::fmt;
use std::sync::Arc;
fn collect_segment<C: Collector>(
collector: &C,
weight: &dyn Weight,
segment_ord: u32,
segment_reader: &SegmentReader,
) -> crate::Result<C::Fruit> {
let mut scorer = weight.scorer(segment_reader, 1.0f32)?;
let segment_collector =
collector.for_segment(segment_ord as u32, segment_reader)?;
Ok(segment_collector.collect_scorer(&mut scorer, segment_reader.delete_bitset()))
}
/// Holds a list of `SegmentReader`s ready for search.
///
/// It guarantees that the `Segment` will not be removed before
@@ -154,12 +140,8 @@ impl Searcher {
let segment_readers = self.segment_readers();
let fruits = executor.map(
|(segment_ord, segment_reader)| {
collect_segment(
collector,
weight.as_ref(),
segment_ord as u32,
segment_reader,
)
let mut scorer = weight.scorer(segment_reader, 1.0f32)?;
collector.collect_segment(scorer.as_mut(), segment_ord as u32, segment_reader)
},
segment_readers.iter().enumerate(),
)?;

View File

@@ -1,58 +1,47 @@
use crate::common::BitSet;
use crate::fastfield::DeleteBitSet;
use crate::DocId;
use std::borrow::Borrow;
use std::borrow::BorrowMut;
use std::cmp::Ordering;
/// Expresses the outcome of a call to `DocSet`'s `.skip_next(...)`.
#[derive(PartialEq, Eq, Debug)]
pub enum SkipResult {
/// target was in the docset
Reached,
/// target was not in the docset, skipping stopped as a greater element was found
OverStep,
/// the docset was entirely consumed without finding the target, nor any
/// element greater than the target.
End,
}
/// Sentinel value returned when a DocSet has been entirely consumed.
///
/// This is not u32::MAX as one would have expected, due to the lack of SSE2 instructions
/// to compare [u32; 4].
pub const TERMINATED: DocId = std::i32::MAX as u32;
/// Represents an iterable set of sorted doc ids.
pub trait DocSet {
/// Goes to the next element.
/// `.advance(...)` needs to be called a first time to point to the correct
/// element.
fn advance(&mut self) -> bool;
///
/// The DocId of the next element is returned.
/// In other words, we should always have:
/// ```ignore
/// let doc = docset.advance();
/// assert_eq!(doc, docset.doc());
/// ```
///
/// If we reached the end of the DocSet, TERMINATED should be returned.
///
/// Calling `.advance()` on a terminated DocSet should be supported, and TERMINATED should
/// be returned.
/// TODO Test existing docsets.
fn advance(&mut self) -> DocId;
/// After skipping, position the iterator in such a way that `.doc()`
/// will return a value greater than or equal to target.
/// Advances the DocSet forward until reaching the target, or going to the
/// lowest DocId greater than the target.
///
/// SkipResult expresses whether the `target value` was reached, overstepped,
/// or if the `DocSet` was entirely consumed without finding any value
/// greater or equal to the `target`.
/// If the end of the DocSet is reached, TERMINATED is returned.
///
/// WARNING: Calling skip always advances the docset.
/// More specifically, if the docset is already positionned on the target
/// skipping will advance to the next position and return SkipResult::Overstep.
/// Calling `.seek(target)` on a terminated DocSet is legal. Implementations
/// of DocSet should support it.
///
/// If `.skip_next()` oversteps, then the docset must be positionned correctly
/// on an existing document. In other words, `.doc()` should return the first document
/// greater than `DocId`.
fn skip_next(&mut self, target: DocId) -> SkipResult {
if !self.advance() {
return SkipResult::End;
}
loop {
match self.doc().cmp(&target) {
Ordering::Less => {
if !self.advance() {
return SkipResult::End;
}
}
Ordering::Equal => return SkipResult::Reached,
Ordering::Greater => return SkipResult::OverStep,
}
/// Calling `seek(TERMINATED)` is also legal and is the normal way to consume a DocSet.
fn seek(&mut self, target: DocId) -> DocId {
let mut doc = self.doc();
while doc < target {
doc = self.advance();
}
doc
}
/// Fills a given mutable buffer with the next doc ids from the
@@ -71,38 +60,38 @@ pub trait DocSet {
/// use case where batching. The normal way to
/// go through the `DocId`'s is to call `.advance()`.
fn fill_buffer(&mut self, buffer: &mut [DocId]) -> usize {
if self.doc() == TERMINATED {
return 0;
}
for (i, buffer_val) in buffer.iter_mut().enumerate() {
if self.advance() {
*buffer_val = self.doc();
} else {
return i;
*buffer_val = self.doc();
if self.advance() == TERMINATED {
return i + 1;
}
}
buffer.len()
}
/// Returns the current document
/// Right after creating a new DocSet, the docset points to the first document.
///
/// If the DocSet is empty, .doc() should return `TERMINATED`.
fn doc(&self) -> DocId;
/// Returns a best-effort hint of the
/// length of the docset.
fn size_hint(&self) -> u32;
/// Appends all docs to a `bitset`.
fn append_to_bitset(&mut self, bitset: &mut BitSet) {
while self.advance() {
bitset.insert(self.doc());
}
}
/// Returns the number of documents matching.
/// Calling this method consumes the `DocSet`.
fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
let mut count = 0u32;
while self.advance() {
if !delete_bitset.is_deleted(self.doc()) {
let mut doc = self.doc();
while doc != TERMINATED {
if !delete_bitset.is_deleted(doc) {
count += 1u32;
}
doc = self.advance();
}
count
}
@@ -114,22 +103,42 @@ pub trait DocSet {
/// given by `count()`.
fn count_including_deleted(&mut self) -> u32 {
let mut count = 0u32;
while self.advance() {
let mut doc = self.doc();
while doc != TERMINATED {
count += 1u32;
doc = self.advance();
}
count
}
}
impl<'a> DocSet for &'a mut dyn DocSet {
fn advance(&mut self) -> u32 {
(**self).advance()
}
fn seek(&mut self, target: DocId) -> DocId {
(**self).seek(target)
}
fn doc(&self) -> u32 {
(**self).doc()
}
fn size_hint(&self) -> u32 {
(**self).size_hint()
}
}
impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
fn advance(&mut self) -> bool {
fn advance(&mut self) -> DocId {
let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.advance()
}
fn skip_next(&mut self, target: DocId) -> SkipResult {
fn seek(&mut self, target: DocId) -> DocId {
let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.skip_next(target)
unboxed.seek(target)
}
fn doc(&self) -> DocId {
@@ -151,9 +160,4 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.count_including_deleted()
}
fn append_to_bitset(&mut self, bitset: &mut BitSet) {
let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.append_to_bitset(bitset);
}
}

View File

@@ -10,7 +10,7 @@ use crate::core::SegmentMeta;
use crate::core::SegmentReader;
use crate::directory::TerminatingWrite;
use crate::directory::{DirectoryLock, GarbageCollectionResult};
use crate::docset::DocSet;
use crate::docset::{DocSet, TERMINATED};
use crate::error::TantivyError;
use crate::fastfield::write_delete_bitset;
use crate::indexer::delete_queue::{DeleteCursor, DeleteQueue};
@@ -112,15 +112,15 @@ fn compute_deleted_bitset(
if let Some(mut docset) =
inverted_index.read_postings(&delete_op.term, IndexRecordOption::Basic)
{
while docset.advance() {
let deleted_doc = docset.doc();
let mut deleted_doc = docset.doc();
while deleted_doc != TERMINATED {
if deleted_doc < limit_doc {
delete_bitset.insert(deleted_doc);
might_have_changed = true;
}
deleted_doc = docset.advance();
}
}
delete_cursor.advance();
}
Ok(might_have_changed)

View File

@@ -2,7 +2,7 @@ use crate::common::MAX_DOC_LIMIT;
use crate::core::Segment;
use crate::core::SegmentReader;
use crate::core::SerializableSegment;
use crate::docset::DocSet;
use crate::docset::{DocSet, TERMINATED};
use crate::fastfield::BytesFastFieldReader;
use crate::fastfield::DeleteBitSet;
use crate::fastfield::FastFieldReader;
@@ -574,10 +574,12 @@ impl IndexMerger {
let inverted_index = segment_reader.inverted_index(indexed_field);
let mut segment_postings = inverted_index
.read_postings_from_terminfo(term_info, segment_postings_option);
while segment_postings.advance() {
if !segment_reader.is_deleted(segment_postings.doc()) {
let mut doc = segment_postings.doc();
while doc != TERMINATED {
if !segment_reader.is_deleted(doc) {
return Some((segment_ord, segment_postings));
}
doc = segment_postings.advance();
}
None
})
@@ -604,17 +606,9 @@ impl IndexMerger {
// postings serializer.
for (segment_ord, mut segment_postings) in segment_postings {
let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
loop {
let doc = segment_postings.doc();
// `.advance()` has been called once before the loop.
//
// It was required to make sure we only consider segments
// that effectively contain at least one non-deleted document
// and remove terms that do not have documents associated.
//
// For this reason, we cannot use a `while segment_postings.advance()` loop.
let mut doc = segment_postings.doc();
while doc != TERMINATED {
// deleted doc are skipped as they do not have a `remapped_doc_id`.
if let Some(remapped_doc_id) = old_to_new_doc_id[doc as usize] {
// we make sure to only write the term iff
@@ -629,9 +623,8 @@ impl IndexMerger {
delta_positions,
)?;
}
if !segment_postings.advance() {
break;
}
doc = segment_postings.advance();
}
}

View File

@@ -156,7 +156,7 @@ mod snippet;
pub use self::snippet::{Snippet, SnippetGenerator};
mod docset;
pub use self::docset::{DocSet, SkipResult};
pub use self::docset::{DocSet, TERMINATED};
pub use crate::common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
pub use crate::core::{Executor, SegmentComponent};
pub use crate::core::{Index, IndexMeta, Searcher, Segment, SegmentId, SegmentMeta};
@@ -285,7 +285,7 @@ mod tests {
use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
use crate::core::SegmentReader;
use crate::docset::DocSet;
use crate::docset::{DocSet, TERMINATED};
use crate::query::BooleanQuery;
use crate::schema::*;
use crate::DocAddress;
@@ -381,19 +381,12 @@ mod tests {
index_writer.commit().unwrap();
}
{
{
let doc = doc!(text_field=>"a");
index_writer.add_document(doc);
}
{
let doc = doc!(text_field=>"a a");
index_writer.add_document(doc);
}
index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"a a"));
index_writer.commit().unwrap();
}
{
let doc = doc!(text_field=>"c");
index_writer.add_document(doc);
index_writer.add_document(doc!(text_field=>"c"));
index_writer.commit().unwrap();
}
{
@@ -472,10 +465,12 @@ mod tests {
}
fn advance_undeleted(docset: &mut dyn DocSet, reader: &SegmentReader) -> bool {
while docset.advance() {
if !reader.is_deleted(docset.doc()) {
let mut doc = docset.advance();
while doc != TERMINATED {
if !reader.is_deleted(doc) {
return true;
}
doc = docset.advance();
}
false
}
@@ -641,9 +636,8 @@ mod tests {
.inverted_index(term.field())
.read_postings(&term, IndexRecordOption::Basic)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 0);
assert!(!postings.advance());
assert_eq!(postings.advance(), TERMINATED);
}
#[test]
@@ -665,9 +659,8 @@ mod tests {
.inverted_index(term.field())
.read_postings(&term, IndexRecordOption::Basic)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 0);
assert!(!postings.advance());
assert_eq!(postings.advance(), TERMINATED);
}
#[test]
@@ -689,9 +682,8 @@ mod tests {
.inverted_index(term.field())
.read_postings(&term, IndexRecordOption::Basic)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 0);
assert!(!postings.advance());
assert_eq!(postings.advance(), TERMINATED);
}
#[test]
@@ -760,10 +752,8 @@ mod tests {
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{
let doc = doc!(text_field=>"af af af bc bc");
index_writer.add_document(doc);
}
let doc = doc!(text_field=>"af af af bc bc");
index_writer.add_document(doc);
index_writer.commit().unwrap();
}
{
@@ -779,10 +769,9 @@ mod tests {
let mut postings = inverted_index
.read_postings(&term_af, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 0);
assert_eq!(postings.term_freq(), 3);
assert!(!postings.advance());
assert_eq!(postings.advance(), TERMINATED);
}
}

View File

@@ -1,16 +0,0 @@
use crate::postings::Postings;
use crate::DocId;
/// Inverted list with additional information about the maximum term frequency
/// within a block, as well as globally within the list.
///
/// The trait extends `Postings`, so an implementor is also a regular posting
/// list that can be iterated.
pub trait BlockMaxPostings: Postings {
    /// Returns the maximum frequency in the entire list.
    fn max_term_freq(&self) -> u32;
    /// Returns the maximum frequency in the current block.
    ///
    /// Unlike the other accessors, this one takes `&mut self`: an
    /// implementation may have to move internal state to answer.
    fn block_max_term_freq(&mut self) -> u32;
    /// Returns the document with the largest frequency.
    fn max_doc(&self) -> DocId;
    /// Returns the document with the largest frequency within the current
    /// block.
    fn block_max_doc(&self) -> DocId;
}

View File

@@ -1,76 +0,0 @@
use crate::postings::{BlockMaxPostings, Postings, SegmentPostings};
use crate::{DocId, DocSet, SkipResult};
/// A wrapper over [`SegmentPostings`](./struct.SegmentPostings.html)
/// with max block frequencies.
pub struct BlockMaxSegmentPostings {
    // Wrapped posting list; all `DocSet` and `Postings` calls are forwarded to it.
    postings: SegmentPostings,
    // Secondary posting list queried for the per-block maximum
    // (its `doc()` / `term_freq()` back `block_max_doc()` / `block_max_term_freq()`).
    max_blocks: SegmentPostings,
    // Value returned by `max_doc()`.
    doc_with_max_term_freq: DocId,
    // Value returned by `max_term_freq()`.
    max_term_freq: u32,
}
impl BlockMaxSegmentPostings {
/// Constructs a new segment postings with block-max information.
pub fn new(
postings: SegmentPostings,
max_blocks: SegmentPostings,
doc_with_max_term_freq: DocId,
max_term_freq: u32,
) -> Self {
Self {
postings,
max_blocks,
doc_with_max_term_freq,
max_term_freq,
}
}
}
// `DocSet` implementation: every method forwards to the wrapped `postings`.
// The `max_blocks` posting list is not touched by any of these methods.
impl DocSet for BlockMaxSegmentPostings {
    fn advance(&mut self) -> bool {
        self.postings.advance()
    }
    fn doc(&self) -> DocId {
        self.postings.doc()
    }
    fn size_hint(&self) -> u32 {
        self.postings.size_hint()
    }
    fn skip_next(&mut self, target: DocId) -> SkipResult {
        self.postings.skip_next(target)
    }
}
// `Postings` implementation: term frequency and positions are read from the
// wrapped `postings`, with the arguments passed through unchanged.
impl Postings for BlockMaxSegmentPostings {
    fn term_freq(&self) -> u32 {
        self.postings.term_freq()
    }
    fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
        self.postings.positions_with_offset(offset, output);
    }
    fn positions(&mut self, output: &mut Vec<u32>) {
        self.postings.positions(output);
    }
}
impl BlockMaxPostings for BlockMaxSegmentPostings {
    /// Returns the list-wide maximum term frequency given at construction.
    fn max_term_freq(&self) -> u32 {
        self.max_term_freq
    }

    /// Moves `max_blocks` with `skip_next` towards the current doc and returns
    /// the term frequency it then points to.
    ///
    /// # Panics
    ///
    /// Panics if `max_blocks` reports `SkipResult::End`.
    fn block_max_term_freq(&mut self) -> u32 {
        // NOTE(review): `skip_next` appears to always move the cursor forward,
        // so calling this twice for the same doc may overstep -- confirm.
        let current_doc = self.doc();
        match self.max_blocks.skip_next(current_doc) {
            SkipResult::End => panic!("Max blocks corrupted: reached end of max block"),
            _ => self.max_blocks.term_freq(),
        }
    }

    /// Returns the doc given at construction as holding the maximum term frequency.
    fn max_doc(&self) -> DocId {
        self.doc_with_max_term_freq
    }

    /// Returns the doc `max_blocks` currently points to.
    fn block_max_doc(&self) -> DocId {
        self.max_blocks.doc()
    }
}

View File

@@ -129,23 +129,23 @@ impl BlockSearcher {
///
/// If SSE2 instructions are available in the `(platform, running CPU)`,
/// then we use a different implementation that does an exhaustive linear search over
/// the full block whenever the block is full (`len == 128`). It is surprisingly faster, most likely because of the lack
/// of branch.
/// the block regardless of whether the block is full or not.
///
/// Indeed, if the block is not full, the remaining items are TERMINATED.
/// It is surprisingly faster, most likely because of the lack of branch misprediction.
pub(crate) fn search_in_block(
self,
block_docs: &AlignedBuffer,
len: usize,
start: usize,
target: u32,
) -> usize {
#[cfg(target_arch = "x86_64")]
{
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
if self == BlockSearcher::SSE2 && len == COMPRESSION_BLOCK_SIZE {
if self == BlockSearcher::SSE2 {
return sse2::linear_search_sse2_128(block_docs, target);
}
}
start + galloping(&block_docs.0[start..len], target)
start + galloping(&block_docs.0[start..], target)
}
}
@@ -166,6 +166,7 @@ mod tests {
use super::exponential_search;
use super::linear_search;
use super::BlockSearcher;
use crate::docset::TERMINATED;
use crate::postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
#[test]
@@ -196,16 +197,11 @@ mod tests {
fn util_test_search_in_block(block_searcher: BlockSearcher, block: &[u32], target: u32) {
let cursor = search_in_block_trivial_but_slow(block, target);
assert!(block.len() < COMPRESSION_BLOCK_SIZE);
let mut output_buffer = [u32::max_value(); COMPRESSION_BLOCK_SIZE];
let mut output_buffer = [TERMINATED; COMPRESSION_BLOCK_SIZE];
output_buffer[..block.len()].copy_from_slice(block);
for i in 0..cursor {
assert_eq!(
block_searcher.search_in_block(
&AlignedBuffer(output_buffer),
block.len(),
i,
target
),
block_searcher.search_in_block(&AlignedBuffer(output_buffer), i, target),
cursor
);
}

View File

@@ -1,316 +0,0 @@
use crate::DocId;
use tantivy_fst::Streamer;
use crate::postings::{SkipReader, FreqReadingOption, USE_SKIP_INFO_LIMIT};
use owned_read::OwnedRead;
use crate::postings::compression::{BlockDecoder, COMPRESSION_BLOCK_SIZE, VIntDecoder, compressed_block_size, AlignedBuffer};
use crate::schema::IndexRecordOption;
use crate::common::{VInt, BinarySerializable};
/// Splits the serialized data of a term into `(skip data, postings data)`.
///
/// Lists with fewer than `USE_SKIP_INFO_LIMIT` docs carry no skip data: the
/// input is returned untouched as the postings data. Otherwise the data starts
/// with a `VInt` holding the byte length of the skip section, followed by the
/// skip section itself and then the postings.
///
/// # Panics
///
/// Panics with "Data corrupted" if the skip length cannot be deserialized.
fn split_into_skips_and_postings(
    doc_freq: u32,
    mut data: OwnedRead,
) -> (Option<OwnedRead>, OwnedRead) {
    if doc_freq < USE_SKIP_INFO_LIMIT {
        return (None, data);
    }
    let skip_len = VInt::deserialize(&mut data).expect("Data corrupted").0 as usize;
    // The postings start right after the skip section...
    let mut postings_data = data.clone();
    postings_data.advance(skip_len);
    // ... and the skip section is what remains once clipped to its length.
    data.clip(skip_len);
    (Some(data), postings_data)
}
/// `BlockSegmentPostings` is a cursor iterating over blocks
/// of documents.
///
/// # Warning
///
/// While it is useful for some very specific high-performance
/// use cases, you should prefer using `SegmentPostings` for most usage.
pub struct BlockSegmentPostings {
    // Decoder holding the doc ids of the current block.
    doc_decoder: BlockDecoder,
    // Decoder holding the term frequencies of the current block.
    // It is only refreshed when `freq_reading_option` is `ReadFreq`.
    freq_decoder: BlockDecoder,
    // How term frequencies are handled when a block is read:
    // `NoFreq`, `SkipFreq` (bytes are skipped) or `ReadFreq` (decoded).
    freq_reading_option: FreqReadingOption,
    // Number of docs in the posting list (deleted docs are not taken into account).
    doc_freq: usize,
    // Offset handed to the sorted decoders; starts at 0 and is updated
    // each time a bit-packed block is consumed or skipped.
    doc_offset: DocId,
    // Number of docs in the trailing VInt-encoded block
    // (`doc_freq % COMPRESSION_BLOCK_SIZE`); set to 0 once that block is decoded.
    num_vint_docs: usize,
    // Postings bytes that have not been consumed yet.
    remaining_data: OwnedRead,
    // Cursor over the skip data; built over an empty buffer when the
    // posting list carries no skip data.
    skip_reader: SkipReader,
}
/// Outcome of `BlockSegmentPostings::skip_to`.
#[derive(Debug, Eq, PartialEq)]
pub enum BlockSegmentPostingsSkipResult {
    /// No block whose last doc is greater or equal to the target was found.
    Terminated,
    /// A block was found and decoded. The payload is the number of term
    /// freqs to skip: the sum of `tf_sum()` over the blocks that were
    /// skipped without being decoded.
    Success(u32),
}
impl BlockSegmentPostings {
/// Builds a block cursor over the serialized postings of one term.
///
/// - `doc_freq`: number of docs in the posting list.
/// - `data`: serialized skip data (if any) followed by the postings.
/// - `record_option`: what was recorded in the index for this field.
/// - `requested_option`: what the caller wants to read.
///
/// Term frequencies are considered absent if only `Basic` was recorded,
/// skipped if they were recorded but `Basic` is requested, and decoded
/// otherwise.
pub(crate) fn from_data(
    doc_freq: u32,
    data: OwnedRead,
    record_option: IndexRecordOption,
    requested_option: IndexRecordOption,
) -> BlockSegmentPostings {
    let freq_reading_option = match record_option {
        IndexRecordOption::Basic => FreqReadingOption::NoFreq,
        _ => match requested_option {
            IndexRecordOption::Basic => FreqReadingOption::SkipFreq,
            _ => FreqReadingOption::ReadFreq,
        },
    };
    let (skip_data_opt, remaining_data) = split_into_skips_and_postings(doc_freq, data);
    // Without skip data, the skip reader runs over an empty buffer.
    let skip_data = skip_data_opt.unwrap_or_else(|| OwnedRead::new(&[][..]));
    let skip_reader = SkipReader::new(skip_data, record_option);
    let doc_freq = doc_freq as usize;
    BlockSegmentPostings {
        // Docs that do not fill a whole block are VInt-encoded at the end.
        num_vint_docs: doc_freq % COMPRESSION_BLOCK_SIZE,
        doc_decoder: BlockDecoder::new(),
        freq_decoder: BlockDecoder::with_val(1),
        freq_reading_option,
        doc_offset: 0,
        doc_freq,
        remaining_data,
        skip_reader,
    }
}
// Resets the block segment postings on another position
// in the postings file.
//
// This is useful for enumerating through a list of terms,
// and consuming the associated posting lists while avoiding
// reallocating a `BlockSegmentPostings`.
//
// Only the cursor state is replaced (`num_vint_docs`, `remaining_data`,
// the skip reader, `doc_offset` and `doc_freq`); the decoders and
// `freq_reading_option` are left as they were.
//
// # Warning
//
// This does not reset the positions list.
pub(crate) fn reset(&mut self, doc_freq: u32, postings_data: OwnedRead) {
    let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, postings_data);
    let doc_freq = doc_freq as usize;
    // Same formula as `from_data`. A plain remainder stays correct whatever
    // the block size is, whereas a bitmask silently assumes a power of two.
    self.num_vint_docs = doc_freq % COMPRESSION_BLOCK_SIZE;
    self.remaining_data = postings_data;
    // Without skip data, the skip reader is reset over an empty buffer.
    let skip_data = skip_data_opt.unwrap_or_else(|| OwnedRead::new(&[][..]));
    self.skip_reader.reset(skip_data);
    self.doc_offset = 0;
    self.doc_freq = doc_freq;
}
/// Returns the document frequency associated to this block postings.
///
/// This `doc_freq` is simply the sum of the lengths of all of the
/// blocks, and it does not take deleted documents into account.
pub fn doc_freq(&self) -> usize {
    self.doc_freq
}
/// Returns the array of docs in the current block.
///
/// Before the first call to `.advance()`, the block
/// returned by `.docs()` is empty.
///
/// The slice is the doc decoder's output array.
#[inline]
pub fn docs(&self) -> &[DocId] {
    self.doc_decoder.output_array()
}
/// Returns the doc decoder's aligned output buffer together with a length,
/// exactly as given by `BlockDecoder::output_aligned()`.
pub(crate) fn docs_aligned(&self) -> (&AlignedBuffer, usize) {
    self.doc_decoder.output_aligned()
}
/// Returns the document at index `idx` of the current block.
///
/// `idx` is handed as-is to the doc decoder's `output(idx)`.
#[inline]
pub fn doc(&self, idx: usize) -> u32 {
    self.doc_decoder.output(idx)
}
/// Returns the array of term frequencies of the current block.
///
/// The frequency decoder is only refreshed when term frequencies are
/// actually read (`FreqReadingOption::ReadFreq`); otherwise this is
/// whatever the decoder already held.
#[inline]
pub fn freqs(&self) -> &[u32] {
    self.freq_decoder.output_array()
}
/// Returns the term frequency at index `idx` of the current block.
///
/// `idx` is handed as-is to the frequency decoder's `output(idx)`.
#[inline]
pub fn freq(&self, idx: usize) -> u32 {
    self.freq_decoder.output(idx)
}
/// Returns the length of the current block.
///
/// All blocks have a length of `COMPRESSION_BLOCK_SIZE`,
/// except the last block that may have a length
/// of any number between 1 and `COMPRESSION_BLOCK_SIZE - 1`.
///
/// The value is the doc decoder's `output_len`.
#[inline]
pub(crate) fn block_len(&self) -> usize {
    self.doc_decoder.output_len
}
/// Positions the cursor on a block that may contain `target_doc`.
/// This always advances the current block.
///
/// Returns `BlockSegmentPostingsSkipResult::Success(skip_freqs)` if a block
/// whose last element is greater or equal to the target is found. It does
/// not guarantee that the smallest element of the block is smaller
/// than the target. `skip_freqs` is the sum of the term frequencies of
/// the blocks that were skipped without being decoded.
///
/// Returns `BlockSegmentPostingsSkipResult::Terminated` iff all of the
/// remaining documents are smaller than `target_doc`. In that case, all
/// of these documents are consumed.
pub fn skip_to(&mut self, target_doc: DocId) -> BlockSegmentPostingsSkipResult {
    let mut skip_freqs = 0u32;
    while self.skip_reader.advance() {
        if self.skip_reader.doc() < target_doc {
            // The whole block lies before the target:
            // jump over it without decoding anything.
            skip_freqs += self.skip_reader.tf_sum();
            let advance_len = self.skip_reader.total_block_len();
            self.doc_offset = self.skip_reader.doc();
            self.remaining_data.advance(advance_len);
            continue;
        }
        // The last document of the current block is greater or equal
        // to the target.
        //
        // We found our block!
        let num_bits = self.skip_reader.doc_num_bits();
        let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted(
            self.remaining_data.as_ref(),
            self.doc_offset,
            num_bits,
        );
        self.remaining_data.advance(num_consumed_bytes);
        let tf_num_bits = self.skip_reader.tf_num_bits();
        match self.freq_reading_option {
            FreqReadingOption::NoFreq => {}
            FreqReadingOption::SkipFreq => {
                let num_bytes_to_skip = compressed_block_size(tf_num_bits);
                self.remaining_data.advance(num_bytes_to_skip);
            }
            FreqReadingOption::ReadFreq => {
                let num_consumed_bytes = self
                    .freq_decoder
                    .uncompress_block_unsorted(self.remaining_data.as_ref(), tf_num_bits);
                self.remaining_data.advance(num_consumed_bytes);
            }
        }
        self.doc_offset = self.skip_reader.doc();
        return BlockSegmentPostingsSkipResult::Success(skip_freqs);
    }

    if self.num_vint_docs == 0 {
        // No trailing block left: everything has been consumed.
        return BlockSegmentPostingsSkipResult::Terminated;
    }

    // We are now on the last, incomplete, variable encoded block.
    let num_compressed_bytes = self.doc_decoder.uncompress_vint_sorted(
        self.remaining_data.as_ref(),
        self.doc_offset,
        self.num_vint_docs,
    );
    self.remaining_data.advance(num_compressed_bytes);
    match self.freq_reading_option {
        FreqReadingOption::NoFreq | FreqReadingOption::SkipFreq => {}
        FreqReadingOption::ReadFreq => {
            self.freq_decoder
                .uncompress_vint_unsorted(self.remaining_data.as_ref(), self.num_vint_docs);
        }
    }
    // Mark the trailing block as consumed.
    self.num_vint_docs = 0;
    match self.docs().last() {
        Some(&last_doc) if last_doc >= target_doc => {
            BlockSegmentPostingsSkipResult::Success(skip_freqs)
        }
        _ => BlockSegmentPostingsSkipResult::Terminated,
    }
}
/// Advances to the next block.
///
/// Bit-packed blocks announced by the skip reader are decoded first; once
/// they are exhausted, the trailing VInt-encoded block (if any) is decoded
/// exactly once.
///
/// Returns false iff there were no remaining blocks.
pub fn advance(&mut self) -> bool {
    if self.skip_reader.advance() {
        let doc_num_bits = self.skip_reader.doc_num_bits();
        let doc_bytes = self.doc_decoder.uncompress_block_sorted(
            self.remaining_data.as_ref(),
            self.doc_offset,
            doc_num_bits,
        );
        self.remaining_data.advance(doc_bytes);
        let tf_num_bits = self.skip_reader.tf_num_bits();
        match self.freq_reading_option {
            FreqReadingOption::NoFreq => {}
            FreqReadingOption::SkipFreq => {
                // Frequencies are stored but not wanted: jump over their bytes.
                self.remaining_data
                    .advance(compressed_block_size(tf_num_bits));
            }
            FreqReadingOption::ReadFreq => {
                let freq_bytes = self
                    .freq_decoder
                    .uncompress_block_unsorted(self.remaining_data.as_ref(), tf_num_bits);
                self.remaining_data.advance(freq_bytes);
            }
        }
        // The last doc of this block will be used as the next offset.
        self.doc_offset = self.doc_decoder.output(COMPRESSION_BLOCK_SIZE - 1);
        return true;
    }

    if self.num_vint_docs == 0 {
        return false;
    }

    // Trailing, incomplete, variable encoded block.
    let vint_bytes = self.doc_decoder.uncompress_vint_sorted(
        self.remaining_data.as_ref(),
        self.doc_offset,
        self.num_vint_docs,
    );
    self.remaining_data.advance(vint_bytes);
    match self.freq_reading_option {
        FreqReadingOption::NoFreq | FreqReadingOption::SkipFreq => {}
        FreqReadingOption::ReadFreq => {
            self.freq_decoder
                .uncompress_vint_unsorted(self.remaining_data.as_ref(), self.num_vint_docs);
        }
    }
    // Mark the trailing block as consumed so the next call returns false.
    self.num_vint_docs = 0;
    true
}
/// Returns an empty segment postings object.
///
/// It has a `doc_freq` of 0, no trailing VInt block, and both its data and
/// its skip reader are built over empty buffers.
pub fn empty() -> BlockSegmentPostings {
    let skip_reader = SkipReader::new(OwnedRead::new(vec![]), IndexRecordOption::Basic);
    let remaining_data = OwnedRead::new(vec![]);
    BlockSegmentPostings {
        doc_decoder: BlockDecoder::new(),
        freq_decoder: BlockDecoder::with_val(1),
        freq_reading_option: FreqReadingOption::NoFreq,
        doc_freq: 0,
        doc_offset: 0,
        num_vint_docs: 0,
        remaining_data,
        skip_reader,
    }
}
}
/// Streams the posting list block by block: each item is the slice of doc
/// ids of the block reached by one call to `advance()`.
impl<'a> Streamer<'a> for BlockSegmentPostings {
    type Item = &'a [DocId];

    fn next(&'a mut self) -> Option<&'a [DocId]> {
        if !self.advance() {
            // No block left.
            return None;
        }
        Some(self.docs())
    }
}

View File

@@ -1,4 +1,5 @@
use crate::common::FixedSize;
use crate::docset::TERMINATED;
use bitpacking::{BitPacker, BitPacker4x};
pub const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN;
@@ -90,14 +91,18 @@ impl BlockDecoder {
}
#[inline]
pub(crate) fn output_aligned(&self) -> (&AlignedBuffer, usize) {
(&self.output, self.output_len)
pub(crate) fn output_aligned(&self) -> &AlignedBuffer {
&self.output
}
/// Returns the value stored at position `idx` of the output buffer.
///
/// The buffer is indexed directly, so an out-of-range `idx` panics.
#[inline]
pub fn output(&self, idx: usize) -> u32 {
    self.output.0[idx]
}
/// Overwrites every slot of the output buffer with `TERMINATED`.
pub fn clear(&mut self) {
    for slot in self.output.0.iter_mut() {
        *slot = TERMINATED;
    }
}
}
pub trait VIntEncoder {
@@ -134,9 +139,9 @@ pub trait VIntDecoder {
/// For instance, if delta encoded are `1, 3, 9`, and the
/// `offset` is 5, then the output will be:
/// `5 + 1 = 6, 6 + 3= 9, 9 + 9 = 18`
fn uncompress_vint_sorted<'a>(
fn uncompress_vint_sorted(
&mut self,
compressed_data: &'a [u8],
compressed_data: &[u8],
offset: u32,
num_els: usize,
) -> usize;
@@ -146,7 +151,7 @@ pub trait VIntDecoder {
///
/// The method takes a number of int to decompress, and returns
/// the amount of bytes that were read to decompress them.
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize;
fn uncompress_vint_unsorted(&mut self, compressed_data: &[u8], num_els: usize) -> usize;
}
impl VIntEncoder for BlockEncoder {
@@ -160,9 +165,9 @@ impl VIntEncoder for BlockEncoder {
}
impl VIntDecoder for BlockDecoder {
fn uncompress_vint_sorted<'a>(
fn uncompress_vint_sorted(
&mut self,
compressed_data: &'a [u8],
compressed_data: &[u8],
offset: u32,
num_els: usize,
) -> usize {

View File

@@ -42,7 +42,7 @@ pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a
}
#[inline(always)]
pub fn uncompress_sorted<'a>(compressed_data: &'a [u8], output: &mut [u32], offset: u32) -> usize {
pub fn uncompress_sorted(compressed_data: &[u8], output: &mut [u32], offset: u32) -> usize {
let mut read_byte = 0;
let mut result = offset;
for output_mut in output.iter_mut() {

View File

@@ -2,8 +2,6 @@
Postings module (also called inverted index)
*/
mod block_max_postings;
mod block_max_segment_postings;
mod block_search;
pub(crate) mod compression;
/// Postings module
@@ -14,7 +12,6 @@ mod postings;
mod postings_writer;
mod recorder;
mod segment_postings;
mod block_segment_postings;
mod serializer;
mod skip;
mod stacker;
@@ -30,11 +27,7 @@ pub use self::postings::Postings;
pub(crate) use self::skip::SkipReader;
pub use self::term_info::TermInfo;
pub use self::segment_postings::SegmentPostings;
pub use self::block_segment_postings::BlockSegmentPostings;
pub use self::block_max_postings::BlockMaxPostings;
pub use self::block_max_segment_postings::BlockMaxSegmentPostings;
pub use self::segment_postings::{BlockSegmentPostings, SegmentPostings};
pub(crate) use self::stacker::compute_table_size;
@@ -58,7 +51,7 @@ pub mod tests {
use crate::core::Index;
use crate::core::SegmentComponent;
use crate::core::SegmentReader;
use crate::docset::{DocSet, SkipResult};
use crate::docset::{DocSet, TERMINATED};
use crate::fieldnorm::FieldNormReader;
use crate::indexer::operation::AddOperation;
use crate::indexer::SegmentWriter;
@@ -122,29 +115,12 @@ pub mod tests {
let mut postings = inverted_index
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
postings.advance();
assert_eq!(postings.doc(), 0);
postings.positions(&mut positions);
assert_eq!(&[0, 1, 2], &positions[..]);
postings.positions(&mut positions);
assert_eq!(&[0, 1, 2], &positions[..]);
postings.advance();
postings.positions(&mut positions);
assert_eq!(&[0, 5], &positions[..]);
}
{
let mut postings = inverted_index
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
postings.advance();
postings.advance();
postings.positions(&mut positions);
assert_eq!(&[0, 5], &positions[..]);
}
{
let mut postings = inverted_index
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert_eq!(postings.skip_next(1), SkipResult::Reached);
assert_eq!(postings.advance(), 1);
assert_eq!(postings.doc(), 1);
postings.positions(&mut positions);
assert_eq!(&[0, 5], &positions[..]);
@@ -153,7 +129,25 @@ pub mod tests {
let mut postings = inverted_index
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert_eq!(postings.skip_next(1002), SkipResult::Reached);
assert_eq!(postings.doc(), 0);
assert_eq!(postings.advance(), 1);
postings.positions(&mut positions);
assert_eq!(&[0, 5], &positions[..]);
}
{
let mut postings = inverted_index
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert_eq!(postings.seek(1), 1);
assert_eq!(postings.doc(), 1);
postings.positions(&mut positions);
assert_eq!(&[0, 5], &positions[..]);
}
{
let mut postings = inverted_index
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert_eq!(postings.seek(1002), 1002);
assert_eq!(postings.doc(), 1002);
postings.positions(&mut positions);
assert_eq!(&[0, 5], &positions[..]);
@@ -162,8 +156,8 @@ pub mod tests {
let mut postings = inverted_index
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert_eq!(postings.skip_next(100), SkipResult::Reached);
assert_eq!(postings.skip_next(1002), SkipResult::Reached);
assert_eq!(postings.seek(100), 100);
assert_eq!(postings.seek(1002), 1002);
assert_eq!(postings.doc(), 1002);
postings.positions(&mut positions);
assert_eq!(&[0, 5], &positions[..]);
@@ -288,22 +282,21 @@ pub mod tests {
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert_eq!(postings_a.len(), 1000);
assert!(postings_a.advance());
assert_eq!(postings_a.doc(), 0);
assert_eq!(postings_a.term_freq(), 6);
postings_a.positions(&mut positions);
assert_eq!(&positions[..], [0, 2, 4, 6, 7, 13]);
assert!(postings_a.advance());
assert_eq!(postings_a.advance(), 1u32);
assert_eq!(postings_a.doc(), 1u32);
assert_eq!(postings_a.term_freq(), 1);
for i in 2u32..1000u32 {
assert!(postings_a.advance());
assert_eq!(postings_a.advance(), i);
assert_eq!(postings_a.term_freq(), 1);
postings_a.positions(&mut positions);
assert_eq!(&positions[..], [i]);
assert_eq!(postings_a.doc(), i);
}
assert!(!postings_a.advance());
assert_eq!(postings_a.advance(), TERMINATED);
}
{
let term_e = Term::from_field_text(text_field, "e");
@@ -313,7 +306,6 @@ pub mod tests {
.unwrap();
assert_eq!(postings_e.len(), 1000 - 2);
for i in 2u32..1000u32 {
assert!(postings_e.advance());
assert_eq!(postings_e.term_freq(), i);
postings_e.positions(&mut positions);
assert_eq!(positions.len(), i as usize);
@@ -321,8 +313,9 @@ pub mod tests {
assert_eq!(positions[j], (j as u32));
}
assert_eq!(postings_e.doc(), i);
postings_e.advance();
}
assert!(!postings_e.advance());
assert_eq!(postings_e.doc(), TERMINATED);
}
}
}
@@ -336,16 +329,8 @@ pub mod tests {
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{
let mut doc = Document::default();
doc.add_text(text_field, "g b b d c g c");
index_writer.add_document(doc);
}
{
let mut doc = Document::default();
doc.add_text(text_field, "g a b b a d c g c");
index_writer.add_document(doc);
}
index_writer.add_document(doc!(text_field => "g b b d c g c"));
index_writer.add_document(doc!(text_field => "g a b b a d c g c"));
assert!(index_writer.commit().is_ok());
}
let term_a = Term::from_field_text(text_field, "a");
@@ -355,7 +340,6 @@ pub mod tests {
.inverted_index(text_field)
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 1u32);
postings.positions(&mut positions);
assert_eq!(&positions[..], &[1u32, 4]);
@@ -377,11 +361,8 @@ pub mod tests {
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
for i in 0..num_docs {
let mut doc = Document::default();
doc.add_u64(value_field, 2);
doc.add_u64(value_field, (i % 2) as u64);
for i in 0u64..num_docs as u64 {
let doc = doc!(value_field => 2u64, value_field => i % 2u64);
index_writer.add_document(doc);
}
assert!(index_writer.commit().is_ok());
@@ -398,11 +379,10 @@ pub mod tests {
.inverted_index(term_2.field())
.read_postings(&term_2, IndexRecordOption::Basic)
.unwrap();
assert_eq!(segment_postings.skip_next(i), SkipResult::Reached);
assert_eq!(segment_postings.seek(i), i);
assert_eq!(segment_postings.doc(), i);
assert_eq!(segment_postings.skip_next(j), SkipResult::Reached);
assert_eq!(segment_postings.seek(j), j);
assert_eq!(segment_postings.doc(), j);
}
}
@@ -414,17 +394,16 @@ pub mod tests {
.unwrap();
// check that `skip_next` advances the iterator
assert!(segment_postings.advance());
assert_eq!(segment_postings.doc(), 0);
assert_eq!(segment_postings.skip_next(1), SkipResult::Reached);
assert_eq!(segment_postings.seek(1), 1);
assert_eq!(segment_postings.doc(), 1);
assert_eq!(segment_postings.skip_next(1), SkipResult::OverStep);
assert_eq!(segment_postings.doc(), 2);
assert_eq!(segment_postings.seek(1), 1);
assert_eq!(segment_postings.doc(), 1);
// check that going beyond the end is handled
assert_eq!(segment_postings.skip_next(num_docs), SkipResult::End);
assert_eq!(segment_postings.seek(num_docs), TERMINATED);
}
// check that filtering works
@@ -435,7 +414,7 @@ pub mod tests {
.unwrap();
for i in 0..num_docs / 2 {
assert_eq!(segment_postings.skip_next(i * 2), SkipResult::Reached);
assert_eq!(segment_postings.seek(i * 2), i * 2);
assert_eq!(segment_postings.doc(), i * 2);
}
@@ -445,7 +424,7 @@ pub mod tests {
.unwrap();
for i in 0..num_docs / 2 - 1 {
assert_eq!(segment_postings.skip_next(i * 2 + 1), SkipResult::OverStep);
assert!(segment_postings.seek(i * 2 + 1) > (i * 1) * 2);
assert_eq!(segment_postings.doc(), (i + 1) * 2);
}
}
@@ -457,6 +436,7 @@ pub mod tests {
assert!(index_writer.commit().is_ok());
}
let searcher = index.reader().unwrap().searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_reader(0);
// make sure seeking still works
@@ -467,11 +447,11 @@ pub mod tests {
.unwrap();
if i % 2 == 0 {
assert_eq!(segment_postings.skip_next(i), SkipResult::Reached);
assert_eq!(segment_postings.seek(i), i);
assert_eq!(segment_postings.doc(), i);
assert!(segment_reader.is_deleted(i));
} else {
assert_eq!(segment_postings.skip_next(i), SkipResult::Reached);
assert_eq!(segment_postings.seek(i), i);
assert_eq!(segment_postings.doc(), i);
}
}
@@ -486,12 +466,16 @@ pub mod tests {
let mut last = 2; // start from 5 to avoid seeking to 3 twice
let mut cur = 3;
loop {
match segment_postings.skip_next(cur) {
SkipResult::End => break,
SkipResult::Reached => assert_eq!(segment_postings.doc(), cur),
SkipResult::OverStep => assert_eq!(segment_postings.doc(), cur + 1),
let seek = segment_postings.seek(cur);
if seek == TERMINATED {
break;
}
assert_eq!(seek, segment_postings.doc());
if seek == cur {
assert_eq!(segment_postings.doc(), cur);
} else {
assert_eq!(segment_postings.doc(), cur + 1);
}
let next = cur + last;
last = cur;
cur = next;
@@ -577,7 +561,7 @@ pub mod tests {
}
impl<TDocSet: DocSet> DocSet for UnoptimizedDocSet<TDocSet> {
fn advance(&mut self) -> bool {
fn advance(&mut self) -> DocId {
self.0.advance()
}
@@ -603,30 +587,22 @@ pub mod tests {
for target in targets {
let mut postings_opt = postings_factory();
let mut postings_unopt = UnoptimizedDocSet::wrap(postings_factory());
let skip_result_opt = postings_opt.skip_next(target);
let skip_result_unopt = postings_unopt.skip_next(target);
let skip_result_opt = postings_opt.seek(target);
let skip_result_unopt = postings_unopt.seek(target);
assert_eq!(
skip_result_unopt, skip_result_opt,
"Failed while skipping to {}",
target
);
match skip_result_opt {
SkipResult::Reached => assert_eq!(postings_opt.doc(), target),
SkipResult::OverStep => assert!(postings_opt.doc() > target),
SkipResult::End => {
return;
}
assert!(skip_result_opt >= target);
assert_eq!(skip_result_opt, postings_opt.doc());
if skip_result_opt == TERMINATED {
return;
}
while postings_opt.advance() {
assert!(postings_unopt.advance());
assert_eq!(
postings_opt.doc(),
postings_unopt.doc(),
"Failed while skipping to {}",
target
);
while postings_opt.doc() != TERMINATED {
assert_eq!(postings_opt.doc(), postings_unopt.doc());
assert_eq!(postings_opt.advance(), postings_unopt.advance());
}
assert!(!postings_unopt.advance());
}
}
}
@@ -635,7 +611,7 @@ pub mod tests {
mod bench {
use super::tests::*;
use crate::docset::SkipResult;
use crate::docset::TERMINATED;
use crate::query::Intersection;
use crate::schema::IndexRecordOption;
use crate::tests;
@@ -653,7 +629,7 @@ mod bench {
.inverted_index(TERM_A.field())
.read_postings(&*TERM_A, IndexRecordOption::Basic)
.unwrap();
while segment_postings.advance() {}
while segment_postings.advance() != TERMINATED {}
});
}
@@ -685,7 +661,7 @@ mod bench {
segment_postings_c,
segment_postings_d,
]);
while intersection.advance() {}
while intersection.advance() != TERMINATED {}
});
}
@@ -701,11 +677,10 @@ mod bench {
.unwrap();
let mut existing_docs = Vec::new();
segment_postings.advance();
for doc in &docs {
if *doc >= segment_postings.doc() {
existing_docs.push(*doc);
if segment_postings.skip_next(*doc) == SkipResult::End {
if segment_postings.seek(*doc) == TERMINATED {
break;
}
}
@@ -717,7 +692,7 @@ mod bench {
.read_postings(&*TERM_A, IndexRecordOption::Basic)
.unwrap();
for doc in &existing_docs {
if segment_postings.skip_next(*doc) == SkipResult::End {
if segment_postings.seek(*doc) == TERMINATED {
break;
}
}
@@ -756,8 +731,9 @@ mod bench {
.read_postings(&*TERM_A, IndexRecordOption::Basic)
.unwrap();
let mut s = 0u32;
while segment_postings.advance() {
while segment_postings.doc() != TERMINATED {
s += (segment_postings.doc() & n) % 1024;
segment_postings.advance()
}
s
});

View File

@@ -1,16 +1,19 @@
use crate::common::BitSet;
use crate::common::HasLen;
use crate::docset::{DocSet, SkipResult};
use crate::common::{BinarySerializable, VInt};
use crate::docset::{DocSet, TERMINATED};
use crate::positions::PositionReader;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::postings::compression::{compressed_block_size, AlignedBuffer};
use crate::postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
use crate::postings::serializer::PostingsSerializer;
use crate::postings::{BlockSearcher, BlockSegmentPostings};
use crate::postings::BlockSearcher;
use crate::postings::FreqReadingOption;
use crate::postings::Postings;
use crate::postings::SkipReader;
use crate::postings::USE_SKIP_INFO_LIMIT;
use crate::schema::IndexRecordOption;
use crate::DocId;
use owned_read::OwnedRead;
use std::cmp::Ordering;
use crate::postings::block_segment_postings::BlockSegmentPostingsSkipResult;
use tantivy_fst::Streamer;
struct PositionComputer {
// store the amount of position int
@@ -63,12 +66,14 @@ impl SegmentPostings {
/// Returns an empty segment postings object
pub fn empty() -> Self {
let empty_block_cursor = BlockSegmentPostings::empty();
SegmentPostings {
let mut segment_postings = SegmentPostings {
block_cursor: empty_block_cursor,
cur: COMPRESSION_BLOCK_SIZE,
position_computer: None,
block_searcher: BlockSearcher::default(),
}
};
segment_postings.advance();
segment_postings
}
/// Creates a segment postings object with the given documents
@@ -111,12 +116,14 @@ impl SegmentPostings {
segment_block_postings: BlockSegmentPostings,
positions_stream_opt: Option<PositionReader>,
) -> SegmentPostings {
SegmentPostings {
let mut postings = SegmentPostings {
block_cursor: segment_block_postings,
cur: COMPRESSION_BLOCK_SIZE, // cursor within the block
position_computer: positions_stream_opt.map(PositionComputer::new),
block_searcher: BlockSearcher::default(),
}
};
postings.advance();
postings
}
}
@@ -124,7 +131,7 @@ impl DocSet for SegmentPostings {
// goes to the next element.
// next needs to be called a first time to point to the correct element.
#[inline]
fn advance(&mut self) -> bool {
fn advance(&mut self) -> DocId {
if self.position_computer.is_some() && self.cur < COMPRESSION_BLOCK_SIZE {
let term_freq = self.term_freq() as usize;
if let Some(position_computer) = self.position_computer.as_mut() {
@@ -133,29 +140,19 @@ impl DocSet for SegmentPostings {
}
self.cur += 1;
if self.cur >= self.block_cursor.block_len() {
self.cur = 0;
if !self.block_cursor.advance() {
self.cur = COMPRESSION_BLOCK_SIZE;
return false;
if self.block_cursor.advance() {
self.cur = 0;
} else {
self.cur = COMPRESSION_BLOCK_SIZE - 1;
return TERMINATED;
}
}
true
self.doc()
}
fn skip_next(&mut self, target: DocId) -> SkipResult {
if !self.advance() {
return SkipResult::End;
}
match self.doc().cmp(&target) {
Ordering::Equal => {
return SkipResult::Reached;
}
Ordering::Greater => {
return SkipResult::OverStep;
}
_ => {
// ...
}
fn seek(&mut self, target: DocId) -> DocId {
if self.doc() >= target {
return self.doc();
}
// In the following, thanks to the call to advance above,
@@ -165,44 +162,44 @@ impl DocSet for SegmentPostings {
// skip blocks until one that might contain the target
// check if we need to go to the next block
let mut sum_freqs_skipped: u32 = 0;
if !self
if self
.block_cursor
.docs()
.last()
.map(|doc| *doc >= target)
.unwrap_or(false)
// there should always be at least a document in the block
// since advance returned.
.map(|&doc| doc < target)
.unwrap_or(true)
{
// we are not in the right block.
//
// First compute all of the freqs skipped from the current block.
// We are not in the right block.
if self.position_computer.is_some() {
sum_freqs_skipped = self.block_cursor.freqs()[self.cur..].iter().sum();
// First compute all of the freqs skipped from the current block.
sum_freqs_skipped = self.block_cursor.freqs()[self.cur..].iter().sum::<u32>();
match self.block_cursor.skip_to(target) {
BlockSegmentPostingsSkipResult::Success(block_skip_freqs) => {
sum_freqs_skipped += block_skip_freqs;
}
BlockSegmentPostingsSkipResult::Terminated => {
return SkipResult::End;
self.block_cursor.doc_decoder.clear();
self.cur = 0;
return TERMINATED;
}
}
} else if self.block_cursor.skip_to(target)
== BlockSegmentPostingsSkipResult::Terminated
{
// no positions needed. no need to sum freqs.
return SkipResult::End;
self.block_cursor.doc_decoder.clear();
self.cur = 0;
return TERMINATED;
}
self.cur = 0;
}
// At this point we are on the block, that might contain our document.
let cur = self.cur;
// we're in the right block now, start with an exponential search
let (output, len) = self.block_cursor.docs_aligned();
let new_cur = self
.block_searcher
.search_in_block(&output, len, cur, target);
let output = self.block_cursor.docs_aligned();
let new_cur = self.block_searcher.search_in_block(&output, cur, target);
if let Some(position_computer) = self.position_computer.as_mut() {
sum_freqs_skipped += self.block_cursor.freqs()[cur..new_cur].iter().sum::<u32>();
position_computer.add_skip(sum_freqs_skipped as usize);
@@ -212,11 +209,7 @@ impl DocSet for SegmentPostings {
// `doc` is now the first element >= `target`
let doc = output.0[new_cur];
debug_assert!(doc >= target);
if doc == target {
SkipResult::Reached
} else {
SkipResult::OverStep
}
doc
}
/// Return the current document's `DocId`.
@@ -226,18 +219,14 @@ impl DocSet for SegmentPostings {
/// Will panics if called without having called advance before.
#[inline]
fn doc(&self) -> DocId {
let docs = self.block_cursor.docs();
debug_assert!(
self.cur < docs.len(),
"Have you forgotten to call `.advance()` at least once before calling `.doc()` ."
);
docs[self.cur]
self.block_cursor.doc(self.cur)
}
fn size_hint(&self) -> u32 {
self.len() as u32
}
/*
fn append_to_bitset(&mut self, bitset: &mut BitSet) {
// finish the current block
if self.advance() {
@@ -252,6 +241,7 @@ impl DocSet for SegmentPostings {
}
}
}
*/
}
impl HasLen for SegmentPostings {
@@ -294,6 +284,314 @@ impl Postings for SegmentPostings {
}
}
/// `BlockSegmentPostings` is a cursor iterating over blocks
/// of documents.
///
/// # Warning
///
/// While it is useful for some very specific high-performance
/// use cases, you should prefer using `SegmentPostings` for most usage.
pub struct BlockSegmentPostings {
// Decoder holding the doc ids of the current block (exposed through `.docs()`).
doc_decoder: BlockDecoder,
// Decoder holding the term frequencies of the current block (exposed through `.freqs()`).
freq_decoder: BlockDecoder,
// Tells whether term frequencies are absent, skipped over, or decoded.
freq_reading_option: FreqReadingOption,
// Value returned by `.doc_freq()`.
doc_freq: usize,
// Offset handed to the sorted doc decoders when the next block gets decoded.
doc_offset: DocId,
// Number of docs in the trailing vint-encoded block; set to 0 once that block was decoded.
num_vint_docs: usize,
// Postings bytes that have not been consumed yet.
remaining_data: OwnedRead,
// Reader over the skip data; advanced once per bit-packed block.
skip_reader: SkipReader,
}
/// Splits the raw term data into its skip section and its postings section.
///
/// When `doc_freq` is below `USE_SKIP_INFO_LIMIT` the data carries no skip section and is
/// returned untouched as the postings. Otherwise the data starts with a `VInt` giving the
/// length of the skip section, immediately followed by the skip bytes and then the postings.
///
/// # Panics
///
/// Panics with "Data corrupted" if the skip length cannot be deserialized.
fn split_into_skips_and_postings(
    doc_freq: u32,
    mut data: OwnedRead,
) -> (Option<OwnedRead>, OwnedRead) {
    if doc_freq >= USE_SKIP_INFO_LIMIT {
        let skip_len = VInt::deserialize(&mut data).expect("Data corrupted").0 as usize;
        // The postings start right after the skip section...
        let mut postings = data.clone();
        postings.advance(skip_len);
        // ... while the skip section is the first `skip_len` bytes.
        let mut skips = data;
        skips.clip(skip_len);
        (Some(skips), postings)
    } else {
        (None, data)
    }
}
/// Outcome of `BlockSegmentPostings::skip_to`.
#[derive(Debug, Eq, PartialEq)]
pub enum BlockSegmentPostingsSkipResult {
/// No remaining block has a last document greater than or equal to the target.
Terminated,
/// A block whose last document is greater than or equal to the target was loaded.
/// The payload is the sum of the term frequencies of the blocks skipped on the way.
Success(u32), //< number of term freqs to skip
}
impl BlockSegmentPostings {
/// Builds a block cursor over the postings of a term.
///
/// * `doc_freq` - number of documents in the posting list.
/// * `data` - raw term data (optional skip section followed by the postings).
/// * `record_option` - what was recorded in the index for this field.
/// * `requested_option` - what the caller wants to read.
///
/// No block is decoded here; `advance` or `skip_to` has to be called to load one.
pub(crate) fn from_data(
    doc_freq: u32,
    data: OwnedRead,
    record_option: IndexRecordOption,
    requested_option: IndexRecordOption,
) -> BlockSegmentPostings {
    // Frequencies are only decoded when they were both recorded and requested.
    // If they were recorded but not requested, they have to be skipped over.
    let freq_reading_option = match record_option {
        IndexRecordOption::Basic => FreqReadingOption::NoFreq,
        _ => match requested_option {
            IndexRecordOption::Basic => FreqReadingOption::SkipFreq,
            _ => FreqReadingOption::ReadFreq,
        },
    };
    let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, data);
    // Without a skip section, the skip reader is fed an empty buffer.
    let skip_data = skip_data_opt.unwrap_or_else(|| OwnedRead::new(&[][..]));
    let skip_reader = SkipReader::new(skip_data, record_option);
    let doc_freq = doc_freq as usize;
    BlockSegmentPostings {
        doc_decoder: BlockDecoder::new(),
        freq_decoder: BlockDecoder::with_val(1),
        freq_reading_option,
        doc_freq,
        doc_offset: 0,
        // Docs that do not fill a complete block are vint-encoded at the end.
        num_vint_docs: doc_freq % COMPRESSION_BLOCK_SIZE,
        remaining_data: postings_data,
        skip_reader,
    }
}
/// Repositions this block cursor on the postings of another term.
///
/// This makes it possible to walk through a list of terms and consume
/// their posting lists while reusing a single `BlockSegmentPostings`
/// instead of allocating a new one for every term.
///
/// # Warning
///
/// This does not reset the positions list.
pub(crate) fn reset(&mut self, doc_freq: u32, postings_data: OwnedRead) {
    let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, postings_data);
    // Without a skip section, the skip reader is reset on an empty buffer.
    let skip_data = skip_data_opt.unwrap_or_else(|| OwnedRead::new(&[][..]));
    self.skip_reader.reset(skip_data);
    self.remaining_data = postings_data;
    self.doc_freq = doc_freq as usize;
    self.num_vint_docs = self.doc_freq & (COMPRESSION_BLOCK_SIZE - 1);
    self.doc_offset = 0;
}
/// Returns the document frequency associated to this block postings.
///
/// This `doc_freq` is simply the sum of the lengths of all of the blocks,
/// and it does not take into account deleted documents.
pub fn doc_freq(&self) -> usize {
self.doc_freq
}
/// Returns the array of docs in the current block.
///
/// Before the first call to `.advance()`, the block
/// returned by `.docs()` is empty.
///
/// The slice is the output array of the doc decoder.
#[inline]
pub fn docs(&self) -> &[DocId] {
self.doc_decoder.output_array()
}
/// Returns the doc decoder's aligned output buffer, i.e. the docs of the
/// current block as an `AlignedBuffer` rather than as a slice.
pub(crate) fn docs_aligned(&self) -> &AlignedBuffer {
self.doc_decoder.output_aligned()
}
/// Returns the document at index `idx` of the current block,
/// as stored in the doc decoder's output.
#[inline]
pub fn doc(&self, idx: usize) -> u32 {
self.doc_decoder.output(idx)
}
/// Returns the array of `term freq` in the current block,
/// i.e. the output array of the frequency decoder.
#[inline]
pub fn freqs(&self) -> &[u32] {
self.freq_decoder.output_array()
}
/// Returns the term frequency at index `idx` of the current block,
/// as stored in the frequency decoder's output.
#[inline]
pub fn freq(&self, idx: usize) -> u32 {
self.freq_decoder.output(idx)
}
/// Returns the length of the current block, as reported by the doc decoder.
///
/// All blocks have a length of `COMPRESSION_BLOCK_SIZE`,
/// except the last block that may have a length
/// of any number between 1 and `COMPRESSION_BLOCK_SIZE - 1`
#[inline]
fn block_len(&self) -> usize {
self.doc_decoder.output_len
}
/// Positions the cursor on a block that may contain `target_doc`.
/// Always advances the current block.
///
/// Returns `BlockSegmentPostingsSkipResult::Success` if a block that has an element greater
/// or equal to the target is found. The payload is the sum of the term frequencies
/// (as reported by the skip reader) of the blocks that were skipped on the way.
/// `Success` does not guarantee that the smallest element of the block is smaller
/// than the target. It only guarantees that the last element is greater or equal.
///
/// Returns `BlockSegmentPostingsSkipResult::Terminated` iff all of the remaining documents
/// are smaller than `target_doc`. In that case, all of these documents are consumed.
///
pub fn skip_to(&mut self, target_doc: DocId) -> BlockSegmentPostingsSkipResult {
// Accumulates the term frequencies of the blocks skipped without being decoded.
let mut skip_freqs = 0u32;
while self.skip_reader.advance() {
if self.skip_reader.doc() >= target_doc {
// the last document of the current block is larger
// than the target.
//
// We found our block!
let num_bits = self.skip_reader.doc_num_bits();
let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted(
self.remaining_data.as_ref(),
self.doc_offset,
num_bits,
);
self.remaining_data.advance(num_consumed_bytes);
let tf_num_bits = self.skip_reader.tf_num_bits();
match self.freq_reading_option {
FreqReadingOption::NoFreq => {}
FreqReadingOption::SkipFreq => {
// Frequencies are stored but not wanted: jump over their bytes.
let num_bytes_to_skip = compressed_block_size(tf_num_bits);
self.remaining_data.advance(num_bytes_to_skip);
}
FreqReadingOption::ReadFreq => {
let num_consumed_bytes = self
.freq_decoder
.uncompress_block_unsorted(self.remaining_data.as_ref(), tf_num_bits);
self.remaining_data.advance(num_consumed_bytes);
}
}
// it will be used as the offset of the next block.
self.doc_offset = self.skip_reader.doc();
return BlockSegmentPostingsSkipResult::Success(skip_freqs);
} else {
// The whole block is below the target: jump over its bytes without decoding it.
skip_freqs += self.skip_reader.tf_sum();
let advance_len = self.skip_reader.total_block_len();
self.doc_offset = self.skip_reader.doc();
self.remaining_data.advance(advance_len);
}
}
// The skip reader is exhausted: only the vint-encoded block, if any, is left.
self.doc_decoder.clear();
if self.num_vint_docs == 0 {
return BlockSegmentPostingsSkipResult::Terminated;
}
// we are now on the last, incomplete, variable encoded block.
let num_compressed_bytes = self.doc_decoder.uncompress_vint_sorted(
self.remaining_data.as_ref(),
self.doc_offset,
self.num_vint_docs,
);
self.remaining_data.advance(num_compressed_bytes);
match self.freq_reading_option {
FreqReadingOption::NoFreq | FreqReadingOption::SkipFreq => {}
FreqReadingOption::ReadFreq => {
self.freq_decoder
.uncompress_vint_unsorted(self.remaining_data.as_ref(), self.num_vint_docs);
}
}
// Marks the vint block as consumed so that it is not decoded a second time.
self.num_vint_docs = 0;
self.docs()
.last()
.map(|last_doc| {
if *last_doc >= target_doc {
BlockSegmentPostingsSkipResult::Success(skip_freqs)
} else {
BlockSegmentPostingsSkipResult::Terminated
}
})
.unwrap_or(BlockSegmentPostingsSkipResult::Terminated)
}
/// Advances to the next block.
///
/// Bit-packed blocks are decoded for as long as the skip reader advances. Once it is
/// exhausted, the trailing vint-encoded block (if `num_vint_docs` is non-zero) is
/// decoded exactly once.
///
/// Returns false iff there were no remaining blocks.
pub fn advance(&mut self) -> bool {
if self.skip_reader.advance() {
let num_bits = self.skip_reader.doc_num_bits();
let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted(
self.remaining_data.as_ref(),
self.doc_offset,
num_bits,
);
self.remaining_data.advance(num_consumed_bytes);
let tf_num_bits = self.skip_reader.tf_num_bits();
match self.freq_reading_option {
FreqReadingOption::NoFreq => {}
FreqReadingOption::SkipFreq => {
// Frequencies are stored but not wanted: jump over their bytes.
let num_bytes_to_skip = compressed_block_size(tf_num_bits);
self.remaining_data.advance(num_bytes_to_skip);
}
FreqReadingOption::ReadFreq => {
let num_consumed_bytes = self
.freq_decoder
.uncompress_block_unsorted(self.remaining_data.as_ref(), tf_num_bits);
self.remaining_data.advance(num_consumed_bytes);
}
}
// it will be used as the next offset.
self.doc_offset = self.doc_decoder.output(COMPRESSION_BLOCK_SIZE - 1);
return true;
}
// The skip reader is exhausted: only the vint-encoded block, if any, is left.
self.doc_decoder.clear();
if self.num_vint_docs == 0 {
return false;
}
let num_compressed_bytes = self.doc_decoder.uncompress_vint_sorted(
self.remaining_data.as_ref(),
self.doc_offset,
self.num_vint_docs,
);
self.remaining_data.advance(num_compressed_bytes);
match self.freq_reading_option {
FreqReadingOption::NoFreq | FreqReadingOption::SkipFreq => {}
FreqReadingOption::ReadFreq => {
self.freq_decoder
.uncompress_vint_unsorted(self.remaining_data.as_ref(), self.num_vint_docs);
}
}
// Marks the vint block as consumed so that the next call returns false.
self.num_vint_docs = 0;
true
}
/// Returns an empty block segment postings object: its `doc_freq` is 0,
/// it has no vint-encoded docs, and both its postings data and its skip data are empty.
pub fn empty() -> BlockSegmentPostings {
BlockSegmentPostings {
num_vint_docs: 0,
doc_decoder: BlockDecoder::new(),
freq_decoder: BlockDecoder::with_val(1),
freq_reading_option: FreqReadingOption::NoFreq,
doc_offset: 0,
doc_freq: 0,
remaining_data: OwnedRead::new(vec![]),
skip_reader: SkipReader::new(OwnedRead::new(vec![]), IndexRecordOption::Basic),
}
}
}
impl<'b> Streamer<'b> for BlockSegmentPostings {
    type Item = &'b [DocId];

    /// Loads the next block and yields its doc ids.
    ///
    /// Yields `None` as soon as `advance` reports that no block is left.
    fn next(&'b mut self) -> Option<&'b [DocId]> {
        if !self.advance() {
            return None;
        }
        Some(self.docs())
    }
}
#[cfg(test)]
mod tests {
use super::BlockSegmentPostings;
@@ -301,34 +599,34 @@ mod tests {
use super::SegmentPostings;
use crate::common::HasLen;
use crate::core::Index;
use crate::docset::DocSet;
use crate::docset::{DocSet, TERMINATED};
use crate::postings::postings::Postings;
use crate::schema::IndexRecordOption;
use crate::schema::Schema;
use crate::schema::Term;
use crate::schema::INDEXED;
use crate::DocId;
use crate::SkipResult;
use tantivy_fst::Streamer;
#[test]
fn test_empty_segment_postings() {
let mut postings = SegmentPostings::empty();
assert!(!postings.advance());
assert!(!postings.advance());
assert_eq!(postings.advance(), TERMINATED);
assert_eq!(postings.advance(), TERMINATED);
assert_eq!(postings.len(), 0);
}
#[test]
#[should_panic(expected = "Have you forgotten to call `.advance()`")]
fn test_panic_if_doc_called_before_advance() {
SegmentPostings::empty().doc();
fn test_empty_postings_doc_returns_terminated() {
let mut postings = SegmentPostings::empty();
assert_eq!(postings.doc(), TERMINATED);
assert_eq!(postings.advance(), TERMINATED);
}
#[test]
#[should_panic(expected = "Have you forgotten to call `.advance()`")]
fn test_panic_if_freq_called_before_advance() {
SegmentPostings::empty().term_freq();
fn test_empty_postings_doc_term_freq_returns_0() {
let postings = SegmentPostings::empty();
assert_eq!(postings.term_freq(), 1);
}
#[test]
@@ -362,25 +660,27 @@ mod tests {
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.skip_next(128), SkipResult::OverStep);
assert_eq!(docset.seek(128), 129);
assert_eq!(docset.doc(), 129);
assert!(docset.advance());
assert_eq!(docset.advance(), 130);
assert_eq!(docset.doc(), 130);
assert!(!docset.advance());
assert_eq!(docset.advance(), TERMINATED);
}
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.skip_next(129), SkipResult::Reached);
assert_eq!(docset.seek(129), 129);
assert_eq!(docset.doc(), 129);
assert!(docset.advance());
assert_eq!(docset.advance(), 130);
assert_eq!(docset.doc(), 130);
assert!(!docset.advance());
assert_eq!(docset.advance(), TERMINATED);
}
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.skip_next(131), SkipResult::End);
assert_eq!(docset.doc(), 0);
assert_eq!(docset.seek(131), TERMINATED);
assert_eq!(docset.doc(), TERMINATED);
}
}

View File

@@ -1,6 +1,6 @@
use crate::core::Searcher;
use crate::core::SegmentReader;
use crate::docset::DocSet;
use crate::docset::{DocSet, TERMINATED};
use crate::query::boost_query::BoostScorer;
use crate::query::explanation::does_not_match;
use crate::query::{Explanation, Query, Scorer, Weight};
@@ -25,7 +25,6 @@ pub struct AllWeight;
impl Weight for AllWeight {
fn scorer(&self, reader: &SegmentReader, boost: f32) -> crate::Result<Box<dyn Scorer>> {
let all_scorer = AllScorer {
state: State::NotStarted,
doc: 0u32,
max_doc: reader.max_doc(),
};
@@ -40,39 +39,20 @@ impl Weight for AllWeight {
}
}
enum State {
NotStarted,
Started,
Finished,
}
/// Scorer associated to the `AllQuery` query.
pub struct AllScorer {
state: State,
doc: DocId,
max_doc: DocId,
}
impl DocSet for AllScorer {
fn advance(&mut self) -> bool {
match self.state {
State::NotStarted => {
self.state = State::Started;
self.doc = 0;
}
State::Started => {
self.doc += 1u32;
}
State::Finished => {
return false;
}
}
if self.doc < self.max_doc {
true
} else {
self.state = State::Finished;
false
fn advance(&mut self) -> DocId {
if self.doc + 1 >= self.max_doc {
self.doc = TERMINATED;
return TERMINATED;
}
self.doc += 1;
self.doc
}
fn doc(&self) -> DocId {
@@ -93,6 +73,7 @@ impl Scorer for AllScorer {
#[cfg(test)]
mod tests {
use super::AllQuery;
use crate::docset::TERMINATED;
use crate::query::Query;
use crate::schema::{Schema, TEXT};
use crate::Index;
@@ -120,18 +101,16 @@ mod tests {
{
let reader = searcher.segment_reader(0);
let mut scorer = weight.scorer(reader, 1.0f32).unwrap();
assert!(scorer.advance());
assert_eq!(scorer.doc(), 0u32);
assert!(scorer.advance());
assert_eq!(scorer.advance(), 1u32);
assert_eq!(scorer.doc(), 1u32);
assert!(!scorer.advance());
assert_eq!(scorer.advance(), TERMINATED);
}
{
let reader = searcher.segment_reader(1);
let mut scorer = weight.scorer(reader, 1.0f32).unwrap();
assert!(scorer.advance());
assert_eq!(scorer.doc(), 0u32);
assert!(!scorer.advance());
assert_eq!(scorer.advance(), TERMINATED);
}
}
@@ -144,13 +123,11 @@ mod tests {
let reader = searcher.segment_reader(0);
{
let mut scorer = weight.scorer(reader, 2.0f32).unwrap();
assert!(scorer.advance());
assert_eq!(scorer.doc(), 0u32);
assert_eq!(scorer.score(), 2.0f32);
}
{
let mut scorer = weight.scorer(reader, 1.5f32).unwrap();
assert!(scorer.advance());
assert_eq!(scorer.doc(), 0u32);
assert_eq!(scorer.score(), 1.5f32);
}

View File

@@ -6,8 +6,8 @@ use crate::query::{Scorer, Weight};
use crate::schema::{Field, IndexRecordOption};
use crate::termdict::{TermDictionary, TermStreamer};
use crate::DocId;
use crate::Result;
use crate::TantivyError;
use crate::{Result, SkipResult};
use std::sync::Arc;
use tantivy_fst::Automaton;
@@ -64,7 +64,7 @@ where
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0f32)?;
if scorer.skip_next(doc) == SkipResult::Reached {
if scorer.seek(doc) == doc {
Ok(Explanation::new("AutomatonScorer", 1.0f32))
} else {
Err(TantivyError::InvalidArgument(
@@ -77,6 +77,7 @@ where
#[cfg(test)]
mod tests {
use super::AutomatonWeight;
use crate::docset::TERMINATED;
use crate::query::Weight;
use crate::schema::{Schema, STRING};
use crate::Index;
@@ -141,13 +142,12 @@ mod tests {
let mut scorer = automaton_weight
.scorer(searcher.segment_reader(0u32), 1.0f32)
.unwrap();
assert!(scorer.advance());
assert_eq!(scorer.doc(), 0u32);
assert_eq!(scorer.score(), 1.0f32);
assert!(scorer.advance());
assert_eq!(scorer.advance(), 2u32);
assert_eq!(scorer.doc(), 2u32);
assert_eq!(scorer.score(), 1.0f32);
assert!(!scorer.advance());
assert_eq!(scorer.advance(), TERMINATED);
}
#[test]
@@ -160,7 +160,6 @@ mod tests {
let mut scorer = automaton_weight
.scorer(searcher.segment_reader(0u32), 1.32f32)
.unwrap();
assert!(scorer.advance());
assert_eq!(scorer.doc(), 0u32);
assert_eq!(scorer.score(), 1.32f32);
}

View File

@@ -1,7 +1,6 @@
use crate::common::{BitSet, TinySet};
use crate::docset::{DocSet, SkipResult};
use crate::docset::{DocSet, TERMINATED};
use crate::DocId;
use std::cmp::Ordering;
/// A `BitSetDocSet` makes it possible to iterate through a bitset as if it was a `DocSet`.
///
@@ -33,75 +32,50 @@ impl From<BitSet> for BitSetDocSet {
} else {
docs.tinyset(0)
};
BitSetDocSet {
let mut docset = BitSetDocSet {
docs,
cursor_bucket: 0,
cursor_tinybitset: first_tiny_bitset,
doc: 0u32,
}
};
docset.advance();
docset
}
}
impl DocSet for BitSetDocSet {
fn advance(&mut self) -> bool {
fn advance(&mut self) -> DocId {
if let Some(lower) = self.cursor_tinybitset.pop_lowest() {
self.doc = (self.cursor_bucket as u32 * 64u32) | lower;
return true;
return self.doc;
}
if let Some(cursor_bucket) = self.docs.first_non_empty_bucket(self.cursor_bucket + 1) {
self.go_to_bucket(cursor_bucket);
let lower = self.cursor_tinybitset.pop_lowest().unwrap();
self.doc = (cursor_bucket * 64u32) | lower;
true
self.doc
} else {
false
self.doc = TERMINATED;
TERMINATED
}
}
fn skip_next(&mut self, target: DocId) -> SkipResult {
// skip is required to advance.
if !self.advance() {
return SkipResult::End;
}
fn seek(&mut self, target: DocId) -> DocId {
let target_bucket = target / 64u32;
// Mask for all of the bits greater or equal
// to our target document.
match target_bucket.cmp(&self.cursor_bucket) {
Ordering::Greater => {
self.go_to_bucket(target_bucket);
let greater_filter: TinySet = TinySet::range_greater_or_equal(target);
self.cursor_tinybitset = self.cursor_tinybitset.intersect(greater_filter);
if !self.advance() {
SkipResult::End
} else if self.doc() == target {
SkipResult::Reached
} else {
debug_assert!(self.doc() > target);
SkipResult::OverStep
}
}
Ordering::Equal => loop {
match self.doc().cmp(&target) {
Ordering::Less => {
if !self.advance() {
return SkipResult::End;
}
}
Ordering::Equal => {
return SkipResult::Reached;
}
Ordering::Greater => {
debug_assert!(self.doc() > target);
return SkipResult::OverStep;
}
}
},
Ordering::Less => {
debug_assert!(self.doc() > target);
SkipResult::OverStep
}
if target_bucket > self.cursor_bucket {
self.go_to_bucket(target_bucket);
let greater_filter: TinySet = TinySet::range_greater_or_equal(target);
self.cursor_tinybitset = self.cursor_tinybitset.intersect(greater_filter);
self.advance();
}
let mut doc = self.doc();
while doc < target {
doc = self.advance();
}
doc
}
/// Returns the current document
@@ -122,7 +96,7 @@ impl DocSet for BitSetDocSet {
mod tests {
use super::BitSetDocSet;
use crate::common::BitSet;
use crate::docset::{DocSet, SkipResult};
use crate::docset::{DocSet, TERMINATED};
use crate::DocId;
fn create_docbitset(docs: &[DocId], max_doc: DocId) -> BitSetDocSet {
@@ -133,19 +107,24 @@ mod tests {
BitSetDocSet::from(docset)
}
#[test]
fn test_empty() {
let bitset = BitSet::with_max_value(1000);
let mut empty = BitSetDocSet::from(bitset);
assert_eq!(empty.advance(), TERMINATED)
}
fn test_go_through_sequential(docs: &[DocId]) {
let mut docset = create_docbitset(docs, 1_000u32);
for &doc in docs {
assert!(docset.advance());
assert_eq!(doc, docset.doc());
docset.advance();
}
assert!(!docset.advance());
assert!(!docset.advance());
assert_eq!(docset.advance(), TERMINATED);
}
#[test]
fn test_docbitset_sequential() {
test_go_through_sequential(&[]);
test_go_through_sequential(&[1, 2, 3]);
test_go_through_sequential(&[1, 2, 3, 4, 5, 63, 64, 65]);
test_go_through_sequential(&[63, 64, 65]);
@@ -156,64 +135,64 @@ mod tests {
fn test_docbitset_skip() {
{
let mut docset = create_docbitset(&[1, 5, 6, 7, 5112], 10_000);
assert_eq!(docset.skip_next(7), SkipResult::Reached);
assert_eq!(docset.seek(7), 7);
assert_eq!(docset.doc(), 7);
assert!(docset.advance(), 7);
assert_eq!(docset.advance(), 5112);
assert_eq!(docset.doc(), 5112);
assert!(!docset.advance());
assert_eq!(docset.advance(), TERMINATED);
}
{
let mut docset = create_docbitset(&[1, 5, 6, 7, 5112], 10_000);
assert_eq!(docset.skip_next(3), SkipResult::OverStep);
assert_eq!(docset.seek(3), 5);
assert_eq!(docset.doc(), 5);
assert!(docset.advance());
assert_eq!(docset.advance(), 6);
}
{
let mut docset = create_docbitset(&[5112], 10_000);
assert_eq!(docset.skip_next(5112), SkipResult::Reached);
assert_eq!(docset.seek(5112), 5112);
assert_eq!(docset.doc(), 5112);
assert!(!docset.advance());
assert_eq!(docset.advance(), TERMINATED);
}
{
let mut docset = create_docbitset(&[5112], 10_000);
assert_eq!(docset.skip_next(5113), SkipResult::End);
assert!(!docset.advance());
assert_eq!(docset.seek(5113), TERMINATED);
assert_eq!(docset.advance(), TERMINATED);
}
{
let mut docset = create_docbitset(&[5112], 10_000);
assert_eq!(docset.skip_next(5111), SkipResult::OverStep);
assert_eq!(docset.seek(5111), 5112);
assert_eq!(docset.doc(), 5112);
assert!(!docset.advance());
assert_eq!(docset.advance(), TERMINATED);
}
{
let mut docset = create_docbitset(&[1, 5, 6, 7, 5112, 5500, 6666], 10_000);
assert_eq!(docset.skip_next(5112), SkipResult::Reached);
assert_eq!(docset.seek(5112), 5112);
assert_eq!(docset.doc(), 5112);
assert!(docset.advance());
assert_eq!(docset.advance(), 5500);
assert_eq!(docset.doc(), 5500);
assert!(docset.advance());
assert_eq!(docset.advance(), 6666);
assert_eq!(docset.doc(), 6666);
assert!(!docset.advance());
assert_eq!(docset.advance(), TERMINATED);
}
{
let mut docset = create_docbitset(&[1, 5, 6, 7, 5112, 5500, 6666], 10_000);
assert_eq!(docset.skip_next(5111), SkipResult::OverStep);
assert_eq!(docset.seek(5111), 5112);
assert_eq!(docset.doc(), 5112);
assert!(docset.advance());
assert_eq!(docset.advance(), 5500);
assert_eq!(docset.doc(), 5500);
assert!(docset.advance());
assert_eq!(docset.advance(), 6666);
assert_eq!(docset.doc(), 6666);
assert!(!docset.advance());
assert_eq!(docset.advance(), TERMINATED);
}
{
let mut docset = create_docbitset(&[1, 5, 6, 7, 5112, 5513, 6666], 10_000);
assert_eq!(docset.skip_next(5111), SkipResult::OverStep);
assert_eq!(docset.seek(5111), 5112);
assert_eq!(docset.doc(), 5112);
assert!(docset.advance());
assert_eq!(docset.advance(), 5513);
assert_eq!(docset.doc(), 5513);
assert!(docset.advance());
assert_eq!(docset.advance(), 6666);
assert_eq!(docset.doc(), 6666);
assert!(!docset.advance());
assert_eq!(docset.advance(), TERMINATED);
}
}
}
@@ -223,6 +202,7 @@ mod bench {
use super::BitSet;
use super::BitSetDocSet;
use crate::docset::TERMINATED;
use crate::test;
use crate::tests;
use crate::DocSet;
@@ -257,7 +237,7 @@ mod bench {
}
b.iter(|| {
let mut docset = BitSetDocSet::from(bitset.clone());
while docset.advance() {}
while docset.advance() != TERMINATED {}
});
}
}

View File

@@ -1,31 +0,0 @@
use crate::docset::DocSet;
use crate::DocId;
use crate::Score;
use downcast_rs::impl_downcast;
use crate::query::Scorer;
/// A set of documents matching a query within a specific segment
/// and having a maximum score within certain blocks.
///
/// See [`Query`](./trait.Query.html) and [`Scorer`](./trait.Scorer.html).
pub trait BlockMaxScorer: downcast_rs::Downcast + DocSet + Scorer + 'static {
/// Returns the maximum score within the current block.
///
/// The blocks are defined when indexing. For example, blocks can
/// have a specific number of postings each, or can be optimized for
/// retrieval speed. Read more in
/// [Faster BlockMax WAND with Variable-sized Blocks][vbmw]
///
/// This method will perform a bit of computation and is not cached.
///
/// [vbmw]: https://dl.acm.org/doi/abs/10.1145/3077136.3080780
fn block_max_score(&mut self) -> Score;
/// Returns the last document in the current block.
fn block_max_doc(&mut self) -> DocId;
/// Returns the maximum possible score within the entire document set.
fn max_score(&self) -> Score;
}
impl_downcast!(BlockMaxScorer);

View File

@@ -1,613 +0,0 @@
use crate::docset::{DocSet, SkipResult};
use crate::query::score_combiner::ScoreCombiner;
use crate::query::{BlockMaxScorer, Scorer};
use crate::DocId;
use crate::Score;
use crate::query::weight::PruningScorer;
/// Pivot information, as computed by `find_pivot_position`.
///
/// Indexes refer to positions in the list of docsets handed to `find_pivot_position`.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
struct Pivot {
// Index of the last docset of the run of consecutive docsets, starting at
// `first_occurrence`, that point to `doc`.
position: usize,
// Index of the docset at which the cumulated max score first exceeded the lower bound.
first_occurrence: usize,
// Document the docset at `first_occurrence` points to.
doc: DocId,
}
/// Locates the **pivot** within the sorted list of posting lists.
///
/// The docsets are expected to have been advanced already and to be sorted
/// by the doc they currently point to.
///
/// Max scores are cumulated list after list. The pivot is the doc of the first list
/// for which the cumulated max score exceeds `lower_bound_score`, i.e. the lowest
/// `DocId` that has a chance of matching our condition. The lists that directly follow
/// and point to the same doc are included in the returned `Pivot::position`.
///
/// Returns `None` if the cumulated max score never exceeds `lower_bound_score`.
fn find_pivot_position<'a, TScorer>(
    mut docsets: impl Iterator<Item = &'a TScorer>,
    lower_bound_score: Score,
) -> Option<Pivot>
where
    TScorer: BlockMaxScorer,
{
    let mut cumulated_max_score = Score::default();
    let mut first_occurrence = 0;
    loop {
        let candidate = docsets.next()?;
        cumulated_max_score += candidate.max_score();
        if lower_bound_score < cumulated_max_score {
            let doc = candidate.doc();
            // Extend the pivot over the following lists that point to the same doc.
            let num_followers = docsets
                .by_ref()
                .take_while(|follower| follower.doc() == doc)
                .count();
            return Some(Pivot {
                position: first_occurrence + num_followers,
                first_occurrence,
                doc,
            });
        }
        first_occurrence += 1;
    }
}
/// Computes the next document ID that can possibly be relevant, based on block boundaries.
///
/// * `docsets_up_to_pivot` - the ordered lists located before the pivot list.
/// * `pivot_docset` - the pivot list itself.
/// * `docset_after_pivot` - the list following the pivot, if there is one.
///
/// The candidate is one past the smallest `block_max_doc` among the lists up to the pivot
/// (inclusive), capped by the current doc of the list after the pivot. The result is
/// always strictly greater than the doc the pivot list points to.
///
/// NOTE(review): the `+ 1` operations are unchecked; whether a `block_max_doc` or a doc
/// equal to the maximum `DocId` can reach this function is not visible from here.
fn find_next_relevant_doc<T, TScorer>(
    docsets_up_to_pivot: &mut [T],
    pivot_docset: &mut T,
    docset_after_pivot: Option<&mut T>,
) -> DocId
where
    T: AsMut<TScorer>,
    TScorer: BlockMaxScorer + Scorer,
{
    let pivot_scorer: &mut TScorer = pivot_docset.as_mut();
    // The pivot's block boundary is queried first, then the lists before it.
    let pivot_block_end = pivot_scorer.block_max_doc();
    let smallest_block_end = docsets_up_to_pivot
        .iter_mut()
        .fold(pivot_block_end, |smallest, docset| {
            smallest.min(docset.as_mut().block_max_doc())
        });
    let after_block = 1 + smallest_block_end;
    let candidate = match docset_after_pivot {
        Some(docset) => after_block.min(docset.as_mut().doc()),
        None => after_block,
    };
    // Never return a doc that would not move past the pivot.
    let pivot_doc = pivot_scorer.doc();
    if candidate > pivot_doc {
        candidate
    } else {
        pivot_doc + 1
    }
}
/// Moves the first element of the slice to its sorted position.
///
/// `docsets[1..]` is assumed to be sorted by current doc. The head element is swapped with
/// its right neighbor, one step at a time, for as long as that neighbor points to a strictly
/// smaller doc; afterwards the whole slice is sorted.
fn sift_down<T, TScorer>(docsets: &mut [T])
where
    T: AsRef<TScorer>,
    TScorer: BlockMaxScorer + Scorer,
{
    let mut idx = 1;
    while idx < docsets.len()
        && docsets[idx].as_ref().doc() < docsets[idx - 1].as_ref().doc()
    {
        docsets.swap(idx - 1, idx);
        idx += 1;
    }
}
/// Scorer over the union of several scorers that applies [BlockMaxWand] dynamic pruning:
/// documents that cannot beat a caller-supplied score threshold are skipped without being
/// scored.
///
/// [BlockMaxWand]: https://dl.acm.org/doi/10.1145/2009916.2010048
pub struct BlockMaxWand<TScorer, TScoreCombiner> {
/// Scorers that still have documents left, kept sorted by the doc they currently point to.
docsets: Vec<Box<TScorer>>,
/// Doc of the most recent match; `0` until a match has been found.
doc: DocId,
/// Combined score of the most recent match; `0.0` until a match has been found.
score: Score,
/// Accumulates the scores of all scorers positioned on the matching doc.
combiner: TScoreCombiner,
}
impl<TScorer, TScoreCombiner> BlockMaxWand<TScorer, TScoreCombiner>
where
    TScoreCombiner: ScoreCombiner,
    TScorer: BlockMaxScorer + Scorer,
{
    /// Builds the union from scorers that have not been advanced yet.
    ///
    /// Every scorer is advanced once; the ones that turn out to be empty are dropped and the
    /// remaining ones are sorted by the doc they point to.
    fn new(
        docsets: Vec<TScorer>,
        combiner: TScoreCombiner,
    ) -> BlockMaxWand<TScorer, TScoreCombiner> {
        let mut started: Vec<Box<TScorer>> = Vec::new();
        for mut docset in docsets {
            if docset.advance() {
                started.push(Box::new(docset));
            }
        }
        started.sort_by_key(|docset| docset.doc());
        BlockMaxWand {
            docsets: started,
            combiner,
            doc: 0u32,
            score: 0f32,
        }
    }

    /// Find the position in the sorted list of posting lists of the **pivot**.
    fn find_pivot_position(&self, lower_bound_score: Score) -> Option<Pivot> {
        let scorers = self.docsets.iter().map(|boxed| &**boxed);
        find_pivot_position(scorers, lower_bound_score)
    }

    /// Performs one step of the algorithm for the given pivot.
    ///
    /// Returns `Reached` when `self.doc` / `self.score` now describe a matching document,
    /// `OverStep` when some scorer was moved forward and a new pivot has to be computed, and
    /// `End` when no scorer is left.
    fn advance_with_pivot(&mut self, pivot: Pivot, lower_bound_score: Score) -> SkipResult {
        let block_upper_bound: Score = self.docsets[..=pivot.position]
            .iter_mut()
            .map(|docset| docset.block_max_score())
            .sum();
        if block_upper_bound > lower_bound_score {
            if pivot.doc == self.docsets[0].doc() {
                self.score_pivot(pivot)
            } else {
                self.advance_before_pivot(pivot)
            }
        } else {
            self.skip_current_blocks(pivot)
        }
    }

    /// Scores the pivot doc and moves every scorer that contributed past it.
    ///
    /// Since `self.docsets` is sorted by current doc and the first scorer is on `pivot.doc`,
    /// all scorers in `[0..=pivot.position]` are positioned on `pivot.doc`.
    ///
    /// NOTE(elshize): One additional check needs to be done to improve performance:
    /// update block-wise bound while accumulating score with the actual score,
    /// and check each time if still above threshold.
    fn score_pivot(&mut self, pivot: Pivot) -> SkipResult {
        self.combiner.clear();
        // Walk backwards so that `swap_remove` never moves an unvisited scorer of the
        // pivot range out of reach.
        for idx in (0..=pivot.position).rev() {
            self.combiner.update(self.docsets[idx].as_mut());
            let has_more = self.docsets[idx].advance();
            if !has_more {
                self.docsets.swap_remove(idx);
            }
        }
        self.score = self.combiner.score();
        self.doc = pivot.doc;
        self.docsets.sort_by_key(|docset| docset.doc());
        SkipResult::Reached
    }

    /// Advances the scorer right before the first one positioned on the pivot.
    ///
    /// The subtraction does not underflow: if `first_occurrence` were `0`, the first scorer
    /// would be on the pivot doc and `score_pivot` would have been called instead.
    fn advance_before_pivot(&mut self, pivot: Pivot) -> SkipResult {
        // Last index that is not positioned on the pivot yet.
        let lagging_idx = pivot.first_occurrence - 1;
        let has_more = self.docsets[lagging_idx].advance();
        if !has_more {
            self.docsets.swap_remove(lagging_idx);
        }
        if self.docsets.is_empty() {
            return SkipResult::End;
        }
        sift_down(&mut self.docsets[lagging_idx..]);
        SkipResult::OverStep
    }

    /// The current blocks cannot beat the threshold: skip the first scorer to the next doc
    /// that may be relevant.
    ///
    /// NOTE(elshize): It might be more efficient to advance the list with the higher
    /// max score, but let's advance the first one for now for simplicity.
    fn skip_current_blocks(&mut self, pivot: Pivot) -> SkipResult {
        let next_doc = {
            let (up_to_pivot, pivot_and_rest) = self.docsets.split_at_mut(pivot.position);
            let (pivot_docset, after_pivot) = pivot_and_rest.split_first_mut().unwrap();
            find_next_relevant_doc(up_to_pivot, pivot_docset, after_pivot.first_mut())
        };
        if self.docsets[0].skip_next(next_doc) == SkipResult::End {
            self.docsets.swap_remove(0);
        }
        if self.docsets.is_empty() {
            return SkipResult::End;
        }
        sift_down(&mut self.docsets[..]);
        SkipResult::OverStep
    }
}
impl<TScorer, TScoreCombiner> PruningScorer for BlockMaxWand<TScorer, TScoreCombiner>
where
    TScoreCombiner: ScoreCombiner,
    TScorer: Scorer + BlockMaxScorer,
{
    /// Doc of the most recent match.
    fn doc(&self) -> DocId {
        self.doc
    }

    /// Combined score of the most recent match.
    fn score(&self) -> Score {
        self.score
    }

    /// Moves to the next document whose score may exceed `lower_bound_score`.
    ///
    /// Returns `true` when such a document was reached (see `doc()` and `score()`), and
    /// `false` when no pivot can be found anymore or every scorer is exhausted.
    fn advance_with_pruning(&mut self, lower_bound_score: f32) -> bool {
        loop {
            let pivot = match self.find_pivot_position(lower_bound_score) {
                Some(pivot) => pivot,
                None => return false,
            };
            match self.advance_with_pivot(pivot, lower_bound_score) {
                SkipResult::Reached => return true,
                SkipResult::End => return false,
                // A scorer moved forward: look for a new pivot.
                SkipResult::OverStep => continue,
            }
        }
    }
}
#[cfg(test)]
mod tests {
use super::*;
use crate::common::HasLen;
use crate::docset::DocSet;
use crate::query::score_combiner::SumCombiner;
use crate::query::Union;
use crate::query::{BlockMaxScorer, Scorer};
use crate::{DocId, Score};
use float_cmp::approx_eq;
use proptest::strategy::Strategy;
use std::cmp::Ordering;
use std::num::Wrapping;
use crate::collector::{SegmentCollector, TopScoreSegmentCollector};
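/*
NOTE(review): every test below is commented out, so this module currently contributes nothing
but its `use` lines. The `VecDocSet` fixture still implements the boolean-returning
`DocSet::advance`; presumably the tests were disabled pending a port to a newer `DocSet`
API -- confirm before re-enabling or deleting them.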
#[derive(Debug, Clone)]
pub struct VecDocSet {
postings: Vec<(DocId, Score)>,
cursor: Wrapping<usize>,
block_max_scores: Vec<(DocId, Score)>,
max_score: Score,
block_size: usize,
}
impl VecDocSet {
fn new(postings: Vec<(DocId, Score)>, block_size: usize) -> VecDocSet {
let block_max_scores: Vec<(DocId, f32)> = postings
.chunks(block_size)
.into_iter()
.map(|block| {
(
block.iter().last().unwrap().0,
block
.iter()
.map(|(_, s)| *s)
.fold(-f32::INFINITY, |left, right| left.max(right))
)
})
.collect();
let max_score = block_max_scores
.iter()
.copied()
.map(|(_, s)| s)
.fold(-f32::INFINITY, |left, right| left.max(right));
VecDocSet {
postings,
cursor: Wrapping(0_usize) - Wrapping(1_usize),
block_max_scores,
max_score,
block_size,
}
}
/// Constructs a new set and advances it.
fn started(postings: Vec<(DocId, Score)>, block_size: usize) -> VecDocSet {
let mut docset = VecDocSet::new(postings, block_size);
docset.advance();
docset
}
}
impl DocSet for VecDocSet {
fn advance(&mut self) -> bool {
self.cursor += Wrapping(1);
self.postings.len() > self.cursor.0
}
fn doc(&self) -> DocId {
self.postings[self.cursor.0].0
}
fn size_hint(&self) -> u32 {
self.len() as u32
}
}
impl HasLen for VecDocSet {
fn len(&self) -> usize {
self.postings.len()
}
}
impl BlockMaxScorer for VecDocSet {
fn max_score(&self) -> Score {
self.max_score
}
fn block_max_score(&mut self) -> Score {
self.block_max_scores[self.cursor.0 / self.block_size].1
}
fn block_max_doc(&mut self) -> DocId {
self.block_max_scores[self.cursor.0 / self.block_size].0
}
}
impl Scorer for VecDocSet {
fn score(&mut self) -> Score {
self.postings[self.cursor.0].1
}
}
#[derive(Debug)]
struct ComparableDoc<T, D> {
feature: T,
doc: D,
}
impl<T: PartialOrd, D: PartialOrd> PartialOrd for ComparableDoc<T, D> {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl<T: PartialOrd, D: PartialOrd> Ord for ComparableDoc<T, D> {
#[inline]
fn cmp(&self, other: &Self) -> Ordering {
// Reversed to make BinaryHeap work as a min-heap
let by_feature = other
.feature
.partial_cmp(&self.feature)
.unwrap_or(Ordering::Equal);
let lazy_by_doc_address =
|| self.doc.partial_cmp(&other.doc).unwrap_or(Ordering::Equal);
// In case of a tie on the feature, we sort by ascending
// `DocAddress` in order to ensure a stable sorting of the
// documents.
by_feature.then_with(lazy_by_doc_address)
}
}
impl<T: PartialOrd, D: PartialOrd> PartialEq for ComparableDoc<T, D> {
fn eq(&self, other: &Self) -> bool {
self.cmp(other) == Ordering::Equal
}
}
impl<T: PartialOrd, D: PartialOrd> Eq for ComparableDoc<T, D> {}
fn union_vs_bmw(posting_lists: Vec<VecDocSet>) {
let mut union = Union::<VecDocSet, SumCombiner>::from(posting_lists.clone());
let mut top_union = TopScoreSegmentCollector::new(0, 10);
while union.advance() {
top_union.collect(union.doc(), union.score());
}
let top_bmw = TopScoreSegmentCollector::new(0, 10 );
let mut bmw = BlockMaxWand::new(posting_lists, SumCombiner::default());
let top_docs_bnw = top_bmw.collect_scorer(&mut bmw, None);
for ((expected_score, expected_doc), (actual_score, actual_doc)) in
top_union.harvest().into_iter().zip( top_docs_bnw )
{
assert!(approx_eq!(
f32,
expected_score,
actual_score,
epsilon = 0.0001
));
assert_eq!(expected_doc, actual_doc);
}
}
#[test]
fn test_bmw_0() {
union_vs_bmw(vec![
VecDocSet {
postings: vec![
(0, 1.0),
(23, 1.0),
(28, 1.0),
(56, 1.0),
(59, 1.0),
(66, 1.0),
(93, 1.0),
],
cursor: Wrapping(0_usize) - Wrapping(1_usize),
block_max_scores: vec![(93, 1.0)],
max_score: 1.0,
block_size: 16,
},
VecDocSet {
postings: vec![
(2, 1.6549665),
(43, 2.6958032),
(53, 3.5309567),
(71, 2.7688136),
(87, 3.4279852),
(96, 3.9028034),
],
cursor: Wrapping(0_usize) - Wrapping(1_usize),
block_max_scores: vec![(96, 3.9028034)],
max_score: 3.9028034,
block_size: 16,
},
])
}
#[test]
fn test_bmw_1() {
union_vs_bmw(vec![
VecDocSet {
postings: vec![(73, 1.0), (82, 1.0)],
cursor: Wrapping(0_usize) - Wrapping(1_usize),
block_max_scores: vec![(82, 1.0)],
max_score: 1.0,
block_size: 16,
},
VecDocSet {
postings: vec![
(21, 3.582513),
(23, 1.6928024),
(27, 3.887647),
(42, 1.5469292),
(61, 1.7317574),
(62, 1.2968783),
(82, 2.4040694),
(85, 3.1487892),
],
cursor: Wrapping(0_usize) - Wrapping(1_usize),
block_max_scores: vec![(85, 3.887647)],
max_score: 3.887647,
block_size: 16,
},
])
}
proptest::proptest! {
#[test]
fn test_union_vs_bmw(postings in proptest::collection::vec(
proptest::collection::vec(0_u32..100, 1..10)
.prop_flat_map(|v| {
let scores = proptest::collection::vec(1_f32..4_f32, v.len()..=v.len());
scores.prop_map(move |s| {
let mut postings: Vec<_> = v.iter().copied().zip(s.iter().copied()).collect();
postings.sort_by_key(|p| p.0);
postings.dedup_by_key(|p| p.0);
VecDocSet::new(postings, 16)
})
}),
2..5)
) {
union_vs_bmw(postings);
}
}
#[test]
fn test_find_pivot_position() {
let postings = vec![
VecDocSet::started(vec![(0, 2.0)], 1),
VecDocSet::started(vec![(1, 3.0)], 1),
VecDocSet::started(vec![(2, 4.0)], 1),
VecDocSet::started(vec![(3, 5.0)], 1),
VecDocSet::started(vec![(3, 6.0)], 1),
];
assert_eq!(
find_pivot_position(postings.iter(), 2.0f32),
Some(Pivot {
position: 1,
doc: 1,
first_occurrence: 1,
})
);
assert_eq!(
find_pivot_position(postings.iter(), 5.0f32),
Some(Pivot {
position: 2,
doc: 2,
first_occurrence: 2,
})
);
assert_eq!(
find_pivot_position(postings.iter(), 9.0f32),
Some(Pivot {
position: 4,
doc: 3,
first_occurrence: 3,
})
);
assert_eq!(
find_pivot_position(postings.iter(), 20.0f32),
None
);
}
#[test]
fn test_find_next_relevant_doc_before_pivot() {
let mut postings = vec![
Box::new(VecDocSet::started(vec![(0, 0.0), (3, 0.0)], 2)),
Box::new(VecDocSet::started(vec![(1, 0.0), (4, 0.0)], 2)),
Box::new(VecDocSet::started(vec![(2, 0.0), (6, 0.0)], 2)), // pivot
Box::new(VecDocSet::started(vec![(6, 0.0), (8, 0.0)], 2)),
Box::new(VecDocSet::started(vec![(6, 0.0), (8, 0.0)], 2)),
];
let (up_to_pivot, rest) = postings.split_at_mut(2);
let (pivot, after_pivot) = rest.split_first_mut().unwrap();
let next_doc = find_next_relevant_doc(up_to_pivot, pivot, Some(&mut after_pivot[0]));
assert_eq!(next_doc, 4);
}
#[test]
fn test_find_next_relevant_doc_prefix_smaller_than_pivot() {
let mut postings = vec![
Box::new(VecDocSet::started(vec![(0, 0.0), (3, 0.0)], 2)),
Box::new(VecDocSet::started(vec![(1, 0.0), (4, 0.0)], 2)),
Box::new(VecDocSet::started(vec![(5, 0.0), (8, 0.0)], 2)), // pivot
Box::new(VecDocSet::started(vec![(6, 0.0), (8, 0.0)], 2)),
Box::new(VecDocSet::started(vec![(6, 0.0), (8, 0.0)], 2)),
];
let (up_to_pivot, rest) = postings.split_at_mut(2);
let (pivot, after_pivot) = rest.split_first_mut().unwrap();
let next_doc = find_next_relevant_doc(up_to_pivot, pivot, Some(&mut after_pivot[0]));
assert_eq!(next_doc, 6);
}
#[test]
fn test_find_next_relevant_doc_after_pivot() {
let mut postings = vec![
Box::new(VecDocSet::started(vec![(0, 0.0), (8, 0.0)], 2)),
Box::new(VecDocSet::started(vec![(1, 0.0), (8, 0.0)], 2)),
Box::new(VecDocSet::started(vec![(2, 0.0), (8, 0.0)], 2)), // pivot
Box::new(VecDocSet::started(vec![(5, 0.0), (7, 0.0)], 2)),
Box::new(VecDocSet::started(vec![(6, 0.0), (7, 0.0)], 2)),
];
let (up_to_pivot, rest) = postings.split_at_mut(2);
let (pivot, after_pivot) = rest.split_first_mut().unwrap();
let next_doc = find_next_relevant_doc(up_to_pivot, pivot, Some(&mut after_pivot[0]));
assert_eq!(next_doc, 5);
}
#[test]
fn test_sift_down_already_sifted() {
let mut postings = vec![
Box::new(VecDocSet::started(vec![(0, 0.0), (8, 0.0)], 2)),
Box::new(VecDocSet::started(vec![(1, 0.0), (8, 0.0)], 2)),
Box::new(VecDocSet::started(vec![(2, 0.0), (8, 0.0)], 2)), // pivot
Box::new(VecDocSet::started(vec![(5, 0.0), (7, 0.0)], 2)),
Box::new(VecDocSet::started(vec![(6, 0.0), (7, 0.0)], 2)),
];
sift_down(&mut postings[2..]);
assert_eq!(
postings.into_iter().map(|p| p.doc()).collect::<Vec<_>>(),
vec![0, 1, 2, 5, 6]
);
}
#[test]
fn test_sift_down_sift_one_down() {
let mut postings = vec![
Box::new(VecDocSet::started(vec![(0, 0.0), (8, 0.0)], 2)),
Box::new(VecDocSet::started(vec![(1, 0.0), (8, 0.0)], 2)),
Box::new(VecDocSet::started(vec![(6, 0.0), (8, 0.0)], 2)), // pivot
Box::new(VecDocSet::started(vec![(5, 0.0), (7, 0.0)], 2)),
Box::new(VecDocSet::started(vec![(7, 0.0), (7, 0.0)], 2)),
];
sift_down(&mut postings[2..]);
assert_eq!(
postings.into_iter().map(|p| p.doc()).collect::<Vec<_>>(),
vec![0, 1, 5, 6, 7]
);
}
#[test]
fn test_sift_down_to_bottom() {
let mut postings = vec![
Box::new(VecDocSet::started(vec![(0, 0.0), (8, 0.0)], 2)),
Box::new(VecDocSet::started(vec![(1, 0.0), (8, 0.0)], 2)),
Box::new(VecDocSet::started(vec![(7, 0.0), (8, 0.0)], 2)), // pivot
Box::new(VecDocSet::started(vec![(5, 0.0), (7, 0.0)], 2)),
Box::new(VecDocSet::started(vec![(6, 0.0), (7, 0.0)], 2)),
];
sift_down(&mut postings[2..]);
assert_eq!(
postings.into_iter().map(|p| p.doc()).collect::<Vec<_>>(),
vec![0, 1, 5, 6, 7]
);
}
*/
}

View File

@@ -10,7 +10,7 @@ use crate::query::Scorer;
use crate::query::Union;
use crate::query::Weight;
use crate::query::{intersect_scorers, Explanation};
use crate::{DocId, SkipResult};
use crate::DocId;
use std::collections::HashMap;
fn scorer_union<TScoreCombiner>(scorers: Vec<Box<dyn Scorer>>) -> Box<dyn Scorer>
@@ -133,7 +133,7 @@ impl Weight for BooleanWeight {
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0f32)?;
if scorer.skip_next(doc) != SkipResult::Reached {
if scorer.seek(doc) != doc {
return Err(does_not_match(doc));
}
if !self.scoring_enabled {

View File

@@ -207,7 +207,6 @@ mod tests {
let mut boolean_scorer = boolean_weight
.scorer(searcher.segment_reader(0u32), 1.0f32)
.unwrap();
assert!(boolean_scorer.advance());
assert_eq!(boolean_scorer.doc(), 0u32);
assert_nearly_equals(boolean_scorer.score(), 0.84163445f32);
}
@@ -215,7 +214,6 @@ mod tests {
let mut boolean_scorer = boolean_weight
.scorer(searcher.segment_reader(0u32), 2.0f32)
.unwrap();
assert!(boolean_scorer.advance());
assert_eq!(boolean_scorer.doc(), 0u32);
assert_nearly_equals(boolean_scorer.score(), 1.6832689f32);
}

View File

@@ -1,8 +1,7 @@
use crate::common::BitSet;
use crate::fastfield::DeleteBitSet;
use crate::query::explanation::does_not_match;
use crate::query::{Explanation, Query, Scorer, Weight};
use crate::{DocId, DocSet, Searcher, SegmentReader, SkipResult, Term};
use crate::{DocId, DocSet, Searcher, SegmentReader, Term};
use std::collections::BTreeSet;
use std::fmt;
@@ -72,7 +71,7 @@ impl Weight for BoostWeight {
fn explain(&self, reader: &SegmentReader, doc: u32) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0f32)?;
if scorer.skip_next(doc) != SkipResult::Reached {
if scorer.seek(doc) != doc {
return Err(does_not_match(doc));
}
let mut explanation =
@@ -99,12 +98,12 @@ impl<S: Scorer> BoostScorer<S> {
}
impl<S: Scorer> DocSet for BoostScorer<S> {
fn advance(&mut self) -> bool {
fn advance(&mut self) -> DocId {
self.underlying.advance()
}
fn skip_next(&mut self, target: DocId) -> SkipResult {
self.underlying.skip_next(target)
fn seek(&mut self, target: DocId) -> DocId {
self.underlying.seek(target)
}
fn fill_buffer(&mut self, buffer: &mut [DocId]) -> usize {
@@ -119,10 +118,6 @@ impl<S: Scorer> DocSet for BoostScorer<S> {
self.underlying.size_hint()
}
fn append_to_bitset(&mut self, bitset: &mut BitSet) {
self.underlying.append_to_bitset(bitset)
}
fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
self.underlying.count(delete_bitset)
}

View File

@@ -1,4 +1,5 @@
use super::Scorer;
use crate::docset::TERMINATED;
use crate::query::explanation::does_not_match;
use crate::query::Weight;
use crate::query::{Explanation, Query};
@@ -48,15 +49,12 @@ impl Weight for EmptyWeight {
pub struct EmptyScorer;
impl DocSet for EmptyScorer {
fn advance(&mut self) -> bool {
false
fn advance(&mut self) -> DocId {
TERMINATED
}
fn doc(&self) -> DocId {
panic!(
"You may not call .doc() on a scorer \
where the last call to advance() did not return true."
);
TERMINATED
}
fn size_hint(&self) -> u32 {
@@ -72,18 +70,15 @@ impl Scorer for EmptyScorer {
#[cfg(test)]
mod tests {
use crate::docset::TERMINATED;
use crate::query::EmptyScorer;
use crate::DocSet;
#[test]
fn test_empty_scorer() {
let mut empty_scorer = EmptyScorer;
assert!(!empty_scorer.advance());
}
#[test]
#[should_panic]
fn test_empty_scorer_panic_on_doc_call() {
EmptyScorer.doc();
assert_eq!(empty_scorer.doc(), TERMINATED);
assert_eq!(empty_scorer.advance(), TERMINATED);
assert_eq!(empty_scorer.doc(), TERMINATED);
}
}

View File

@@ -1,41 +1,37 @@
use crate::docset::{DocSet, SkipResult};
use crate::docset::{DocSet, TERMINATED};
use crate::query::Scorer;
use crate::DocId;
use crate::Score;
#[derive(Clone, Copy, Debug)]
enum State {
ExcludeOne(DocId),
Finished,
}
/// Filters a given `DocSet` by removing the docs from a given `DocSet`.
///
/// The excluding docset has no impact on scoring.
pub struct Exclude<TDocSet, TDocSetExclude> {
underlying_docset: TDocSet,
excluding_docset: TDocSetExclude,
excluding_state: State,
}
impl<TDocSet, TDocSetExclude> Exclude<TDocSet, TDocSetExclude>
where
TDocSet: DocSet,
TDocSetExclude: DocSet,
{
/// Creates a new `ExcludeScorer`
pub fn new(
underlying_docset: TDocSet,
mut underlying_docset: TDocSet,
mut excluding_docset: TDocSetExclude,
) -> Exclude<TDocSet, TDocSetExclude> {
let state = if excluding_docset.advance() {
State::ExcludeOne(excluding_docset.doc())
} else {
State::Finished
};
while underlying_docset.doc() != TERMINATED {
let target = underlying_docset.doc();
if excluding_docset.seek(target) != target {
// this document is not excluded.
break;
}
underlying_docset.advance();
}
Exclude {
underlying_docset,
excluding_docset,
excluding_state: state,
}
}
}
@@ -51,28 +47,7 @@ where
/// increasing `doc`.
fn accept(&mut self) -> bool {
let doc = self.underlying_docset.doc();
match self.excluding_state {
State::ExcludeOne(excluded_doc) => {
if doc == excluded_doc {
return false;
}
if excluded_doc > doc {
return true;
}
match self.excluding_docset.skip_next(doc) {
SkipResult::OverStep => {
self.excluding_state = State::ExcludeOne(self.excluding_docset.doc());
true
}
SkipResult::End => {
self.excluding_state = State::Finished;
true
}
SkipResult::Reached => false,
}
}
State::Finished => true,
}
self.excluding_docset.seek(doc) != doc
}
}
@@ -81,27 +56,24 @@ where
TDocSet: DocSet,
TDocSetExclude: DocSet,
{
fn advance(&mut self) -> bool {
while self.underlying_docset.advance() {
fn advance(&mut self) -> DocId {
while self.underlying_docset.advance() != TERMINATED {
if self.accept() {
return true;
return self.doc();
}
}
false
TERMINATED
}
fn skip_next(&mut self, target: DocId) -> SkipResult {
let underlying_skip_result = self.underlying_docset.skip_next(target);
if underlying_skip_result == SkipResult::End {
return SkipResult::End;
fn seek(&mut self, target: DocId) -> DocId {
let underlying_seek_result = self.underlying_docset.seek(target);
if underlying_seek_result == TERMINATED {
return TERMINATED;
}
if self.accept() {
underlying_skip_result
} else if self.advance() {
SkipResult::OverStep
} else {
SkipResult::End
return underlying_seek_result;
}
self.advance()
}
fn doc(&self) -> DocId {
@@ -141,8 +113,9 @@ mod tests {
VecDocSet::from(vec![1, 2, 3, 10, 16, 24]),
);
let mut els = vec![];
while exclude_scorer.advance() {
while exclude_scorer.doc() != TERMINATED {
els.push(exclude_scorer.doc());
exclude_scorer.advance();
}
assert_eq!(els, vec![5, 8, 15]);
}

View File

@@ -117,7 +117,7 @@ impl FuzzyTermQuery {
}
}
/// Creates a new Fuzzy Query that treats transpositions as cost one rather than two
/// Creates a new Fuzzy Query of the Term prefix
pub fn new_prefix(term: Term, distance: u8, transposition_cost_one: bool) -> FuzzyTermQuery {
FuzzyTermQuery {
term,
@@ -188,6 +188,8 @@ mod test {
}
let reader = index.reader().unwrap();
let searcher = reader.searcher();
// passes because Levenshtein distance is 1 (substitute 'o' with 'a')
{
let term = Term::from_field_text(country_field, "japon");
@@ -200,6 +202,18 @@ mod test {
assert_nearly_equals(1f32, score);
}
// fails because non-prefix Levenshtein distance is more than 1 (add 'a' and 'n')
{
let term = Term::from_field_text(country_field, "jap");
let fuzzy_query = FuzzyTermQuery::new(term, 1, true);
let top_docs = searcher
.search(&fuzzy_query, &TopDocs::with_limit(2))
.unwrap();
assert_eq!(top_docs.len(), 0, "Expected no document");
}
// passes because prefix Levenshtein distance is 0
{
let term = Term::from_field_text(country_field, "jap");

View File

@@ -1,4 +1,4 @@
use crate::docset::{DocSet, SkipResult};
use crate::docset::{DocSet, TERMINATED};
use crate::query::term_query::TermScorer;
use crate::query::EmptyScorer;
use crate::query::Scorer;
@@ -20,12 +20,14 @@ pub fn intersect_scorers(mut scorers: Vec<Box<dyn Scorer>>) -> Box<dyn Scorer> {
if scorers.len() == 1 {
return scorers.pop().unwrap();
}
scorers.sort_by_key(|scorer| scorer.size_hint());
let doc = go_to_first_doc(&mut scorers[..]);
if doc == TERMINATED {
return Box::new(EmptyScorer);
}
// We know that we have at least 2 elements.
let num_docsets = scorers.len();
scorers.sort_by(|left, right| right.size_hint().cmp(&left.size_hint()));
let left = scorers.pop().unwrap();
let right = scorers.pop().unwrap();
scorers.reverse();
let left = scorers.remove(0);
let right = scorers.remove(0);
let all_term_scorers = [&left, &right]
.iter()
.all(|&scorer| scorer.is::<TermScorer>());
@@ -34,14 +36,12 @@ pub fn intersect_scorers(mut scorers: Vec<Box<dyn Scorer>>) -> Box<dyn Scorer> {
left: *(left.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
right: *(right.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
others: scorers,
num_docsets,
});
}
Box::new(Intersection {
left,
right,
others: scorers,
num_docsets,
})
}
@@ -50,22 +50,34 @@ pub struct Intersection<TDocSet: DocSet, TOtherDocSet: DocSet = Box<dyn Scorer>>
left: TDocSet,
right: TDocSet,
others: Vec<TOtherDocSet>,
num_docsets: usize,
}
/// Positions every docset on the first document they all contain and returns it.
///
/// Starting from doc `0`, each docset is asked in turn to `seek` the current candidate. As
/// soon as one lands beyond the candidate, its doc becomes the new candidate and the scan
/// restarts from the first docset. The candidate is returned once no docset overshoots it.
fn go_to_first_doc<TDocSet: DocSet>(docsets: &mut [TDocSet]) -> DocId {
    let mut candidate = 0;
    loop {
        // Doc of the first docset that cannot match the current candidate, if any.
        let overshoot = docsets.iter_mut().find_map(|docset| {
            if docset.seek(candidate) > candidate {
                Some(docset.doc())
            } else {
                None
            }
        });
        match overshoot {
            Some(doc) => candidate = doc,
            None => return candidate,
        }
    }
}
impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
pub(crate) fn new(mut docsets: Vec<TDocSet>) -> Intersection<TDocSet, TDocSet> {
let num_docsets = docsets.len();
assert!(num_docsets >= 2);
docsets.sort_by(|left, right| right.size_hint().cmp(&left.size_hint()));
let left = docsets.pop().unwrap();
let right = docsets.pop().unwrap();
docsets.reverse();
docsets.sort_by_key(|docset| docset.size_hint());
go_to_first_doc(&mut docsets);
let left = docsets.remove(0);
let right = docsets.remove(0);
Intersection {
left,
right,
others: docsets,
num_docsets,
}
}
}
@@ -80,128 +92,49 @@ impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
}
}
impl<TDocSet: DocSet, TOtherDocSet: DocSet> Intersection<TDocSet, TOtherDocSet> {
pub(crate) fn docset_mut(&mut self, ord: usize) -> &mut dyn DocSet {
match ord {
0 => &mut self.left,
1 => &mut self.right,
n => &mut self.others[n - 2],
}
}
}
impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOtherDocSet> {
fn advance(&mut self) -> bool {
fn advance(&mut self) -> DocId {
let (left, right) = (&mut self.left, &mut self.right);
if !left.advance() {
return false;
}
let mut candidate = left.doc();
let mut other_candidate_ord: usize = usize::max_value();
let mut candidate = left.advance();
'outer: loop {
// In the first part we look for a document in the intersection
// of the two rarest `DocSet` in the intersection.
loop {
match right.skip_next(candidate) {
SkipResult::Reached => {
break;
}
SkipResult::OverStep => {
candidate = right.doc();
other_candidate_ord = usize::max_value();
}
SkipResult::End => {
return false;
}
}
match left.skip_next(candidate) {
SkipResult::Reached => {
break;
}
SkipResult::OverStep => {
candidate = left.doc();
other_candidate_ord = usize::max_value();
}
SkipResult::End => {
return false;
}
let right_doc = right.seek(candidate);
candidate = left.seek(right_doc);
if candidate == right_doc {
break;
}
}
debug_assert_eq!(left.doc(), right.doc());
// test the remaining scorers;
for (ord, docset) in self.others.iter_mut().enumerate() {
if ord == other_candidate_ord {
continue;
}
for docset in self.others.iter_mut() {
// `candidate_ord` is already at the
// right position.
//
// Calling `skip_next` would advance this docset
// and miss it.
match docset.skip_next(candidate) {
SkipResult::Reached => {}
SkipResult::OverStep => {
// this is not in the intersection,
// let's update our candidate.
candidate = docset.doc();
match left.skip_next(candidate) {
SkipResult::Reached => {
other_candidate_ord = ord;
}
SkipResult::OverStep => {
candidate = left.doc();
other_candidate_ord = usize::max_value();
}
SkipResult::End => {
return false;
}
}
continue 'outer;
}
SkipResult::End => {
return false;
}
let seek_doc = docset.seek(candidate);
if seek_doc > candidate {
candidate = left.seek(seek_doc);
continue 'outer;
}
}
return true;
return candidate;
}
}
fn skip_next(&mut self, target: DocId) -> SkipResult {
// We optimize skipping by skipping every single member
// of the intersection to target.
let mut current_target: DocId = target;
let mut current_ord = self.num_docsets;
'outer: loop {
for ord in 0..self.num_docsets {
let docset = self.docset_mut(ord);
if ord == current_ord {
continue;
}
match docset.skip_next(current_target) {
SkipResult::End => {
return SkipResult::End;
}
SkipResult::OverStep => {
// update the target
// for the remaining members of the intersection.
current_target = docset.doc();
current_ord = ord;
continue 'outer;
}
SkipResult::Reached => {}
}
}
if target == current_target {
return SkipResult::Reached;
} else {
assert!(current_target > target);
return SkipResult::OverStep;
}
fn seek(&mut self, target: DocId) -> DocId {
self.left.seek(target);
let mut docsets: Vec<&mut dyn DocSet> = vec![&mut self.left, &mut self.right];
for docset in &mut self.others {
docsets.push(docset);
}
go_to_first_doc(&mut docsets[..])
}
fn doc(&self) -> DocId {
@@ -228,7 +161,7 @@ where
#[cfg(test)]
mod tests {
use super::Intersection;
use crate::docset::{DocSet, SkipResult};
use crate::docset::{DocSet, TERMINATED};
use crate::postings::tests::test_skip_against_unoptimized;
use crate::query::VecDocSet;
@@ -238,20 +171,18 @@ mod tests {
let left = VecDocSet::from(vec![1, 3, 9]);
let right = VecDocSet::from(vec![3, 4, 9, 18]);
let mut intersection = Intersection::new(vec![left, right]);
assert!(intersection.advance());
assert_eq!(intersection.doc(), 3);
assert!(intersection.advance());
assert_eq!(intersection.advance(), 9);
assert_eq!(intersection.doc(), 9);
assert!(!intersection.advance());
assert_eq!(intersection.advance(), TERMINATED);
}
{
let a = VecDocSet::from(vec![1, 3, 9]);
let b = VecDocSet::from(vec![3, 4, 9, 18]);
let c = VecDocSet::from(vec![1, 5, 9, 111]);
let mut intersection = Intersection::new(vec![a, b, c]);
assert!(intersection.advance());
assert_eq!(intersection.doc(), 9);
assert!(!intersection.advance());
assert_eq!(intersection.advance(), TERMINATED);
}
}
@@ -260,8 +191,8 @@ mod tests {
let left = VecDocSet::from(vec![0]);
let right = VecDocSet::from(vec![0]);
let mut intersection = Intersection::new(vec![left, right]);
assert!(intersection.advance());
assert_eq!(intersection.doc(), 0);
assert_eq!(intersection.advance(), TERMINATED);
}
#[test]
@@ -269,7 +200,7 @@ mod tests {
let left = VecDocSet::from(vec![0, 1, 2, 4]);
let right = VecDocSet::from(vec![2, 5]);
let mut intersection = Intersection::new(vec![left, right]);
assert_eq!(intersection.skip_next(2), SkipResult::Reached);
assert_eq!(intersection.seek(2), 2);
assert_eq!(intersection.doc(), 2);
}
@@ -312,7 +243,7 @@ mod tests {
let a = VecDocSet::from(vec![1, 3]);
let b = VecDocSet::from(vec![1, 4]);
let c = VecDocSet::from(vec![3, 9]);
let mut intersection = Intersection::new(vec![a, b, c]);
assert!(!intersection.advance());
let intersection = Intersection::new(vec![a, b, c]);
assert_eq!(intersection.doc(), TERMINATED);
}
}

View File

@@ -3,8 +3,6 @@
mod all_query;
mod automaton_weight;
mod bitset;
mod block_max_scorer;
mod block_max_wand;
mod bm25;
mod boolean_query;
mod boost_query;
@@ -37,7 +35,6 @@ pub use self::vec_docset::VecDocSet;
pub use self::all_query::{AllQuery, AllScorer, AllWeight};
pub use self::automaton_weight::AutomatonWeight;
pub use self::bitset::BitSetDocSet;
pub use self::block_max_scorer::BlockMaxScorer;
pub use self::boolean_query::BooleanQuery;
pub use self::boost_query::BoostQuery;
pub use self::empty_query::{EmptyQuery, EmptyScorer, EmptyWeight};
@@ -59,8 +56,6 @@ pub use self::scorer::Scorer;
pub use self::term_query::TermQuery;
pub use self::weight::Weight;
pub use tantivy_query_grammar::Occur;
pub use self::weight::PruningScorerIfPossible;
#[cfg(test)]
mod tests {

View File

@@ -60,8 +60,8 @@ pub mod tests {
.map(|docaddr| docaddr.1)
.collect::<Vec<_>>()
};
assert_eq!(test_query(vec!["a", "b", "c"]), vec![2, 4]);
assert_eq!(test_query(vec!["a", "b"]), vec![1, 2, 3, 4]);
assert_eq!(test_query(vec!["a", "b", "c"]), vec![2, 4]);
assert_eq!(test_query(vec!["b", "b"]), vec![0, 1]);
assert!(test_query(vec!["g", "ewrwer"]).is_empty());
assert!(test_query(vec!["g", "a"]).is_empty());

View File

@@ -1,4 +1,4 @@
use crate::docset::{DocSet, SkipResult};
use crate::docset::{DocSet, TERMINATED};
use crate::fieldnorm::FieldNormReader;
use crate::postings::Postings;
use crate::query::bm25::BM25Weight;
@@ -25,12 +25,12 @@ impl<TPostings: Postings> PostingsWithOffset<TPostings> {
}
impl<TPostings: Postings> DocSet for PostingsWithOffset<TPostings> {
fn advance(&mut self) -> bool {
fn advance(&mut self) -> DocId {
self.postings.advance()
}
fn skip_next(&mut self, target: DocId) -> SkipResult {
self.postings.skip_next(target)
fn seek(&mut self, target: DocId) -> DocId {
self.postings.seek(target)
}
fn doc(&self) -> DocId {
@@ -149,7 +149,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
PostingsWithOffset::new(postings, (max_offset - offset) as u32)
})
.collect::<Vec<_>>();
PhraseScorer {
let mut scorer = PhraseScorer {
intersection_docset: Intersection::new(postings_with_offsets),
num_terms: num_docsets,
left: Vec::with_capacity(100),
@@ -158,7 +158,11 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
similarity_weight,
fieldnorm_reader,
score_needed,
};
if scorer.doc() != TERMINATED && !scorer.phrase_match() {
scorer.advance();
}
scorer
}
pub fn phrase_count(&self) -> u32 {
@@ -225,31 +229,21 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
}
impl<TPostings: Postings> DocSet for PhraseScorer<TPostings> {
fn advance(&mut self) -> bool {
while self.intersection_docset.advance() {
if self.phrase_match() {
return true;
fn advance(&mut self) -> DocId {
loop {
let doc = self.intersection_docset.advance();
if doc == TERMINATED || self.phrase_match() {
return doc;
}
}
false
}
fn skip_next(&mut self, target: DocId) -> SkipResult {
if self.intersection_docset.skip_next(target) == SkipResult::End {
return SkipResult::End;
}
if self.phrase_match() {
if self.doc() == target {
return SkipResult::Reached;
} else {
return SkipResult::OverStep;
}
}
if self.advance() {
SkipResult::OverStep
} else {
SkipResult::End
fn seek(&mut self, target: DocId) -> DocId {
let doc = self.intersection_docset.seek(target);
if doc == TERMINATED || self.phrase_match() {
return doc;
}
self.advance()
}
fn doc(&self) -> DocId {

View File

@@ -9,8 +9,8 @@ use crate::query::Weight;
use crate::query::{EmptyScorer, Explanation};
use crate::schema::IndexRecordOption;
use crate::schema::Term;
use crate::Result;
use crate::{DocId, DocSet};
use crate::{Result, SkipResult};
pub struct PhraseWeight {
phrase_terms: Vec<(usize, Term)>,
@@ -99,7 +99,7 @@ impl Weight for PhraseWeight {
return Err(does_not_match(doc));
}
let mut scorer = scorer_opt.unwrap();
if scorer.skip_next(doc) != SkipResult::Reached {
if scorer.seek(doc) != doc {
return Err(does_not_match(doc));
}
let fieldnorm_reader = self.fieldnorm_reader(reader);
@@ -114,6 +114,7 @@ impl Weight for PhraseWeight {
#[cfg(test)]
mod tests {
use super::super::tests::create_index;
use crate::docset::TERMINATED;
use crate::query::PhraseQuery;
use crate::{DocSet, Term};
@@ -132,12 +133,11 @@ mod tests {
.phrase_scorer(searcher.segment_reader(0u32), 1.0f32)
.unwrap()
.unwrap();
assert!(phrase_scorer.advance());
assert_eq!(phrase_scorer.doc(), 1);
assert_eq!(phrase_scorer.phrase_count(), 2);
assert!(phrase_scorer.advance());
assert_eq!(phrase_scorer.advance(), 2);
assert_eq!(phrase_scorer.doc(), 2);
assert_eq!(phrase_scorer.phrase_count(), 1);
assert!(!phrase_scorer.advance());
assert_eq!(phrase_scorer.advance(), TERMINATED);
}
}

View File

@@ -113,8 +113,9 @@ fn trim_ast(logical_ast: LogicalAST) -> Option<LogicalAST> {
/// The language covered by the current parser is extremely simple.
///
/// * simple terms: e.g. `Barack Obama` is simply tokenized using
/// tantivy's `StandardTokenizer`, hence becoming `["barack", "obama"]`.
/// The terms are then searched within the default terms of the query parser.
/// tantivy's [`SimpleTokenizer`](tantivy::tokenizer::SimpleTokenizer), hence
/// becoming `["barack", "obama"]`. The terms are then searched within
/// the default terms of the query parser.
///
/// e.g. If `body` and `title` are default fields, our example terms are
/// `["title:barack", "body:barack", "title:obama", "body:obama"]`.

View File

@@ -10,7 +10,7 @@ use crate::schema::Type;
use crate::schema::{Field, IndexRecordOption, Term};
use crate::termdict::{TermDictionary, TermStreamer};
use crate::DocId;
use crate::{Result, SkipResult};
use crate::Result;
use std::collections::Bound;
use std::ops::Range;
@@ -312,7 +312,7 @@ impl Weight for RangeWeight {
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0f32)?;
if scorer.skip_next(doc) != SkipResult::Reached {
if scorer.seek(doc) != doc {
return Err(does_not_match(doc));
}
Ok(Explanation::new("RangeQuery", 1.0f32))

View File

@@ -1,9 +1,8 @@
use crate::docset::{DocSet, SkipResult};
use crate::docset::DocSet;
use crate::query::score_combiner::ScoreCombiner;
use crate::query::Scorer;
use crate::DocId;
use crate::Score;
use std::cmp::Ordering;
use std::marker::PhantomData;
/// Given a required scorer and an optional scorer
@@ -17,7 +16,6 @@ pub struct RequiredOptionalScorer<TReqScorer, TOptScorer, TScoreCombiner> {
req_scorer: TReqScorer,
opt_scorer: TOptScorer,
score_cache: Option<Score>,
opt_finished: bool,
_phantom: PhantomData<TScoreCombiner>,
}
@@ -29,14 +27,12 @@ where
/// Creates a new `RequiredOptionalScorer`.
pub fn new(
req_scorer: TReqScorer,
mut opt_scorer: TOptScorer,
opt_scorer: TOptScorer,
) -> RequiredOptionalScorer<TReqScorer, TOptScorer, TScoreCombiner> {
let opt_finished = !opt_scorer.advance();
RequiredOptionalScorer {
req_scorer,
opt_scorer,
score_cache: None,
opt_finished,
_phantom: PhantomData,
}
}
@@ -48,7 +44,7 @@ where
TReqScorer: DocSet,
TOptScorer: DocSet,
{
fn advance(&mut self) -> bool {
fn advance(&mut self) -> DocId {
self.score_cache = None;
self.req_scorer.advance()
}
@@ -76,22 +72,8 @@ where
let doc = self.doc();
let mut score_combiner = TScoreCombiner::default();
score_combiner.update(&mut self.req_scorer);
if !self.opt_finished {
match self.opt_scorer.doc().cmp(&doc) {
Ordering::Greater => {}
Ordering::Equal => {
score_combiner.update(&mut self.opt_scorer);
}
Ordering::Less => match self.opt_scorer.skip_next(doc) {
SkipResult::Reached => {
score_combiner.update(&mut self.opt_scorer);
}
SkipResult::End => {
self.opt_finished = true;
}
SkipResult::OverStep => {}
},
}
if self.opt_scorer.seek(doc) == doc {
score_combiner.update(&mut self.opt_scorer);
}
let score = score_combiner.score();
self.score_cache = Some(score);
@@ -102,7 +84,7 @@ where
#[cfg(test)]
mod tests {
use super::RequiredOptionalScorer;
use crate::docset::DocSet;
use crate::docset::{DocSet, TERMINATED};
use crate::postings::tests::test_skip_against_unoptimized;
use crate::query::score_combiner::{DoNothingCombiner, SumCombiner};
use crate::query::ConstScorer;
@@ -119,9 +101,7 @@ mod tests {
ConstScorer::from(VecDocSet::from(vec![])),
);
let mut docs = vec![];
while reqoptscorer.advance() {
docs.push(reqoptscorer.doc());
}
reqoptscorer.for_each(&mut |doc, _| docs.push(doc));
assert_eq!(docs, req);
}
@@ -133,46 +113,45 @@ mod tests {
ConstScorer::new(VecDocSet::from(vec![1, 2, 7, 11, 12, 15]), 1.0f32),
);
{
assert!(reqoptscorer.advance());
assert_eq!(reqoptscorer.doc(), 1);
assert_eq!(reqoptscorer.score(), 2f32);
}
{
assert!(reqoptscorer.advance());
assert_eq!(reqoptscorer.advance(), 3);
assert_eq!(reqoptscorer.doc(), 3);
assert_eq!(reqoptscorer.score(), 1f32);
}
{
assert!(reqoptscorer.advance());
assert_eq!(reqoptscorer.advance(), 7);
assert_eq!(reqoptscorer.doc(), 7);
assert_eq!(reqoptscorer.score(), 2f32);
}
{
assert!(reqoptscorer.advance());
assert_eq!(reqoptscorer.advance(), 8);
assert_eq!(reqoptscorer.doc(), 8);
assert_eq!(reqoptscorer.score(), 1f32);
}
{
assert!(reqoptscorer.advance());
assert_eq!(reqoptscorer.advance(), 9);
assert_eq!(reqoptscorer.doc(), 9);
assert_eq!(reqoptscorer.score(), 1f32);
}
{
assert!(reqoptscorer.advance());
assert_eq!(reqoptscorer.advance(), 10);
assert_eq!(reqoptscorer.doc(), 10);
assert_eq!(reqoptscorer.score(), 1f32);
}
{
assert!(reqoptscorer.advance());
assert_eq!(reqoptscorer.advance(), 13);
assert_eq!(reqoptscorer.doc(), 13);
assert_eq!(reqoptscorer.score(), 1f32);
}
{
assert!(reqoptscorer.advance());
assert_eq!(reqoptscorer.advance(), 15);
assert_eq!(reqoptscorer.doc(), 15);
assert_eq!(reqoptscorer.score(), 2f32);
}
assert!(!reqoptscorer.advance());
assert_eq!(reqoptscorer.advance(), TERMINATED);
}
#[test]

View File

@@ -1,5 +1,4 @@
use crate::common::BitSet;
use crate::docset::{DocSet, SkipResult};
use crate::docset::{DocSet, TERMINATED};
use crate::DocId;
use crate::Score;
use downcast_rs::impl_downcast;
@@ -17,8 +16,35 @@ pub trait Scorer: downcast_rs::Downcast + DocSet + 'static {
/// Iterates through all of the documents matched by the
/// `DocSet` and passes each scored document to `callback`.
fn for_each(&mut self, callback: &mut dyn FnMut(DocId, Score)) {
while self.advance() {
callback(self.doc(), self.score());
let mut doc = self.doc();
while doc != TERMINATED {
callback(doc, self.score());
doc = self.advance();
}
}
/// Calls `callback` with all of the `(doc, score)` for which score
/// is exceeding a given threshold.
///
/// This method is useful for the TopDocs collector.
/// For all docsets, the blanket implementation has the benefit
/// of prefiltering (doc, score) pairs, avoiding the
/// virtual dispatch cost.
///
/// More importantly, it makes it possible for scorers to implement
/// important optimizations (e.g. BlockWAND for union).
fn for_each_pruning(
&mut self,
mut threshold: f32,
callback: &mut dyn FnMut(DocId, Score) -> Score,
) {
let mut doc = self.doc();
while doc != TERMINATED {
let score = self.score();
if score > threshold {
threshold = callback(doc, score);
}
doc = self.advance();
}
}
}
@@ -61,12 +87,12 @@ impl<TDocSet: DocSet> From<TDocSet> for ConstScorer<TDocSet> {
}
impl<TDocSet: DocSet> DocSet for ConstScorer<TDocSet> {
fn advance(&mut self) -> bool {
fn advance(&mut self) -> DocId {
self.docset.advance()
}
fn skip_next(&mut self, target: DocId) -> SkipResult {
self.docset.skip_next(target)
fn seek(&mut self, target: DocId) -> DocId {
self.docset.seek(target)
}
fn fill_buffer(&mut self, buffer: &mut [DocId]) -> usize {
@@ -80,10 +106,6 @@ impl<TDocSet: DocSet> DocSet for ConstScorer<TDocSet> {
fn size_hint(&self) -> u32 {
self.docset.size_hint()
}
fn append_to_bitset(&mut self, bitset: &mut BitSet) {
self.docset.append_to_bitset(bitset);
}
}
impl<TDocSet: DocSet + 'static> Scorer for ConstScorer<TDocSet> {

View File

@@ -1,98 +0,0 @@
use crate::docset::{DocSet, SkipResult};
use crate::query::{Explanation, Scorer};
use crate::DocId;
use crate::Score;
use crate::fieldnorm::FieldNormReader;
use crate::postings::Postings;
use crate::postings::{BlockMaxPostings, BlockMaxSegmentPostings};
use crate::query::bm25::BM25Weight;
use crate::query::BlockMaxScorer;
pub struct BlockMaxTermScorer {
postings: BlockMaxSegmentPostings,
fieldnorm_reader: FieldNormReader,
similarity_weight: BM25Weight,
}
impl BlockMaxTermScorer {
pub fn new(
postings: BlockMaxSegmentPostings,
fieldnorm_reader: FieldNormReader,
similarity_weight: BM25Weight,
) -> Self {
Self {
postings,
fieldnorm_reader,
similarity_weight,
}
}
}
impl BlockMaxTermScorer {
fn _score(&self, fieldnorm_id: u8, term_freq: u32) -> Score {
self.similarity_weight.score(fieldnorm_id, term_freq)
}
pub fn term_freq(&self) -> u32 {
self.postings.term_freq()
}
pub fn fieldnorm_id(&self) -> u8 {
self.fieldnorm_reader.fieldnorm_id(self.doc())
}
pub fn explain(&self) -> Explanation {
let fieldnorm_id = self.fieldnorm_id();
let term_freq = self.term_freq();
self.similarity_weight.explain(fieldnorm_id, term_freq)
}
}
impl DocSet for BlockMaxTermScorer {
fn advance(&mut self) -> bool {
self.postings.advance()
}
fn skip_next(&mut self, target: DocId) -> SkipResult {
self.postings.skip_next(target)
}
fn doc(&self) -> DocId {
self.postings.doc()
}
fn size_hint(&self) -> u32 {
self.postings.size_hint()
}
}
impl Scorer for BlockMaxTermScorer {
fn score(&mut self) -> Score {
self._score(
self.fieldnorm_reader.fieldnorm_id(self.doc()),
self.postings.term_freq(),
)
}
}
impl BlockMaxScorer for BlockMaxTermScorer {
fn block_max_score(&mut self) -> Score {
self._score(
self.fieldnorm_reader
.fieldnorm_id(self.postings.block_max_doc()),
self.postings.term_freq(),
)
}
fn block_max_doc(&mut self) -> DocId {
self.postings.block_max_doc()
}
fn max_score(&self) -> Score {
self._score(
self.fieldnorm_reader.fieldnorm_id(self.postings.max_doc()),
self.postings.max_term_freq(),
)
}
}

View File

@@ -1,9 +1,7 @@
mod block_max_term_scorer;
mod term_query;
mod term_scorer;
mod term_weight;
pub use self::block_max_term_scorer::BlockMaxTermScorer;
pub use self::term_query::TermQuery;
pub use self::term_scorer::TermScorer;
pub use self::term_weight::TermWeight;
@@ -28,10 +26,8 @@ mod tests {
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{
let doc = doc!(text_field => "a");
index_writer.add_document(doc);
}
let doc = doc!(text_field => "a");
index_writer.add_document(doc);
assert!(index_writer.commit().is_ok());
}
let searcher = index.reader().unwrap().searcher();
@@ -42,7 +38,6 @@ mod tests {
let term_weight = term_query.weight(&searcher, true).unwrap();
let segment_reader = searcher.segment_reader(0);
let mut term_scorer = term_weight.scorer(segment_reader, 1.0f32).unwrap();
assert!(term_scorer.advance());
assert_eq!(term_scorer.doc(), 0);
assert_eq!(term_scorer.score(), 0.28768212);
}

View File

@@ -1,4 +1,4 @@
use crate::docset::{DocSet, SkipResult};
use crate::docset::DocSet;
use crate::query::{Explanation, Scorer};
use crate::DocId;
use crate::Score;
@@ -45,12 +45,12 @@ impl TermScorer {
}
impl DocSet for TermScorer {
fn advance(&mut self) -> bool {
fn advance(&mut self) -> DocId {
self.postings.advance()
}
fn skip_next(&mut self, target: DocId) -> SkipResult {
self.postings.skip_next(target)
fn seek(&mut self, target: DocId) -> DocId {
self.postings.seek(target)
}
fn doc(&self) -> DocId {

View File

@@ -8,8 +8,8 @@ use crate::query::Weight;
use crate::query::{Explanation, Scorer};
use crate::schema::IndexRecordOption;
use crate::DocId;
use crate::Result;
use crate::Term;
use crate::{Result, SkipResult};
pub struct TermWeight {
term: Term,
@@ -25,7 +25,7 @@ impl Weight for TermWeight {
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
let mut scorer = self.scorer_specialized(reader, 1.0f32)?;
if scorer.skip_next(doc) != SkipResult::Reached {
if scorer.seek(doc) != doc {
return Err(does_not_match(doc));
}
Ok(scorer.explain())

View File

@@ -1,107 +1,13 @@
use crate::common::TinySet;
use crate::docset::{DocSet, SkipResult};
use crate::docset::{DocSet, TERMINATED};
use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
use crate::query::{Scorer, BlockMaxScorer};
use crate::query::Scorer;
use crate::DocId;
use crate::Score;
use std::cmp::Ordering;
const HORIZON_NUM_TINYBITSETS: usize = 64;
const HORIZON: u32 = 64u32 * HORIZON_NUM_TINYBITSETS as u32;
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
struct Pivot {
position: usize,
first_occurrence: usize,
doc: DocId,
}
/// Find the position in the sorted list of posting lists of the **pivot**.
///
/// docsets need to be advanced, and are required to be sorted by the doc they point to.
///
/// The pivot is then defined as the lowest DocId that has a chance of matching our condition.
fn find_pivot_position<'a, TScorer>(
mut docsets: impl Iterator<Item = &'a TScorer>,
lower_bound_score: Score,
) -> Option<Pivot>
where TScorer: BlockMaxScorer
{
let mut position = 0;
let mut upper_bound = Score::default();
while let Some(docset) = docsets.next() {
upper_bound += docset.max_score();
if lower_bound_score < upper_bound {
let pivot_doc = docset.doc();
let first_occurrence = position;
while let Some(docset) = docsets.next() {
if docset.doc() != pivot_doc {
break;
} else {
position += 1;
}
}
return Some(Pivot {
position,
doc: pivot_doc,
first_occurrence,
});
}
position += 1;
}
None
}
/// Sifts down the first element of the slice.
///
/// `docsets[1..]` are assumed sorted.
/// This function swaps `docsets[0]` with its right
/// neighbor successively -bubble sort style- until it reaches the first
/// position such that `docsets` is sorted.
fn sift_down<TScorer>(docsets: &mut [TScorer])
where
TScorer: BlockMaxScorer + Scorer,
{
for idx in 1..docsets.len() {
if docsets[idx].doc() >= docsets[idx - 1].doc() {
return;
}
docsets.swap(idx, idx - 1);
}
}
/// Given an iterator over all ordered lists up to the pivot (inclusive) and the following list (if
/// exists), it returns the next document ID that can be possibly relevant, based on the block max
/// scores.
fn find_next_relevant_doc<TScorer>(
docsets_up_to_pivot: &mut [TScorer],
pivot_docset: &mut TScorer,
docset_after_pivot: Option<&mut TScorer>,
) -> DocId
where
TScorer: BlockMaxScorer + Scorer,
{
let mut next_doc = 1 + docsets_up_to_pivot
.iter_mut()
.map(|docset| docset.block_max_doc())
.chain(std::iter::once(pivot_docset.block_max_doc()))
.min()
.unwrap();
if let Some(docset) = docset_after_pivot {
let doc = docset.doc();
if doc < next_doc {
next_doc = doc;
}
}
if next_doc <= pivot_docset.doc() {
pivot_docset.doc() + 1
} else {
next_doc
}
}
// `drain_filter` is not stable yet.
// This function is similar, except that it is not unstable, and
// it does not keep the original vector ordering.
@@ -132,7 +38,6 @@ pub struct Union<TScorer, TScoreCombiner = DoNothingCombiner> {
score: Score,
}
impl<TScorer, TScoreCombiner> From<Vec<TScorer>> for Union<TScorer, TScoreCombiner>
where
TScoreCombiner: ScoreCombiner,
@@ -141,17 +46,9 @@ where
fn from(docsets: Vec<TScorer>) -> Union<TScorer, TScoreCombiner> {
let non_empty_docsets: Vec<TScorer> = docsets
.into_iter()
.flat_map(
|mut docset| {
if docset.advance() {
Some(docset)
} else {
None
}
},
)
.filter(|docset| docset.doc() != TERMINATED)
.collect();
Union {
let mut union = Union {
docsets: non_empty_docsets,
bitsets: Box::new([TinySet::empty(); HORIZON_NUM_TINYBITSETS]),
scores: Box::new([TScoreCombiner::default(); HORIZON as usize]),
@@ -159,7 +56,13 @@ where
offset: 0,
doc: 0,
score: 0f32,
};
if union.refill() {
union.advance();
} else {
union.doc = TERMINATED;
}
union
}
}
@@ -180,7 +83,7 @@ fn refill<TScorer: Scorer, TScoreCombiner: ScoreCombiner>(
let delta = doc - min_doc;
bitsets[(delta / 64) as usize].insert_mut(delta % 64u32);
score_combiner[delta as usize].update(scorer);
if !scorer.advance() {
if scorer.advance() == TERMINATED {
// remove the docset, it has been entirely consumed.
return true;
}
@@ -193,6 +96,7 @@ impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Union<TScorer, TScoreCombin
if let Some(min_doc) = self.docsets.iter().map(DocSet::doc).min() {
self.offset = min_doc;
self.cursor = 0;
self.doc = min_doc;
refill(
&mut self.docsets,
&mut *self.bitsets,
@@ -220,76 +124,6 @@ impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Union<TScorer, TScoreCombin
}
false
}
}
impl<TScorer: BlockMaxScorer, TScoreCombiner: ScoreCombiner> Union<TScorer, TScoreCombiner> {
fn advance_with_pivot(&mut self, pivot: Pivot, lower_bound_score: Score) -> SkipResult {
let block_upper_bound: Score = self.docsets[..=pivot.position]
.iter_mut()
.map(|docset| docset.block_max_score())
.sum();
if block_upper_bound > lower_bound_score {
if pivot.doc == self.docsets[0].doc() {
// Since self.docsets is sorted by their current doc, in this branch, all
// docsets in [0..=pivot] are positioned on pivot.doc.
//
// Lets compute the actual score for this doc.
//
// NOTE(elshize): One additional check needs to be done to improve performance:
// update block-wise bound while accumulating score with the actual score,
// and check each time if still above threshold.
let mut combiner = TScoreCombiner::default();
for idx in (0..=pivot.position).rev() {
combiner.update(&mut self.docsets[idx]);
if !self.docsets[idx].advance() {
self.docsets.swap_remove(idx);
}
}
self.score = combiner.score();
self.doc = pivot.doc;
self.docsets.sort_by_key(TScorer::doc);
SkipResult::Reached
} else {
// The substraction does not underflow because otherwise we would go to the other
// branch.
//
// `advanced_idx` is the last idx that is not positionned on the pivot yet.
let advanced_idx = pivot.first_occurrence - 1;
if !self.docsets[advanced_idx].advance() {
self.docsets.swap_remove(advanced_idx);
}
if self.docsets.is_empty() {
return SkipResult::End;
}
sift_down(&mut self.docsets[advanced_idx..]);
SkipResult::OverStep
}
} else {
let (up_to_pivot, pivot_and_rest) = self.docsets.split_at_mut(pivot.position as usize);
let (pivot, after_pivot) = pivot_and_rest.split_first_mut().unwrap();
let next_doc = find_next_relevant_doc(up_to_pivot, pivot, after_pivot.first_mut());
// NOTE(elshize): It might be more efficient to advance the list with the higher
// max score, but let's advance the first one for now for simplicity.
if self.docsets[0].skip_next(next_doc) == SkipResult::End {
self.docsets.swap_remove(0);
}
if self.docsets.is_empty() {
return SkipResult::End;
}
sift_down(&mut self.docsets[..]);
SkipResult::OverStep
}
}
/// Find the position in the sorted list of posting lists of the **pivot**.
fn find_pivot_position(&self, lower_bound_score: Score) -> Option<Pivot> {
find_pivot_position(
self.docsets.iter().map(|docset| docset),
lower_bound_score)
}
}
impl<TScorer, TScoreCombiner> DocSet for Union<TScorer, TScoreCombiner>
@@ -297,30 +131,23 @@ where
TScorer: Scorer,
TScoreCombiner: ScoreCombiner,
{
fn advance(&mut self) -> bool {
fn advance(&mut self) -> DocId {
if self.advance_buffered() {
return true;
return self.doc;
}
if self.refill() {
self.advance();
true
} else {
false
if !self.refill() {
self.doc = TERMINATED;
return TERMINATED;
}
if !self.advance_buffered() {
return TERMINATED;
}
self.doc
}
fn skip_next(&mut self, target: DocId) -> SkipResult {
if !self.advance() {
return SkipResult::End;
}
match self.doc.cmp(&target) {
Ordering::Equal => {
return SkipResult::Reached;
}
Ordering::Greater => {
return SkipResult::OverStep;
}
Ordering::Less => {}
fn seek(&mut self, target: DocId) -> DocId {
if self.doc >= target {
return self.doc;
}
let gap = target - self.offset;
if gap < HORIZON {
@@ -338,18 +165,11 @@ where
// Advancing until we reach the end of the bucket
// or we reach a doc greater or equal to the target.
while self.advance() {
match self.doc().cmp(&target) {
Ordering::Equal => {
return SkipResult::Reached;
}
Ordering::Greater => {
return SkipResult::OverStep;
}
Ordering::Less => {}
}
let mut doc = self.doc();
while doc < target {
doc = self.advance();
}
SkipResult::End
doc
} else {
// clear the buffered info.
for obsolete_tinyset in self.bitsets.iter_mut() {
@@ -363,45 +183,30 @@ where
// advance all docsets to a doc >= to the target.
#[cfg_attr(feature = "cargo-clippy", allow(clippy::clippy::collapsible_if))]
unordered_drain_filter(&mut self.docsets, |docset| {
if docset.doc() < target {
if docset.skip_next(target) == SkipResult::End {
return true;
}
}
false
docset.seek(target) == TERMINATED
});
// at this point all of the docsets
// are positioned on a doc >= the target.
if self.refill() {
self.advance();
if self.doc() == target {
SkipResult::Reached
} else {
debug_assert!(self.doc() > target);
SkipResult::OverStep
}
} else {
SkipResult::End
if !self.refill() {
self.doc = TERMINATED;
return TERMINATED;
}
self.advance()
}
}
// TODO implement `count` efficiently.
fn doc(&self) -> DocId {
self.doc
}
fn size_hint(&self) -> u32 {
0u32
}
// TODO Also implement `count` with deletes efficiently.
fn count_including_deleted(&mut self) -> u32 {
if self.doc == TERMINATED {
return 0;
}
let mut count = self.bitsets[self.cursor..HORIZON_NUM_TINYBITSETS]
.iter()
.map(|bitset| bitset.len())
.sum::<u32>();
.sum::<u32>()
+ 1;
for bitset in self.bitsets.iter_mut() {
bitset.clear();
}
@@ -414,6 +219,18 @@ where
self.cursor = HORIZON_NUM_TINYBITSETS;
count
}
fn doc(&self) -> DocId {
self.doc
}
fn size_hint(&self) -> u32 {
self.docsets
.iter()
.map(|docset| docset.size_hint())
.max()
.unwrap_or(0u32)
}
}
impl<TScorer, TScoreCombiner> Scorer for Union<TScorer, TScoreCombiner>
@@ -431,7 +248,7 @@ mod tests {
use super::Union;
use super::HORIZON;
use crate::docset::{DocSet, SkipResult};
use crate::docset::{DocSet, TERMINATED};
use crate::postings::tests::test_skip_against_unoptimized;
use crate::query::score_combiner::DoNothingCombiner;
use crate::query::ConstScorer;
@@ -460,12 +277,12 @@ mod tests {
};
let mut union: Union<_, DoNothingCombiner> = make_union();
let mut count = 0;
while union.advance() {
assert!(union_expected.advance());
while union.doc() != TERMINATED {
assert_eq!(union_expected.doc(), union.doc());
assert_eq!(union_expected.advance(), union.advance());
count += 1;
}
assert!(!union_expected.advance());
assert_eq!(union_expected.advance(), TERMINATED);
assert_eq!(count, make_union().count_including_deleted());
}
@@ -493,9 +310,7 @@ mod tests {
fn test_aux_union_skip(docs_list: &[Vec<DocId>], skip_targets: Vec<DocId>) {
let mut btree_set = BTreeSet::new();
for docs in docs_list {
for &doc in docs.iter() {
btree_set.insert(doc);
}
btree_set.extend(docs.iter().cloned());
}
let docset_factory = || {
let res: Box<dyn DocSet> = Box::new(Union::<_, DoNothingCombiner>::from(
@@ -510,10 +325,10 @@ mod tests {
};
let mut docset = docset_factory();
for el in btree_set {
assert!(docset.advance());
assert_eq!(el, docset.doc());
docset.advance();
}
assert!(!docset.advance());
assert_eq!(docset.doc(), TERMINATED);
test_skip_against_unoptimized(docset_factory, skip_targets);
}
@@ -536,10 +351,10 @@ mod tests {
ConstScorer::from(VecDocSet::from(vec![0u32, 5u32])),
ConstScorer::from(VecDocSet::from(vec![1u32, 4u32])),
]);
assert!(docset.advance());
assert_eq!(docset.doc(), 0u32);
assert_eq!(docset.skip_next(0u32), SkipResult::OverStep);
assert_eq!(docset.doc(), 1u32)
assert_eq!(docset.seek(0u32), 0u32);
assert_eq!(docset.seek(0u32), 0u32);
assert_eq!(docset.doc(), 0u32)
}
#[test]

View File

@@ -1,9 +1,8 @@
#![allow(dead_code)]
use crate::common::HasLen;
use crate::docset::DocSet;
use crate::docset::{DocSet, TERMINATED};
use crate::DocId;
use std::num::Wrapping;
/// Simulates a `Postings` object from a `VecPostings`.
/// `VecPostings` only exist for testing purposes.
@@ -12,26 +11,30 @@ use std::num::Wrapping;
/// No positions are returned.
pub struct VecDocSet {
doc_ids: Vec<DocId>,
cursor: Wrapping<usize>,
cursor: usize,
}
impl From<Vec<DocId>> for VecDocSet {
fn from(doc_ids: Vec<DocId>) -> VecDocSet {
VecDocSet {
doc_ids,
cursor: Wrapping(usize::max_value()),
}
VecDocSet { doc_ids, cursor: 0 }
}
}
impl DocSet for VecDocSet {
fn advance(&mut self) -> bool {
self.cursor += Wrapping(1);
self.doc_ids.len() > self.cursor.0
fn advance(&mut self) -> DocId {
self.cursor += 1;
if self.cursor >= self.doc_ids.len() {
self.cursor = self.doc_ids.len();
return TERMINATED;
}
self.doc()
}
fn doc(&self) -> DocId {
self.doc_ids[self.cursor.0]
if self.cursor == self.doc_ids.len() {
return TERMINATED;
}
self.doc_ids[self.cursor]
}
fn size_hint(&self) -> u32 {
@@ -49,22 +52,21 @@ impl HasLen for VecDocSet {
pub mod tests {
use super::*;
use crate::docset::{DocSet, SkipResult};
use crate::docset::DocSet;
use crate::DocId;
#[test]
pub fn test_vec_postings() {
let doc_ids: Vec<DocId> = (0u32..1024u32).map(|e| e * 3).collect();
let mut postings = VecDocSet::from(doc_ids);
assert!(postings.advance());
assert_eq!(postings.doc(), 0u32);
assert!(postings.advance());
assert_eq!(postings.advance(), 3u32);
assert_eq!(postings.doc(), 3u32);
assert_eq!(postings.skip_next(14u32), SkipResult::OverStep);
assert_eq!(postings.seek(14u32), 15u32);
assert_eq!(postings.doc(), 15u32);
assert_eq!(postings.skip_next(300u32), SkipResult::Reached);
assert_eq!(postings.seek(300u32), 300u32);
assert_eq!(postings.doc(), 300u32);
assert_eq!(postings.skip_next(6000u32), SkipResult::End);
assert_eq!(postings.seek(6000u32), TERMINATED);
}
#[test]

View File

@@ -1,26 +1,7 @@
use super::Scorer;
use crate::core::SegmentReader;
use crate::query::Explanation;
use crate::{DocId, Score};
pub trait PruningScorer {
fn doc(&self) -> DocId;
fn score(&self) -> Score;
/// Advance to the next document that has a score strictly greater than
/// `lower_bound_score`.
fn advance_with_pruning(&mut self, score_lower_bound: f32) -> bool;
fn advance(&mut self) -> bool {
self.advance_with_pruning(std::f32::NEG_INFINITY)
}
}
pub enum PruningScorerIfPossible {
Pruning(Box<dyn PruningScorer>),
NonPruning(Box<dyn Scorer>)
}
use crate::DocId;
/// A Weight is the specialization of a Query
/// for a given set of segments.
@@ -34,11 +15,6 @@ pub trait Weight: Send + Sync + 'static {
/// See [`Query`](./trait.Query.html).
fn scorer(&self, reader: &SegmentReader, boost: f32) -> crate::Result<Box<dyn Scorer>>;
fn pruning_scorer(&self, reader: &SegmentReader, boost: f32) -> crate::Result<PruningScorerIfPossible> {
let scorer = self.scorer(reader, boost)?;
Ok(PruningScorerIfPossible::NonPruning(Box::new(scorer)))
}
/// Returns an `Explanation` for the given document.
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation>;

View File

@@ -12,13 +12,13 @@ pub struct Field(u32);
impl Field {
/// Create a new field object for the given FieldId.
pub fn from_field_id(field_id: u32) -> Field {
pub const fn from_field_id(field_id: u32) -> Field {
Field(field_id)
}
/// Returns a u32 identifying uniquely a field within a schema.
#[allow(clippy::trivially_copy_pass_by_ref)]
pub fn field_id(&self) -> u32 {
pub const fn field_id(&self) -> u32 {
self.0
}
}