mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-28 13:02:55 +00:00
Compare commits
21 Commits
cutt
...
removedali
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8861919d5f | ||
|
|
c0f5645cd9 | ||
|
|
cbff874e43 | ||
|
|
baf015fc57 | ||
|
|
7275ebdf3c | ||
|
|
b974e7ce34 | ||
|
|
8f8f34499f | ||
|
|
6ea6f4bfcd | ||
|
|
e25284bafe | ||
|
|
8b67877cd5 | ||
|
|
9de1360538 | ||
|
|
c55db83609 | ||
|
|
1e5ebdbf3c | ||
|
|
9a2090ab21 | ||
|
|
e4aaacdb86 | ||
|
|
29acf1104d | ||
|
|
3d34fa0b69 | ||
|
|
77f363987a | ||
|
|
c0be461191 | ||
|
|
1fb562f44a | ||
|
|
c591d0e591 |
16
CHANGELOG.md
16
CHANGELOG.md
@@ -1,6 +1,22 @@
|
||||
Tantivy 0.13.0
|
||||
======================
|
||||
- Bugfix in `FuzzyTermQuery` not matching terms by prefix when it should (@Peachball)
|
||||
- Relaxed constraints on the custom/tweak score functions. At the segment level, they can be mut, and they are not required to be Sync + Send.
|
||||
- `MMapDirectory::open` does not return a `Result` anymore.
|
||||
- Change in the DocSet and Scorer API. (@fulmicoton).
|
||||
A freshly created DocSet points directly to its first doc. A sentinel value called TERMINATED marks the end of a DocSet.
|
||||
`.advance()` returns the new DocId. `Scorer::skip(target)` has been replaced by `Scorer::seek(target)` and returns the resulting DocId.
|
||||
As a result, iterating through a DocSet now looks as follows:
|
||||
```rust
|
||||
let mut doc = docset.doc();
|
||||
while doc != TERMINATED {
|
||||
// ...
|
||||
doc = docset.advance();
|
||||
}
|
||||
```
|
||||
The change made it possible to greatly simplify a lot of the docset's code.
|
||||
- Misc internal optimization and introduction of the `Scorer::for_each_pruning` function. (@fulmicoton)
|
||||
- Added an offset option to the Top(.*)Collectors. (@robyoung)
|
||||
|
||||
Tantivy 0.12.0
|
||||
======================
|
||||
|
||||
@@ -18,7 +18,7 @@ byteorder = "1.0"
|
||||
crc32fast = "1.2.0"
|
||||
once_cell = "1.0"
|
||||
regex ={version = "1.3.0", default-features = false, features = ["std"]}
|
||||
tantivy-fst = {path="../tantivy-fst", version="0.3"}
|
||||
tantivy-fst = "0.3"
|
||||
memmap = {version = "0.7", optional=true}
|
||||
lz4 = {version="1.20", optional=true}
|
||||
snap = "1"
|
||||
@@ -45,7 +45,7 @@ fnv = "1.0.6"
|
||||
owned-read = "0.4"
|
||||
failure = "0.1"
|
||||
htmlescape = "0.3.1"
|
||||
fail = "0.3"
|
||||
fail = "0.4"
|
||||
murmurhash32 = "0.2"
|
||||
chrono = "0.4"
|
||||
smallvec = "1.0"
|
||||
@@ -60,7 +60,7 @@ maplit = "1"
|
||||
matches = "0.1.8"
|
||||
|
||||
[dev-dependencies.fail]
|
||||
version = "0.3"
|
||||
version = "0.4"
|
||||
features = ["failpoints"]
|
||||
|
||||
[profile.release]
|
||||
|
||||
10
README.md
10
README.md
@@ -31,12 +31,16 @@ Tantivy is, in fact, strongly inspired by Lucene's design.
|
||||
|
||||
# Benchmark
|
||||
|
||||
Tantivy is typically faster than Lucene, but the results depend on
|
||||
the nature of the queries in your workload.
|
||||
|
||||
The following [benchmark](https://tantivy-search.github.io/bench/) breaks down
|
||||
performance for different types of queries / collections.
|
||||
|
||||
|
||||
In general, Tantivy tends to be
|
||||
- slower than Lucene on union with a Top-K due to Block-WAND optimization.
|
||||
- faster than Lucene on intersection and phrase queries.
|
||||
|
||||
Your mileage WILL vary depending on the nature of queries and their load.
|
||||
|
||||
# Features
|
||||
|
||||
- Full-text search
|
||||
|
||||
98
examples/faceted_search_with_tweaked_score.rs
Normal file
98
examples/faceted_search_with_tweaked_score.rs
Normal file
@@ -0,0 +1,98 @@
|
||||
use std::collections::HashSet;
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::doc;
|
||||
use tantivy::query::BooleanQuery;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::{DocId, Index, Score, SegmentReader};
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
let title = schema_builder.add_text_field("title", STORED);
|
||||
let ingredient = schema_builder.add_facet_field("ingredient");
|
||||
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
|
||||
let mut index_writer = index.writer(30_000_000)?;
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
title => "Fried egg",
|
||||
ingredient => Facet::from("/ingredient/egg"),
|
||||
ingredient => Facet::from("/ingredient/oil"),
|
||||
));
|
||||
index_writer.add_document(doc!(
|
||||
title => "Scrambled egg",
|
||||
ingredient => Facet::from("/ingredient/egg"),
|
||||
ingredient => Facet::from("/ingredient/butter"),
|
||||
ingredient => Facet::from("/ingredient/milk"),
|
||||
ingredient => Facet::from("/ingredient/salt"),
|
||||
));
|
||||
index_writer.add_document(doc!(
|
||||
title => "Egg rolls",
|
||||
ingredient => Facet::from("/ingredient/egg"),
|
||||
ingredient => Facet::from("/ingredient/garlic"),
|
||||
ingredient => Facet::from("/ingredient/salt"),
|
||||
ingredient => Facet::from("/ingredient/oil"),
|
||||
ingredient => Facet::from("/ingredient/tortilla-wrap"),
|
||||
ingredient => Facet::from("/ingredient/mushroom"),
|
||||
));
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
{
|
||||
let facets = vec![
|
||||
Facet::from("/ingredient/egg"),
|
||||
Facet::from("/ingredient/oil"),
|
||||
Facet::from("/ingredient/garlic"),
|
||||
Facet::from("/ingredient/mushroom"),
|
||||
];
|
||||
let query = BooleanQuery::new_multiterms_query(
|
||||
facets
|
||||
.iter()
|
||||
.map(|key| Term::from_facet(ingredient, &key))
|
||||
.collect(),
|
||||
);
|
||||
let top_docs_by_custom_score =
|
||||
TopDocs::with_limit(2).tweak_score(move |segment_reader: &SegmentReader| {
|
||||
let mut ingredient_reader = segment_reader.facet_reader(ingredient).unwrap();
|
||||
let facet_dict = ingredient_reader.facet_dict();
|
||||
|
||||
let query_ords: HashSet<u64> = facets
|
||||
.iter()
|
||||
.filter_map(|key| facet_dict.term_ord(key.encoded_str()))
|
||||
.collect();
|
||||
|
||||
let mut facet_ords_buffer: Vec<u64> = Vec::with_capacity(20);
|
||||
|
||||
move |doc: DocId, original_score: Score| {
|
||||
ingredient_reader.facet_ords(doc, &mut facet_ords_buffer);
|
||||
let missing_ingredients = facet_ords_buffer
|
||||
.iter()
|
||||
.filter(|ord| !query_ords.contains(ord))
|
||||
.count();
|
||||
let tweak = 1.0 / 4_f32.powi(missing_ingredients as i32);
|
||||
|
||||
original_score * tweak
|
||||
}
|
||||
});
|
||||
let top_docs = searcher.search(&query, &top_docs_by_custom_score)?;
|
||||
|
||||
let titles: Vec<String> = top_docs
|
||||
.iter()
|
||||
.map(|(_, doc_id)| {
|
||||
searcher
|
||||
.doc(*doc_id)
|
||||
.unwrap()
|
||||
.get_first(title)
|
||||
.unwrap()
|
||||
.text()
|
||||
.unwrap()
|
||||
.to_owned()
|
||||
})
|
||||
.collect();
|
||||
assert_eq!(titles, vec!["Fried egg", "Egg rolls"]);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -10,7 +10,7 @@
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
use tantivy::schema::*;
|
||||
use tantivy::{doc, DocId, DocSet, Index, Postings};
|
||||
use tantivy::{doc, DocSet, Index, Postings, TERMINATED};
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// We first create a schema for the sake of the
|
||||
@@ -62,12 +62,11 @@ fn main() -> tantivy::Result<()> {
|
||||
{
|
||||
// this buffer will be used to request positions
|
||||
let mut positions: Vec<u32> = Vec::with_capacity(100);
|
||||
while segment_postings.advance() {
|
||||
// the number of times the term appears in the document.
|
||||
let doc_id: DocId = segment_postings.doc(); //< do not try to access this before calling advance once.
|
||||
|
||||
let mut doc_id = segment_postings.doc();
|
||||
while doc_id != TERMINATED {
|
||||
// This MAY contain deleted documents as well.
|
||||
if segment_reader.is_deleted(doc_id) {
|
||||
doc_id = segment_postings.advance();
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -86,6 +85,7 @@ fn main() -> tantivy::Result<()> {
|
||||
// Doc 2: TermFreq 1: [0]
|
||||
// ```
|
||||
println!("Doc {}: TermFreq {}: {:?}", doc_id, term_freq, positions);
|
||||
doc_id = segment_postings.advance();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,13 +11,13 @@ impl<TCustomScorer, TScore> CustomScoreTopCollector<TCustomScorer, TScore>
|
||||
where
|
||||
TScore: Clone + PartialOrd,
|
||||
{
|
||||
pub fn new(
|
||||
pub(crate) fn new(
|
||||
custom_scorer: TCustomScorer,
|
||||
limit: usize,
|
||||
collector: TopCollector<TScore>,
|
||||
) -> CustomScoreTopCollector<TCustomScorer, TScore> {
|
||||
CustomScoreTopCollector {
|
||||
custom_scorer,
|
||||
collector: TopCollector::with_limit(limit),
|
||||
collector,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -28,7 +28,7 @@ where
|
||||
/// It is the segment local version of the [`CustomScorer`](./trait.CustomScorer.html).
|
||||
pub trait CustomSegmentScorer<TScore>: 'static {
|
||||
/// Computes the score of a specific `doc`.
|
||||
fn score(&self, doc: DocId) -> TScore;
|
||||
fn score(&mut self, doc: DocId) -> TScore;
|
||||
}
|
||||
|
||||
/// `CustomScorer` makes it possible to define any kind of score.
|
||||
@@ -117,9 +117,9 @@ where
|
||||
|
||||
impl<F, TScore> CustomSegmentScorer<TScore> for F
|
||||
where
|
||||
F: 'static + Sync + Send + Fn(DocId) -> TScore,
|
||||
F: 'static + FnMut(DocId) -> TScore,
|
||||
{
|
||||
fn score(&self, doc: DocId) -> TScore {
|
||||
fn score(&mut self, doc: DocId) -> TScore {
|
||||
(self)(doc)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use crate::collector::Collector;
|
||||
use crate::collector::SegmentCollector;
|
||||
use crate::docset::SkipResult;
|
||||
use crate::fastfield::FacetReader;
|
||||
use crate::schema::Facet;
|
||||
use crate::schema::Field;
|
||||
@@ -188,6 +187,11 @@ pub struct FacetSegmentCollector {
|
||||
collapse_facet_ords: Vec<u64>,
|
||||
}
|
||||
|
||||
enum SkipResult {
|
||||
Found,
|
||||
NotFound,
|
||||
}
|
||||
|
||||
fn skip<'a, I: Iterator<Item = &'a Facet>>(
|
||||
target: &[u8],
|
||||
collapse_it: &mut Peekable<I>,
|
||||
@@ -197,14 +201,14 @@ fn skip<'a, I: Iterator<Item = &'a Facet>>(
|
||||
Some(facet_bytes) => match facet_bytes.encoded_str().as_bytes().cmp(target) {
|
||||
Ordering::Less => {}
|
||||
Ordering::Greater => {
|
||||
return SkipResult::OverStep;
|
||||
return SkipResult::NotFound;
|
||||
}
|
||||
Ordering::Equal => {
|
||||
return SkipResult::Reached;
|
||||
return SkipResult::Found;
|
||||
}
|
||||
},
|
||||
None => {
|
||||
return SkipResult::End;
|
||||
return SkipResult::NotFound;
|
||||
}
|
||||
}
|
||||
collapse_it.next();
|
||||
@@ -281,7 +285,7 @@ impl Collector for FacetCollector {
|
||||
// is positioned on a term that has not been processed yet.
|
||||
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
|
||||
match skip_result {
|
||||
SkipResult::Reached => {
|
||||
SkipResult::Found => {
|
||||
// we reach a facet we decided to collapse.
|
||||
let collapse_depth = facet_depth(facet_streamer.key());
|
||||
let mut collapsed_id = 0;
|
||||
@@ -301,7 +305,7 @@ impl Collector for FacetCollector {
|
||||
}
|
||||
break;
|
||||
}
|
||||
SkipResult::End | SkipResult::OverStep => {
|
||||
SkipResult::NotFound => {
|
||||
collapse_mapping.push(0);
|
||||
if !facet_streamer.advance() {
|
||||
break;
|
||||
|
||||
@@ -109,6 +109,7 @@ pub use self::tweak_score_top_collector::{ScoreSegmentTweaker, ScoreTweaker};
|
||||
|
||||
mod facet_collector;
|
||||
pub use self::facet_collector::FacetCollector;
|
||||
use crate::query::Weight;
|
||||
|
||||
/// `Fruit` is the type for the result of our collection.
|
||||
/// e.g. `usize` for the `Count` collector.
|
||||
@@ -154,6 +155,29 @@ pub trait Collector: Sync {
|
||||
/// Combines the fruit associated to the collection of each segments
|
||||
/// into one fruit.
|
||||
fn merge_fruits(&self, segment_fruits: Vec<Self::Fruit>) -> crate::Result<Self::Fruit>;
|
||||
|
||||
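/// Creates a segment collector for the segment, feeds it every live (non-deleted) document scored by `weight`, and returns its harvested fruit.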
|
||||
fn collect_segment(
|
||||
&self,
|
||||
weight: &dyn Weight,
|
||||
segment_ord: u32,
|
||||
reader: &SegmentReader,
|
||||
) -> crate::Result<<Self::Child as SegmentCollector>::Fruit> {
|
||||
let mut segment_collector = self.for_segment(segment_ord as u32, reader)?;
|
||||
|
||||
if let Some(delete_bitset) = reader.delete_bitset() {
|
||||
weight.for_each(reader, &mut |doc, score| {
|
||||
if delete_bitset.is_alive(doc) {
|
||||
segment_collector.collect(doc, score);
|
||||
}
|
||||
})?;
|
||||
} else {
|
||||
weight.for_each(reader, &mut |doc, score| {
|
||||
segment_collector.collect(doc, score);
|
||||
})?;
|
||||
}
|
||||
Ok(segment_collector.harvest())
|
||||
}
|
||||
}
|
||||
|
||||
/// The `SegmentCollector` is the trait in charge of defining the
|
||||
|
||||
@@ -18,9 +18,9 @@ use std::collections::BinaryHeap;
|
||||
/// Two elements are equal if their feature is equal, and regardless of whether `doc`
|
||||
/// is equal. This should be perfectly fine for this usage, but let's make sure this
|
||||
/// struct is never public.
|
||||
struct ComparableDoc<T, D> {
|
||||
feature: T,
|
||||
doc: D,
|
||||
pub(crate) struct ComparableDoc<T, D> {
|
||||
pub feature: T,
|
||||
pub doc: D,
|
||||
}
|
||||
|
||||
impl<T: PartialOrd, D: PartialOrd> PartialOrd for ComparableDoc<T, D> {
|
||||
@@ -56,7 +56,8 @@ impl<T: PartialOrd, D: PartialOrd> PartialEq for ComparableDoc<T, D> {
|
||||
impl<T: PartialOrd, D: PartialOrd> Eq for ComparableDoc<T, D> {}
|
||||
|
||||
pub(crate) struct TopCollector<T> {
|
||||
limit: usize,
|
||||
pub limit: usize,
|
||||
pub offset: usize,
|
||||
_marker: PhantomData<T>,
|
||||
}
|
||||
|
||||
@@ -72,14 +73,20 @@ where
|
||||
if limit < 1 {
|
||||
panic!("Limit must be strictly greater than 0.");
|
||||
}
|
||||
TopCollector {
|
||||
Self {
|
||||
limit,
|
||||
offset: 0,
|
||||
_marker: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn limit(&self) -> usize {
|
||||
self.limit
|
||||
/// Skip the first "offset" documents when collecting.
|
||||
///
|
||||
/// This is equivalent to `OFFSET` in MySQL or PostgreSQL and `start` in
|
||||
/// Lucene's TopDocsCollector.
|
||||
pub fn and_offset(mut self, offset: usize) -> TopCollector<T> {
|
||||
self.offset = offset;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn merge_fruits(
|
||||
@@ -92,7 +99,7 @@ where
|
||||
let mut top_collector = BinaryHeap::new();
|
||||
for child_fruit in children {
|
||||
for (feature, doc) in child_fruit {
|
||||
if top_collector.len() < self.limit {
|
||||
if top_collector.len() < (self.limit + self.offset) {
|
||||
top_collector.push(ComparableDoc { feature, doc });
|
||||
} else if let Some(mut head) = top_collector.peek_mut() {
|
||||
if head.feature < feature {
|
||||
@@ -104,6 +111,7 @@ where
|
||||
Ok(top_collector
|
||||
.into_sorted_vec()
|
||||
.into_iter()
|
||||
.skip(self.offset)
|
||||
.map(|cdoc| (cdoc.feature, cdoc.doc))
|
||||
.collect())
|
||||
}
|
||||
@@ -113,7 +121,23 @@ where
|
||||
segment_id: SegmentLocalId,
|
||||
_: &SegmentReader,
|
||||
) -> crate::Result<TopSegmentCollector<F>> {
|
||||
Ok(TopSegmentCollector::new(segment_id, self.limit))
|
||||
Ok(TopSegmentCollector::new(
|
||||
segment_id,
|
||||
self.limit + self.offset,
|
||||
))
|
||||
}
|
||||
|
||||
/// Create a new TopCollector with the same limit and offset.
|
||||
///
|
||||
/// Ideally we would use Into but the blanket implementation seems to cause the Scorer traits
|
||||
/// to fail.
|
||||
#[doc(hidden)]
|
||||
pub(crate) fn into_tscore<TScore: PartialOrd + Clone>(self) -> TopCollector<TScore> {
|
||||
TopCollector {
|
||||
limit: self.limit,
|
||||
offset: self.offset,
|
||||
_marker: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -187,7 +211,7 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::TopSegmentCollector;
|
||||
use super::{TopCollector, TopSegmentCollector};
|
||||
use crate::DocAddress;
|
||||
|
||||
#[test]
|
||||
@@ -248,6 +272,48 @@ mod tests {
|
||||
top_collector_limit_3.harvest()[..2].to_vec(),
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_top_collector_with_limit_and_offset() {
|
||||
let collector = TopCollector::with_limit(2).and_offset(1);
|
||||
|
||||
let results = collector
|
||||
.merge_fruits(vec![vec![
|
||||
(0.9, DocAddress(0, 1)),
|
||||
(0.8, DocAddress(0, 2)),
|
||||
(0.7, DocAddress(0, 3)),
|
||||
(0.6, DocAddress(0, 4)),
|
||||
(0.5, DocAddress(0, 5)),
|
||||
]])
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
results,
|
||||
vec![(0.8, DocAddress(0, 2)), (0.7, DocAddress(0, 3)),]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_top_collector_with_limit_larger_than_set_and_offset() {
|
||||
let collector = TopCollector::with_limit(2).and_offset(1);
|
||||
|
||||
let results = collector
|
||||
.merge_fruits(vec![vec![(0.9, DocAddress(0, 1)), (0.8, DocAddress(0, 2))]])
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(results, vec![(0.8, DocAddress(0, 2)),]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_top_collector_with_limit_and_offset_larger_than_set() {
|
||||
let collector = TopCollector::with_limit(2).and_offset(20);
|
||||
|
||||
let results = collector
|
||||
.merge_fruits(vec![vec![(0.9, DocAddress(0, 1)), (0.8, DocAddress(0, 2))]])
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(results, vec![]);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
|
||||
@@ -1,18 +1,20 @@
|
||||
use super::Collector;
|
||||
use crate::collector::custom_score_top_collector::CustomScoreTopCollector;
|
||||
use crate::collector::top_collector::TopCollector;
|
||||
use crate::collector::top_collector::TopSegmentCollector;
|
||||
use crate::collector::top_collector::{ComparableDoc, TopCollector};
|
||||
use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector;
|
||||
use crate::collector::{
|
||||
CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector,
|
||||
};
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::query::Weight;
|
||||
use crate::schema::Field;
|
||||
use crate::DocAddress;
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
use crate::SegmentLocalId;
|
||||
use crate::SegmentReader;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::fmt;
|
||||
|
||||
/// The `TopDocs` collector keeps track of the top `K` documents
|
||||
@@ -57,7 +59,11 @@ pub struct TopDocs(TopCollector<Score>);
|
||||
|
||||
impl fmt::Debug for TopDocs {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "TopDocs({})", self.0.limit())
|
||||
write!(
|
||||
f,
|
||||
"TopDocs(limit={}, offset={})",
|
||||
self.0.limit, self.0.offset
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -66,7 +72,7 @@ struct ScorerByFastFieldReader {
|
||||
}
|
||||
|
||||
impl CustomSegmentScorer<u64> for ScorerByFastFieldReader {
|
||||
fn score(&self, doc: DocId) -> u64 {
|
||||
fn score(&mut self, doc: DocId) -> u64 {
|
||||
self.ff_reader.get_u64(u64::from(doc))
|
||||
}
|
||||
}
|
||||
@@ -101,6 +107,45 @@ impl TopDocs {
|
||||
TopDocs(TopCollector::with_limit(limit))
|
||||
}
|
||||
|
||||
/// Skip the first "offset" documents when collecting.
|
||||
///
|
||||
/// This is equivalent to `OFFSET` in MySQL or PostgreSQL and `start` in
|
||||
/// Lucene's TopDocsCollector.
|
||||
///
|
||||
/// ```rust
|
||||
/// use tantivy::collector::TopDocs;
|
||||
/// use tantivy::query::QueryParser;
|
||||
/// use tantivy::schema::{Schema, TEXT};
|
||||
/// use tantivy::{doc, DocAddress, Index};
|
||||
///
|
||||
/// let mut schema_builder = Schema::builder();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
///
|
||||
/// let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
/// index_writer.add_document(doc!(title => "The Name of the Wind"));
|
||||
/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
|
||||
/// index_writer.add_document(doc!(title => "A Dairy Cow"));
|
||||
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
|
||||
/// index_writer.add_document(doc!(title => "The Diary of Lena Mukhina"));
|
||||
/// assert!(index_writer.commit().is_ok());
|
||||
///
|
||||
/// let reader = index.reader().unwrap();
|
||||
/// let searcher = reader.searcher();
|
||||
///
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query = query_parser.parse_query("diary").unwrap();
|
||||
/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2).and_offset(1)).unwrap();
|
||||
///
|
||||
/// assert_eq!(top_docs.len(), 2);
|
||||
/// assert_eq!(&top_docs[0], &(0.5204813, DocAddress(0, 4)));
|
||||
/// assert_eq!(&top_docs[1], &(0.4793185, DocAddress(0, 3)));
|
||||
/// ```
|
||||
pub fn and_offset(self, offset: usize) -> TopDocs {
|
||||
TopDocs(self.0.and_offset(offset))
|
||||
}
|
||||
|
||||
/// Set top-K to rank documents by a given fast field.
|
||||
///
|
||||
/// ```rust
|
||||
@@ -281,7 +326,7 @@ impl TopDocs {
|
||||
TScoreSegmentTweaker: ScoreSegmentTweaker<TScore> + 'static,
|
||||
TScoreTweaker: ScoreTweaker<TScore, Child = TScoreSegmentTweaker>,
|
||||
{
|
||||
TweakedScoreTopCollector::new(score_tweaker, self.0.limit())
|
||||
TweakedScoreTopCollector::new(score_tweaker, self.0.into_tscore())
|
||||
}
|
||||
|
||||
/// Ranks the documents using a custom score.
|
||||
@@ -395,7 +440,7 @@ impl TopDocs {
|
||||
TCustomSegmentScorer: CustomSegmentScorer<TScore> + 'static,
|
||||
TCustomScorer: CustomScorer<TScore, Child = TCustomSegmentScorer>,
|
||||
{
|
||||
CustomScoreTopCollector::new(custom_score, self.0.limit())
|
||||
CustomScoreTopCollector::new(custom_score, self.0.into_tscore())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -423,6 +468,64 @@ impl Collector for TopDocs {
|
||||
) -> crate::Result<Self::Fruit> {
|
||||
self.0.merge_fruits(child_fruits)
|
||||
}
|
||||
|
||||
fn collect_segment(
|
||||
&self,
|
||||
weight: &dyn Weight,
|
||||
segment_ord: u32,
|
||||
reader: &SegmentReader,
|
||||
) -> crate::Result<<Self::Child as SegmentCollector>::Fruit> {
|
||||
let heap_len = self.0.limit + self.0.offset;
|
||||
let mut heap: BinaryHeap<ComparableDoc<Score, DocId>> = BinaryHeap::with_capacity(heap_len);
|
||||
|
||||
if let Some(delete_bitset) = reader.delete_bitset() {
|
||||
let mut threshold = f32::MIN;
|
||||
weight.for_each_pruning(threshold, reader, &mut |doc, score| {
|
||||
if delete_bitset.is_deleted(doc) {
|
||||
return threshold;
|
||||
}
|
||||
let heap_item = ComparableDoc {
|
||||
feature: score,
|
||||
doc,
|
||||
};
|
||||
if heap.len() < heap_len {
|
||||
heap.push(heap_item);
|
||||
if heap.len() == heap_len {
|
||||
threshold = heap.peek().map(|el| el.feature).unwrap_or(f32::MIN);
|
||||
}
|
||||
return threshold;
|
||||
}
|
||||
*heap.peek_mut().unwrap() = heap_item;
|
||||
threshold = heap.peek().map(|el| el.feature).unwrap_or(std::f32::MIN);
|
||||
threshold
|
||||
})?;
|
||||
} else {
|
||||
weight.for_each_pruning(f32::MIN, reader, &mut |doc, score| {
|
||||
let heap_item = ComparableDoc {
|
||||
feature: score,
|
||||
doc,
|
||||
};
|
||||
if heap.len() < heap_len {
|
||||
heap.push(heap_item);
|
||||
// TODO the threshold is suboptimal for heap.len == heap_len
|
||||
if heap.len() == heap_len {
|
||||
return heap.peek().map(|el| el.feature).unwrap_or(f32::MIN);
|
||||
} else {
|
||||
return f32::MIN;
|
||||
}
|
||||
}
|
||||
*heap.peek_mut().unwrap() = heap_item;
|
||||
heap.peek().map(|el| el.feature).unwrap_or(std::f32::MIN)
|
||||
})?;
|
||||
}
|
||||
|
||||
let fruit = heap
|
||||
.into_sorted_vec()
|
||||
.into_iter()
|
||||
.map(|cid| (cid.feature, DocAddress(segment_ord, cid.doc)))
|
||||
.collect();
|
||||
Ok(fruit)
|
||||
}
|
||||
}
|
||||
|
||||
/// Segment Collector associated to `TopDocs`.
|
||||
@@ -432,7 +535,7 @@ impl SegmentCollector for TopScoreSegmentCollector {
|
||||
type Fruit = Vec<(Score, DocAddress)>;
|
||||
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
self.0.collect(doc, score)
|
||||
self.0.collect(doc, score);
|
||||
}
|
||||
|
||||
fn harvest(self) -> Vec<(Score, DocAddress)> {
|
||||
@@ -446,10 +549,10 @@ mod tests {
|
||||
use crate::collector::Collector;
|
||||
use crate::query::{AllQuery, Query, QueryParser};
|
||||
use crate::schema::{Field, Schema, FAST, STORED, TEXT};
|
||||
use crate::DocAddress;
|
||||
use crate::Index;
|
||||
use crate::IndexWriter;
|
||||
use crate::Score;
|
||||
use crate::{DocAddress, DocId, SegmentReader};
|
||||
|
||||
fn make_index() -> Index {
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -489,6 +592,21 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_top_collector_not_at_capacity_with_offset() {
|
||||
let index = make_index();
|
||||
let field = index.schema().get_field("text").unwrap();
|
||||
let query_parser = QueryParser::for_index(&index, vec![field]);
|
||||
let text_query = query_parser.parse_query("droopy tax").unwrap();
|
||||
let score_docs: Vec<(Score, DocAddress)> = index
|
||||
.reader()
|
||||
.unwrap()
|
||||
.searcher()
|
||||
.search(&text_query, &TopDocs::with_limit(4).and_offset(2))
|
||||
.unwrap();
|
||||
assert_eq!(score_docs, vec![(0.48527452, DocAddress(0, 0))]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_top_collector_at_capacity() {
|
||||
let index = make_index();
|
||||
@@ -510,6 +628,27 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_top_collector_at_capacity_with_offset() {
|
||||
let index = make_index();
|
||||
let field = index.schema().get_field("text").unwrap();
|
||||
let query_parser = QueryParser::for_index(&index, vec![field]);
|
||||
let text_query = query_parser.parse_query("droopy tax").unwrap();
|
||||
let score_docs: Vec<(Score, DocAddress)> = index
|
||||
.reader()
|
||||
.unwrap()
|
||||
.searcher()
|
||||
.search(&text_query, &TopDocs::with_limit(2).and_offset(1))
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
score_docs,
|
||||
vec![
|
||||
(0.5376842, DocAddress(0u32, 2)),
|
||||
(0.48527452, DocAddress(0, 0))
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_top_collector_stable_sorting() {
|
||||
let index = make_index();
|
||||
@@ -623,6 +762,50 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tweak_score_top_collector_with_offset() {
|
||||
let index = make_index();
|
||||
let field = index.schema().get_field("text").unwrap();
|
||||
let query_parser = QueryParser::for_index(&index, vec![field]);
|
||||
let text_query = query_parser.parse_query("droopy tax").unwrap();
|
||||
let collector = TopDocs::with_limit(2).and_offset(1).tweak_score(
|
||||
move |_segment_reader: &SegmentReader| move |doc: DocId, _original_score: Score| doc,
|
||||
);
|
||||
let score_docs: Vec<(u32, DocAddress)> = index
|
||||
.reader()
|
||||
.unwrap()
|
||||
.searcher()
|
||||
.search(&text_query, &collector)
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
score_docs,
|
||||
vec![(1, DocAddress(0, 1)), (0, DocAddress(0, 0)),]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_custom_score_top_collector_with_offset() {
|
||||
let index = make_index();
|
||||
let field = index.schema().get_field("text").unwrap();
|
||||
let query_parser = QueryParser::for_index(&index, vec![field]);
|
||||
let text_query = query_parser.parse_query("droopy tax").unwrap();
|
||||
let collector = TopDocs::with_limit(2)
|
||||
.and_offset(1)
|
||||
.custom_score(move |_segment_reader: &SegmentReader| move |doc: DocId| doc);
|
||||
let score_docs: Vec<(u32, DocAddress)> = index
|
||||
.reader()
|
||||
.unwrap()
|
||||
.searcher()
|
||||
.search(&text_query, &collector)
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
score_docs,
|
||||
vec![(1, DocAddress(0, 1)), (0, DocAddress(0, 0)),]
|
||||
);
|
||||
}
|
||||
|
||||
fn index(
|
||||
query: &str,
|
||||
query_field: Field,
|
||||
|
||||
@@ -14,11 +14,11 @@ where
|
||||
{
|
||||
pub fn new(
|
||||
score_tweaker: TScoreTweaker,
|
||||
limit: usize,
|
||||
collector: TopCollector<TScore>,
|
||||
) -> TweakedScoreTopCollector<TScoreTweaker, TScore> {
|
||||
TweakedScoreTopCollector {
|
||||
score_tweaker,
|
||||
collector: TopCollector::with_limit(limit),
|
||||
collector,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -29,7 +29,7 @@ where
|
||||
/// It is the segment local version of the [`ScoreTweaker`](./trait.ScoreTweaker.html).
|
||||
pub trait ScoreSegmentTweaker<TScore>: 'static {
|
||||
/// Tweak the given `score` for the document `doc`.
|
||||
fn score(&self, doc: DocId, score: Score) -> TScore;
|
||||
fn score(&mut self, doc: DocId, score: Score) -> TScore;
|
||||
}
|
||||
|
||||
/// `ScoreTweaker` makes it possible to tweak the score
|
||||
@@ -121,9 +121,9 @@ where
|
||||
|
||||
impl<F, TScore> ScoreSegmentTweaker<TScore> for F
|
||||
where
|
||||
F: 'static + Sync + Send + Fn(DocId, Score) -> TScore,
|
||||
F: 'static + FnMut(DocId, Score) -> TScore,
|
||||
{
|
||||
fn score(&self, doc: DocId, score: Score) -> TScore {
|
||||
fn score(&mut self, doc: DocId, score: Score) -> TScore {
|
||||
(self)(doc, score)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -33,6 +33,10 @@ impl TinySet {
|
||||
TinySet(0u64)
|
||||
}
|
||||
|
||||
pub fn clear(&mut self) {
|
||||
self.0 = 0u64;
|
||||
}
|
||||
|
||||
/// Returns the complement of the set in `[0, 64[`.
|
||||
fn complement(self) -> TinySet {
|
||||
TinySet(!self.0)
|
||||
@@ -43,6 +47,11 @@ impl TinySet {
|
||||
!self.intersect(TinySet::singleton(el)).is_empty()
|
||||
}
|
||||
|
||||
/// Returns the number of elements in the TinySet.
|
||||
pub fn len(self) -> u32 {
|
||||
self.0.count_ones()
|
||||
}
|
||||
|
||||
/// Returns the intersection of `self` and `other`
|
||||
pub fn intersect(self, other: TinySet) -> TinySet {
|
||||
TinySet(self.0 & other.0)
|
||||
@@ -109,22 +118,12 @@ impl TinySet {
|
||||
pub fn range_greater_or_equal(from_included: u32) -> TinySet {
|
||||
TinySet::range_lower(from_included).complement()
|
||||
}
|
||||
|
||||
pub fn clear(&mut self) {
|
||||
self.0 = 0u64;
|
||||
}
|
||||
|
||||
pub fn len(self) -> u32 {
|
||||
self.0.count_ones()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct BitSet {
|
||||
tinysets: Box<[TinySet]>,
|
||||
len: usize, //< Technically it should be u32, but we
|
||||
// count multiple inserts.
|
||||
// `usize` guards us from overflow.
|
||||
len: usize,
|
||||
max_value: u32,
|
||||
}
|
||||
|
||||
@@ -204,7 +203,7 @@ mod tests {
|
||||
|
||||
use super::BitSet;
|
||||
use super::TinySet;
|
||||
use crate::docset::DocSet;
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::query::BitSetDocSet;
|
||||
use crate::tests;
|
||||
use crate::tests::generate_nonunique_unsorted;
|
||||
@@ -278,11 +277,13 @@ mod tests {
|
||||
}
|
||||
assert_eq!(btreeset.len(), bitset.len());
|
||||
let mut bitset_docset = BitSetDocSet::from(bitset);
|
||||
let mut remaining = true;
|
||||
for el in btreeset.into_iter() {
|
||||
bitset_docset.advance();
|
||||
assert!(remaining);
|
||||
assert_eq!(bitset_docset.doc(), el);
|
||||
remaining = bitset_docset.advance() != TERMINATED;
|
||||
}
|
||||
assert!(!bitset_docset.advance());
|
||||
assert!(!remaining);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -21,7 +21,6 @@ use crate::schema::FieldType;
|
||||
use crate::schema::Schema;
|
||||
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
|
||||
use crate::IndexWriter;
|
||||
use num_cpus;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt;
|
||||
|
||||
@@ -3,9 +3,7 @@ use crate::core::SegmentId;
|
||||
use crate::schema::Schema;
|
||||
use crate::Opstamp;
|
||||
use census::{Inventory, TrackedObject};
|
||||
use serde;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt;
|
||||
use std::path::PathBuf;
|
||||
|
||||
@@ -7,7 +7,6 @@ use crate::schema::FieldType;
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::schema::Term;
|
||||
use crate::termdict::TermDictionary;
|
||||
use owned_read::OwnedRead;
|
||||
|
||||
/// The inverted index reader is in charge of accessing
|
||||
/// the inverted index associated to a specific field.
|
||||
@@ -97,8 +96,7 @@ impl InvertedIndexReader {
|
||||
let offset = term_info.postings_offset as usize;
|
||||
let end_source = self.postings_source.len();
|
||||
let postings_slice = self.postings_source.slice(offset, end_source);
|
||||
let postings_reader = OwnedRead::new(postings_slice);
|
||||
block_postings.reset(term_info.doc_freq, postings_reader);
|
||||
block_postings.reset(term_info.doc_freq, postings_slice);
|
||||
}
|
||||
|
||||
/// Returns a block postings given a `Term`.
|
||||
@@ -127,7 +125,7 @@ impl InvertedIndexReader {
|
||||
let postings_data = self.postings_source.slice_from(offset);
|
||||
BlockSegmentPostings::from_data(
|
||||
term_info.doc_freq,
|
||||
OwnedRead::new(postings_data),
|
||||
postings_data,
|
||||
self.record_option,
|
||||
requested_option,
|
||||
)
|
||||
|
||||
@@ -1,11 +1,8 @@
|
||||
use crate::collector::Collector;
|
||||
use crate::collector::SegmentCollector;
|
||||
use crate::core::Executor;
|
||||
use crate::core::InvertedIndexReader;
|
||||
use crate::core::SegmentReader;
|
||||
use crate::query::Query;
|
||||
use crate::query::Scorer;
|
||||
use crate::query::Weight;
|
||||
use crate::schema::Document;
|
||||
use crate::schema::Schema;
|
||||
use crate::schema::{Field, Term};
|
||||
@@ -17,26 +14,6 @@ use crate::Index;
|
||||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
|
||||
fn collect_segment<C: Collector>(
|
||||
collector: &C,
|
||||
weight: &dyn Weight,
|
||||
segment_ord: u32,
|
||||
segment_reader: &SegmentReader,
|
||||
) -> crate::Result<C::Fruit> {
|
||||
let mut scorer = weight.scorer(segment_reader, 1.0f32)?;
|
||||
let mut segment_collector = collector.for_segment(segment_ord as u32, segment_reader)?;
|
||||
if let Some(delete_bitset) = segment_reader.delete_bitset() {
|
||||
scorer.for_each(&mut |doc, score| {
|
||||
if delete_bitset.is_alive(doc) {
|
||||
segment_collector.collect(doc, score);
|
||||
}
|
||||
});
|
||||
} else {
|
||||
scorer.for_each(&mut |doc, score| segment_collector.collect(doc, score));
|
||||
}
|
||||
Ok(segment_collector.harvest())
|
||||
}
|
||||
|
||||
/// Holds a list of `SegmentReader`s ready for search.
|
||||
///
|
||||
/// It guarantees that the `Segment` will not be removed before
|
||||
@@ -163,12 +140,7 @@ impl Searcher {
|
||||
let segment_readers = self.segment_readers();
|
||||
let fruits = executor.map(
|
||||
|(segment_ord, segment_reader)| {
|
||||
collect_segment(
|
||||
collector,
|
||||
weight.as_ref(),
|
||||
segment_ord as u32,
|
||||
segment_reader,
|
||||
)
|
||||
collector.collect_segment(weight.as_ref(), segment_ord as u32, segment_reader)
|
||||
},
|
||||
segment_readers.iter().enumerate(),
|
||||
)?;
|
||||
|
||||
@@ -295,8 +295,8 @@ impl SegmentReader {
|
||||
}
|
||||
|
||||
/// Returns an iterator that will iterate over the alive document ids
|
||||
pub fn doc_ids_alive(&self) -> SegmentReaderAliveDocsIterator<'_> {
|
||||
SegmentReaderAliveDocsIterator::new(&self)
|
||||
pub fn doc_ids_alive<'a>(&'a self) -> impl Iterator<Item = DocId> + 'a {
|
||||
(0u32..self.max_doc).filter(move |doc| !self.is_deleted(*doc))
|
||||
}
|
||||
|
||||
/// Summarize total space usage of this segment.
|
||||
@@ -324,52 +324,6 @@ impl fmt::Debug for SegmentReader {
|
||||
}
|
||||
}
|
||||
|
||||
/// Implements the iterator trait to allow easy iteration
|
||||
/// over non-deleted ("alive") DocIds in a SegmentReader
|
||||
pub struct SegmentReaderAliveDocsIterator<'a> {
|
||||
reader: &'a SegmentReader,
|
||||
max_doc: DocId,
|
||||
current: DocId,
|
||||
}
|
||||
|
||||
impl<'a> SegmentReaderAliveDocsIterator<'a> {
|
||||
pub fn new(reader: &'a SegmentReader) -> SegmentReaderAliveDocsIterator<'a> {
|
||||
SegmentReaderAliveDocsIterator {
|
||||
reader,
|
||||
max_doc: reader.max_doc(),
|
||||
current: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for SegmentReaderAliveDocsIterator<'a> {
|
||||
type Item = DocId;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
// TODO: Use TinySet (like in BitSetDocSet) to speed this process up
|
||||
if self.current >= self.max_doc {
|
||||
return None;
|
||||
}
|
||||
|
||||
// find the next alive doc id
|
||||
while self.reader.is_deleted(self.current) {
|
||||
self.current += 1;
|
||||
|
||||
if self.current >= self.max_doc {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
||||
// capture the current alive DocId
|
||||
let result = Some(self.current);
|
||||
|
||||
// move down the chain
|
||||
self.current += 1;
|
||||
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use crate::core::Index;
|
||||
|
||||
@@ -11,7 +11,6 @@ use crate::error::DataCorruption;
|
||||
use crate::Directory;
|
||||
|
||||
use crc32fast::Hasher;
|
||||
use serde_json;
|
||||
use std::collections::HashSet;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
|
||||
@@ -1,10 +1,3 @@
|
||||
use fs2;
|
||||
use notify;
|
||||
|
||||
use self::fs2::FileExt;
|
||||
use self::notify::RawEvent;
|
||||
use self::notify::RecursiveMode;
|
||||
use self::notify::Watcher;
|
||||
use crate::core::META_FILEPATH;
|
||||
use crate::directory::error::LockError;
|
||||
use crate::directory::error::{
|
||||
@@ -20,8 +13,11 @@ use crate::directory::WatchCallback;
|
||||
use crate::directory::WatchCallbackList;
|
||||
use crate::directory::WatchHandle;
|
||||
use crate::directory::{TerminatingWrite, WritePtr};
|
||||
use atomicwrites;
|
||||
use fs2::FileExt;
|
||||
use memmap::Mmap;
|
||||
use notify::RawEvent;
|
||||
use notify::RecursiveMode;
|
||||
use notify::Watcher;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::convert::From;
|
||||
@@ -224,17 +220,13 @@ struct MmapDirectoryInner {
|
||||
}
|
||||
|
||||
impl MmapDirectoryInner {
|
||||
fn new(
|
||||
root_path: PathBuf,
|
||||
temp_directory: Option<TempDir>,
|
||||
) -> Result<MmapDirectoryInner, OpenDirectoryError> {
|
||||
let mmap_directory_inner = MmapDirectoryInner {
|
||||
fn new(root_path: PathBuf, temp_directory: Option<TempDir>) -> MmapDirectoryInner {
|
||||
MmapDirectoryInner {
|
||||
root_path,
|
||||
mmap_cache: Default::default(),
|
||||
_temp_directory: temp_directory,
|
||||
watcher: RwLock::new(None),
|
||||
};
|
||||
Ok(mmap_directory_inner)
|
||||
}
|
||||
}
|
||||
|
||||
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
|
||||
@@ -268,14 +260,11 @@ impl fmt::Debug for MmapDirectory {
|
||||
}
|
||||
|
||||
impl MmapDirectory {
|
||||
fn new(
|
||||
root_path: PathBuf,
|
||||
temp_directory: Option<TempDir>,
|
||||
) -> Result<MmapDirectory, OpenDirectoryError> {
|
||||
let inner = MmapDirectoryInner::new(root_path, temp_directory)?;
|
||||
Ok(MmapDirectory {
|
||||
fn new(root_path: PathBuf, temp_directory: Option<TempDir>) -> MmapDirectory {
|
||||
let inner = MmapDirectoryInner::new(root_path, temp_directory);
|
||||
MmapDirectory {
|
||||
inner: Arc::new(inner),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new MmapDirectory in a temporary directory.
|
||||
@@ -285,7 +274,7 @@ impl MmapDirectory {
|
||||
pub fn create_from_tempdir() -> Result<MmapDirectory, OpenDirectoryError> {
|
||||
let tempdir = TempDir::new().map_err(OpenDirectoryError::IoError)?;
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
MmapDirectory::new(tempdir_path, Some(tempdir))
|
||||
Ok(MmapDirectory::new(tempdir_path, Some(tempdir)))
|
||||
}
|
||||
|
||||
/// Opens a MmapDirectory in a directory.
|
||||
@@ -303,7 +292,7 @@ impl MmapDirectory {
|
||||
directory_path,
|
||||
)))
|
||||
} else {
|
||||
Ok(MmapDirectory::new(PathBuf::from(directory_path), None)?)
|
||||
Ok(MmapDirectory::new(PathBuf::from(directory_path), None))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
130
src/docset.rs
130
src/docset.rs
@@ -1,58 +1,47 @@
|
||||
use crate::common::BitSet;
|
||||
use crate::fastfield::DeleteBitSet;
|
||||
use crate::DocId;
|
||||
use std::borrow::Borrow;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
/// Expresses the outcome of a call to `DocSet`'s `.skip_next(...)`.
|
||||
#[derive(PartialEq, Eq, Debug)]
|
||||
pub enum SkipResult {
|
||||
/// target was in the docset
|
||||
Reached,
|
||||
/// target was not in the docset, skipping stopped as a greater element was found
|
||||
OverStep,
|
||||
/// the docset was entirely consumed without finding the target, nor any
|
||||
/// element greater than the target.
|
||||
End,
|
||||
}
|
||||
/// Sentinel value returned when a DocSet has been entirely consumed.
|
||||
///
|
||||
/// This is not u32::MAX as one would have expected, due to the lack of SSE2 instructions
|
||||
/// to compare [u32; 4].
|
||||
pub const TERMINATED: DocId = std::i32::MAX as u32;
|
||||
|
||||
/// Represents an iterable set of sorted doc ids.
|
||||
pub trait DocSet {
|
||||
/// Goes to the next element.
|
||||
/// `.advance(...)` needs to be called a first time to point to the correct
|
||||
/// element.
|
||||
fn advance(&mut self) -> bool;
|
||||
///
|
||||
/// The DocId of the next element is returned.
|
||||
/// In other words we should always have :
|
||||
/// ```ignore
|
||||
/// let doc = docset.advance();
|
||||
/// assert_eq!(doc, docset.doc());
|
||||
/// ```
|
||||
///
|
||||
/// If we reached the end of the DocSet, TERMINATED should be returned.
|
||||
///
|
||||
/// Calling `.advance()` on a terminated DocSet should be supported, and TERMINATED should
|
||||
/// be returned.
|
||||
/// TODO Test existing docsets.
|
||||
fn advance(&mut self) -> DocId;
|
||||
|
||||
/// After skipping, position the iterator in such a way that `.doc()`
|
||||
/// will return a value greater than or equal to target.
|
||||
/// Advances the DocSet forward until reaching the target, or going to the
|
||||
/// lowest DocId greater than the target.
|
||||
///
|
||||
/// SkipResult expresses whether the `target value` was reached, overstepped,
|
||||
/// or if the `DocSet` was entirely consumed without finding any value
|
||||
/// greater or equal to the `target`.
|
||||
/// If the end of the DocSet is reached, TERMINATED is returned.
|
||||
///
|
||||
/// WARNING: Calling skip always advances the docset.
|
||||
/// More specifically, if the docset is already positioned on the target
|
||||
/// skipping will advance to the next position and return SkipResult::Overstep.
|
||||
/// Calling `.seek(target)` on a terminated DocSet is legal. Implementations
|
||||
/// of DocSet should support it.
|
||||
///
|
||||
/// If `.skip_next()` oversteps, then the docset must be positioned correctly
|
||||
/// on an existing document. In other words, `.doc()` should return the first document
|
||||
/// greater than `target`.
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
if !self.advance() {
|
||||
return SkipResult::End;
|
||||
}
|
||||
loop {
|
||||
match self.doc().cmp(&target) {
|
||||
Ordering::Less => {
|
||||
if !self.advance() {
|
||||
return SkipResult::End;
|
||||
}
|
||||
}
|
||||
Ordering::Equal => return SkipResult::Reached,
|
||||
Ordering::Greater => return SkipResult::OverStep,
|
||||
}
|
||||
/// Calling `seek(TERMINATED)` is also legal and is the normal way to consume a DocSet.
|
||||
fn seek(&mut self, target: DocId) -> DocId {
|
||||
let mut doc = self.doc();
|
||||
while doc < target {
|
||||
doc = self.advance();
|
||||
}
|
||||
doc
|
||||
}
|
||||
|
||||
/// Fills a given mutable buffer with the next doc ids from the
|
||||
@@ -71,38 +60,38 @@ pub trait DocSet {
|
||||
/// use case where batching. The normal way to
|
||||
/// go through the `DocId`'s is to call `.advance()`.
|
||||
fn fill_buffer(&mut self, buffer: &mut [DocId]) -> usize {
|
||||
if self.doc() == TERMINATED {
|
||||
return 0;
|
||||
}
|
||||
for (i, buffer_val) in buffer.iter_mut().enumerate() {
|
||||
if self.advance() {
|
||||
*buffer_val = self.doc();
|
||||
} else {
|
||||
return i;
|
||||
*buffer_val = self.doc();
|
||||
if self.advance() == TERMINATED {
|
||||
return i + 1;
|
||||
}
|
||||
}
|
||||
buffer.len()
|
||||
}
|
||||
|
||||
/// Returns the current document
|
||||
/// Right after creating a new DocSet, the docset points to the first document.
|
||||
///
|
||||
/// If the DocSet is empty, .doc() should return `TERMINATED`.
|
||||
fn doc(&self) -> DocId;
|
||||
|
||||
/// Returns a best-effort hint of the
|
||||
/// length of the docset.
|
||||
fn size_hint(&self) -> u32;
|
||||
|
||||
/// Appends all docs to a `bitset`.
|
||||
fn append_to_bitset(&mut self, bitset: &mut BitSet) {
|
||||
while self.advance() {
|
||||
bitset.insert(self.doc());
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the number of documents matching.
|
||||
/// Calling this method consumes the `DocSet`.
|
||||
fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
|
||||
let mut count = 0u32;
|
||||
while self.advance() {
|
||||
if !delete_bitset.is_deleted(self.doc()) {
|
||||
let mut doc = self.doc();
|
||||
while doc != TERMINATED {
|
||||
if !delete_bitset.is_deleted(doc) {
|
||||
count += 1u32;
|
||||
}
|
||||
doc = self.advance();
|
||||
}
|
||||
count
|
||||
}
|
||||
@@ -114,22 +103,42 @@ pub trait DocSet {
|
||||
/// given by `count()`.
|
||||
fn count_including_deleted(&mut self) -> u32 {
|
||||
let mut count = 0u32;
|
||||
while self.advance() {
|
||||
let mut doc = self.doc();
|
||||
while doc != TERMINATED {
|
||||
count += 1u32;
|
||||
doc = self.advance();
|
||||
}
|
||||
count
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> DocSet for &'a mut dyn DocSet {
|
||||
fn advance(&mut self) -> u32 {
|
||||
(**self).advance()
|
||||
}
|
||||
|
||||
fn seek(&mut self, target: DocId) -> DocId {
|
||||
(**self).seek(target)
|
||||
}
|
||||
|
||||
fn doc(&self) -> u32 {
|
||||
(**self).doc()
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
(**self).size_hint()
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
|
||||
fn advance(&mut self) -> bool {
|
||||
fn advance(&mut self) -> DocId {
|
||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
||||
unboxed.advance()
|
||||
}
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
fn seek(&mut self, target: DocId) -> DocId {
|
||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
||||
unboxed.skip_next(target)
|
||||
unboxed.seek(target)
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
@@ -151,9 +160,4 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
|
||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
||||
unboxed.count_including_deleted()
|
||||
}
|
||||
|
||||
fn append_to_bitset(&mut self, bitset: &mut BitSet) {
|
||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
||||
unboxed.append_to_bitset(bitset);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,7 +7,6 @@ use crate::directory::error::{Incompatibility, LockError};
|
||||
use crate::fastfield::FastFieldNotAvailableError;
|
||||
use crate::query;
|
||||
use crate::schema;
|
||||
use serde_json;
|
||||
use std::fmt;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::PoisonError;
|
||||
|
||||
@@ -10,7 +10,7 @@ use crate::core::SegmentMeta;
|
||||
use crate::core::SegmentReader;
|
||||
use crate::directory::TerminatingWrite;
|
||||
use crate::directory::{DirectoryLock, GarbageCollectionResult};
|
||||
use crate::docset::DocSet;
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::error::TantivyError;
|
||||
use crate::fastfield::write_delete_bitset;
|
||||
use crate::indexer::delete_queue::{DeleteCursor, DeleteQueue};
|
||||
@@ -112,15 +112,15 @@ fn compute_deleted_bitset(
|
||||
if let Some(mut docset) =
|
||||
inverted_index.read_postings(&delete_op.term, IndexRecordOption::Basic)
|
||||
{
|
||||
while docset.advance() {
|
||||
let deleted_doc = docset.doc();
|
||||
let mut deleted_doc = docset.doc();
|
||||
while deleted_doc != TERMINATED {
|
||||
if deleted_doc < limit_doc {
|
||||
delete_bitset.insert(deleted_doc);
|
||||
might_have_changed = true;
|
||||
}
|
||||
deleted_doc = docset.advance();
|
||||
}
|
||||
}
|
||||
|
||||
delete_cursor.advance();
|
||||
}
|
||||
Ok(might_have_changed)
|
||||
@@ -346,7 +346,7 @@ impl IndexWriter {
|
||||
|
||||
fn drop_sender(&mut self) {
|
||||
let (sender, _receiver) = channel::bounded(1);
|
||||
mem::replace(&mut self.operation_sender, sender);
|
||||
self.operation_sender = sender;
|
||||
}
|
||||
|
||||
/// If there are some merging threads, blocks until they all finish their work and
|
||||
|
||||
@@ -54,10 +54,6 @@ impl LogMergePolicy {
|
||||
|
||||
impl MergePolicy for LogMergePolicy {
|
||||
fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate> {
|
||||
if segments.is_empty() {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
let mut size_sorted_tuples = segments
|
||||
.iter()
|
||||
.map(SegmentMeta::num_docs)
|
||||
@@ -67,27 +63,35 @@ impl MergePolicy for LogMergePolicy {
|
||||
|
||||
size_sorted_tuples.sort_by(|x, y| y.1.cmp(&(x.1)));
|
||||
|
||||
if size_sorted_tuples.len() <= 1 {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
let size_sorted_log_tuples: Vec<_> = size_sorted_tuples
|
||||
.into_iter()
|
||||
.map(|(ind, num_docs)| (ind, f64::from(self.clip_min_size(num_docs)).log2()))
|
||||
.collect();
|
||||
|
||||
let (first_ind, first_score) = size_sorted_log_tuples[0];
|
||||
let mut current_max_log_size = first_score;
|
||||
let mut levels = vec![vec![first_ind]];
|
||||
for &(ind, score) in (&size_sorted_log_tuples).iter().skip(1) {
|
||||
if score < (current_max_log_size - self.level_log_size) {
|
||||
current_max_log_size = score;
|
||||
levels.push(Vec::new());
|
||||
if let Some(&(first_ind, first_score)) = size_sorted_log_tuples.first() {
|
||||
let mut current_max_log_size = first_score;
|
||||
let mut levels = vec![vec![first_ind]];
|
||||
for &(ind, score) in (&size_sorted_log_tuples).iter().skip(1) {
|
||||
if score < (current_max_log_size - self.level_log_size) {
|
||||
current_max_log_size = score;
|
||||
levels.push(Vec::new());
|
||||
}
|
||||
levels.last_mut().unwrap().push(ind);
|
||||
}
|
||||
levels.last_mut().unwrap().push(ind);
|
||||
levels
|
||||
.iter()
|
||||
.filter(|level| level.len() >= self.min_merge_size)
|
||||
.map(|ind_vec| {
|
||||
MergeCandidate(ind_vec.iter().map(|&ind| segments[ind].id()).collect())
|
||||
})
|
||||
.collect()
|
||||
} else {
|
||||
return vec![];
|
||||
}
|
||||
|
||||
levels
|
||||
.iter()
|
||||
.filter(|level| level.len() >= self.min_merge_size)
|
||||
.map(|ind_vec| MergeCandidate(ind_vec.iter().map(|&ind| segments[ind].id()).collect()))
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -179,6 +183,7 @@ mod tests {
|
||||
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
|
||||
assert_eq!(result_list.len(), 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_log_merge_policy_small_segments() {
|
||||
// segments under min_layer_size are merged together
|
||||
@@ -194,6 +199,17 @@ mod tests {
|
||||
assert_eq!(result_list.len(), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_log_merge_policy_all_segments_too_large_to_merge() {
|
||||
let eight_large_segments: Vec<SegmentMeta> =
|
||||
std::iter::repeat_with(|| create_random_segment_meta(100_001))
|
||||
.take(8)
|
||||
.collect();
|
||||
assert!(test_merge_policy()
|
||||
.compute_merge_candidates(&eight_large_segments)
|
||||
.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_large_merge_segments() {
|
||||
let test_input = vec![
|
||||
|
||||
@@ -2,7 +2,7 @@ use crate::common::MAX_DOC_LIMIT;
|
||||
use crate::core::Segment;
|
||||
use crate::core::SegmentReader;
|
||||
use crate::core::SerializableSegment;
|
||||
use crate::docset::DocSet;
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::fastfield::BytesFastFieldReader;
|
||||
use crate::fastfield::DeleteBitSet;
|
||||
use crate::fastfield::FastFieldReader;
|
||||
@@ -574,10 +574,12 @@ impl IndexMerger {
|
||||
let inverted_index = segment_reader.inverted_index(indexed_field);
|
||||
let mut segment_postings = inverted_index
|
||||
.read_postings_from_terminfo(term_info, segment_postings_option);
|
||||
while segment_postings.advance() {
|
||||
if !segment_reader.is_deleted(segment_postings.doc()) {
|
||||
let mut doc = segment_postings.doc();
|
||||
while doc != TERMINATED {
|
||||
if !segment_reader.is_deleted(doc) {
|
||||
return Some((segment_ord, segment_postings));
|
||||
}
|
||||
doc = segment_postings.advance();
|
||||
}
|
||||
None
|
||||
})
|
||||
@@ -604,17 +606,9 @@ impl IndexMerger {
|
||||
// postings serializer.
|
||||
for (segment_ord, mut segment_postings) in segment_postings {
|
||||
let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
|
||||
loop {
|
||||
let doc = segment_postings.doc();
|
||||
|
||||
// `.advance()` has been called once before the loop.
|
||||
//
|
||||
// It was required to make sure we only consider segments
|
||||
// that effectively contain at least one non-deleted document
|
||||
// and remove terms that do not have documents associated.
|
||||
//
|
||||
// For this reason, we cannot use a `while segment_postings.advance()` loop.
|
||||
|
||||
let mut doc = segment_postings.doc();
|
||||
while doc != TERMINATED {
|
||||
// deleted docs are skipped as they do not have a `remapped_doc_id`.
|
||||
if let Some(remapped_doc_id) = old_to_new_doc_id[doc as usize] {
|
||||
// we make sure to only write the term iff
|
||||
@@ -629,9 +623,8 @@ impl IndexMerger {
|
||||
delta_positions,
|
||||
)?;
|
||||
}
|
||||
if !segment_postings.advance() {
|
||||
break;
|
||||
}
|
||||
|
||||
doc = segment_postings.advance();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -23,7 +23,6 @@ use futures::channel::oneshot;
|
||||
use futures::executor::{ThreadPool, ThreadPoolBuilder};
|
||||
use futures::future::Future;
|
||||
use futures::future::TryFutureExt;
|
||||
use serde_json;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::collections::HashSet;
|
||||
use std::io::Write;
|
||||
|
||||
41
src/lib.rs
41
src/lib.rs
@@ -156,7 +156,7 @@ mod snippet;
|
||||
pub use self::snippet::{Snippet, SnippetGenerator};
|
||||
|
||||
mod docset;
|
||||
pub use self::docset::{DocSet, SkipResult};
|
||||
pub use self::docset::{DocSet, TERMINATED};
|
||||
pub use crate::common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
|
||||
pub use crate::core::{Executor, SegmentComponent};
|
||||
pub use crate::core::{Index, IndexMeta, Searcher, Segment, SegmentId, SegmentMeta};
|
||||
@@ -285,7 +285,7 @@ mod tests {
|
||||
|
||||
use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
|
||||
use crate::core::SegmentReader;
|
||||
use crate::docset::DocSet;
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::query::BooleanQuery;
|
||||
use crate::schema::*;
|
||||
use crate::DocAddress;
|
||||
@@ -381,19 +381,12 @@ mod tests {
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
{
|
||||
{
|
||||
let doc = doc!(text_field=>"a");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{
|
||||
let doc = doc!(text_field=>"a a");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
index_writer.add_document(doc!(text_field=>"a"));
|
||||
index_writer.add_document(doc!(text_field=>"a a"));
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
{
|
||||
let doc = doc!(text_field=>"c");
|
||||
index_writer.add_document(doc);
|
||||
index_writer.add_document(doc!(text_field=>"c"));
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
{
|
||||
@@ -472,10 +465,12 @@ mod tests {
|
||||
}
|
||||
|
||||
fn advance_undeleted(docset: &mut dyn DocSet, reader: &SegmentReader) -> bool {
|
||||
while docset.advance() {
|
||||
if !reader.is_deleted(docset.doc()) {
|
||||
let mut doc = docset.advance();
|
||||
while doc != TERMINATED {
|
||||
if !reader.is_deleted(doc) {
|
||||
return true;
|
||||
}
|
||||
doc = docset.advance();
|
||||
}
|
||||
false
|
||||
}
|
||||
@@ -641,9 +636,8 @@ mod tests {
|
||||
.inverted_index(term.field())
|
||||
.read_postings(&term, IndexRecordOption::Basic)
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 0);
|
||||
assert!(!postings.advance());
|
||||
assert_eq!(postings.advance(), TERMINATED);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -665,9 +659,8 @@ mod tests {
|
||||
.inverted_index(term.field())
|
||||
.read_postings(&term, IndexRecordOption::Basic)
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 0);
|
||||
assert!(!postings.advance());
|
||||
assert_eq!(postings.advance(), TERMINATED);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -689,9 +682,8 @@ mod tests {
|
||||
.inverted_index(term.field())
|
||||
.read_postings(&term, IndexRecordOption::Basic)
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 0);
|
||||
assert!(!postings.advance());
|
||||
assert_eq!(postings.advance(), TERMINATED);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -760,10 +752,8 @@ mod tests {
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
{
|
||||
let doc = doc!(text_field=>"af af af bc bc");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
let doc = doc!(text_field=>"af af af bc bc");
|
||||
index_writer.add_document(doc);
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
{
|
||||
@@ -779,10 +769,9 @@ mod tests {
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_af, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 0);
|
||||
assert_eq!(postings.term_freq(), 3);
|
||||
assert!(!postings.advance());
|
||||
assert_eq!(postings.advance(), TERMINATED);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -37,9 +37,9 @@ const LONG_SKIP_INTERVAL: u64 = (LONG_SKIP_IN_BLOCKS * COMPRESSION_BLOCK_SIZE) a
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
|
||||
use super::{PositionReader, PositionSerializer};
|
||||
use super::PositionSerializer;
|
||||
use crate::directory::ReadOnlySource;
|
||||
use crate::positions::COMPRESSION_BLOCK_SIZE;
|
||||
use crate::positions::reader::PositionReader;
|
||||
use std::iter;
|
||||
|
||||
fn create_stream_buffer(vals: &[u32]) -> (ReadOnlySource, ReadOnlySource) {
|
||||
@@ -68,7 +68,7 @@ pub mod tests {
|
||||
let mut position_reader = PositionReader::new(stream, skip, 0u64);
|
||||
for &n in &[1, 10, 127, 128, 130, 312] {
|
||||
let mut v = vec![0u32; n];
|
||||
position_reader.read(&mut v[..n]);
|
||||
position_reader.read(0, &mut v[..]);
|
||||
for i in 0..n {
|
||||
assert_eq!(v[i], i as u32);
|
||||
}
|
||||
@@ -76,19 +76,19 @@ pub mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_position_skip() {
|
||||
let v: Vec<u32> = (0..1_000).collect();
|
||||
fn test_position_read_with_offset() {
|
||||
let v: Vec<u32> = (0..1000).collect();
|
||||
let (stream, skip) = create_stream_buffer(&v[..]);
|
||||
assert_eq!(skip.len(), 12);
|
||||
assert_eq!(stream.len(), 1168);
|
||||
|
||||
let mut position_reader = PositionReader::new(stream, skip, 0u64);
|
||||
position_reader.skip(10);
|
||||
for &n in &[10, 127, COMPRESSION_BLOCK_SIZE, 130, 312] {
|
||||
let mut v = vec![0u32; n];
|
||||
position_reader.read(&mut v[..n]);
|
||||
for i in 0..n {
|
||||
assert_eq!(v[i], 10u32 + i as u32);
|
||||
for &offset in &[1u64, 10u64, 127u64, 128u64, 130u64, 312u64] {
|
||||
for &len in &[1, 10, 130, 500] {
|
||||
let mut v = vec![0u32; len];
|
||||
position_reader.read(offset, &mut v[..]);
|
||||
for i in 0..len {
|
||||
assert_eq!(v[i], i as u32 + offset as u32);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -103,11 +103,12 @@ pub mod tests {
|
||||
let mut position_reader = PositionReader::new(stream, skip, 0u64);
|
||||
let mut buf = [0u32; 7];
|
||||
let mut c = 0;
|
||||
|
||||
let mut offset = 0;
|
||||
for _ in 0..100 {
|
||||
position_reader.read(&mut buf);
|
||||
position_reader.read(&mut buf);
|
||||
position_reader.skip(4);
|
||||
position_reader.skip(3);
|
||||
position_reader.read(offset, &mut buf);
|
||||
position_reader.read(offset, &mut buf);
|
||||
offset += 7;
|
||||
for &el in &buf {
|
||||
assert_eq!(c, el);
|
||||
c += 1;
|
||||
@@ -115,6 +116,58 @@ pub mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_position_reread_anchor_different_than_block() {
|
||||
let v: Vec<u32> = (0..2_000_000).collect();
|
||||
let (stream, skip) = create_stream_buffer(&v[..]);
|
||||
assert_eq!(skip.len(), 15_749);
|
||||
assert_eq!(stream.len(), 4_987_872);
|
||||
let mut position_reader = PositionReader::new(stream.clone(), skip.clone(), 0);
|
||||
let mut buf = [0u32; 256];
|
||||
position_reader.read(128, &mut buf);
|
||||
for i in 0..256 {
|
||||
assert_eq!(buf[i], (128 + i) as u32);
|
||||
}
|
||||
position_reader.read(128, &mut buf);
|
||||
for i in 0..256 {
|
||||
assert_eq!(buf[i], (128 + i) as u32);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic(expected = "offset arguments should be increasing.")]
|
||||
fn test_position_panic_if_called_previous_anchor() {
|
||||
let v: Vec<u32> = (0..2_000_000).collect();
|
||||
let (stream, skip) = create_stream_buffer(&v[..]);
|
||||
assert_eq!(skip.len(), 15_749);
|
||||
assert_eq!(stream.len(), 4_987_872);
|
||||
let mut buf = [0u32; 1];
|
||||
let mut position_reader = PositionReader::new(stream.clone(), skip.clone(), 200_000);
|
||||
position_reader.read(230, &mut buf);
|
||||
position_reader.read(9, &mut buf);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_positions_bug() {
|
||||
let mut v: Vec<u32> = vec![];
|
||||
for i in 1..200 {
|
||||
for j in 0..i {
|
||||
v.push(j);
|
||||
}
|
||||
}
|
||||
let (stream, skip) = create_stream_buffer(&v[..]);
|
||||
let mut buf = Vec::new();
|
||||
let mut position_reader = PositionReader::new(stream.clone(), skip.clone(), 0);
|
||||
let mut offset = 0;
|
||||
for i in 1..24 {
|
||||
buf.resize(i, 0);
|
||||
position_reader.read(offset, &mut buf[..]);
|
||||
offset += i as u64;
|
||||
let r: Vec<u32> = (0..i).map(|el| el as u32).collect();
|
||||
assert_eq!(buf, &r[..]);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_position_long_skip_const() {
|
||||
const CONST_VAL: u32 = 9u32;
|
||||
@@ -124,7 +177,7 @@ pub mod tests {
|
||||
assert_eq!(stream.len(), 1_000_000);
|
||||
let mut position_reader = PositionReader::new(stream, skip, 128 * 1024);
|
||||
let mut buf = [0u32; 1];
|
||||
position_reader.read(&mut buf);
|
||||
position_reader.read(0, &mut buf);
|
||||
assert_eq!(buf[0], CONST_VAL);
|
||||
}
|
||||
|
||||
@@ -143,7 +196,7 @@ pub mod tests {
|
||||
] {
|
||||
let mut position_reader = PositionReader::new(stream.clone(), skip.clone(), offset);
|
||||
let mut buf = [0u32; 1];
|
||||
position_reader.read(&mut buf);
|
||||
position_reader.read(0, &mut buf);
|
||||
assert_eq!(buf[0], offset as u32);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,6 @@ use crate::directory::ReadOnlySource;
|
||||
use crate::positions::COMPRESSION_BLOCK_SIZE;
|
||||
use crate::positions::LONG_SKIP_INTERVAL;
|
||||
use crate::positions::LONG_SKIP_IN_BLOCKS;
|
||||
use crate::postings::compression::compressed_block_size;
|
||||
/// Positions work as a long sequence of compressed blocks.
|
||||
/// All terms are chained one after the other.
|
||||
///
|
||||
@@ -62,22 +61,20 @@ impl Positions {
|
||||
|
||||
fn reader(&self, offset: u64) -> PositionReader {
|
||||
let long_skip_id = (offset / LONG_SKIP_INTERVAL) as usize;
|
||||
let small_skip = (offset % LONG_SKIP_INTERVAL) as usize;
|
||||
let offset_num_bytes: u64 = self.long_skip(long_skip_id);
|
||||
let mut position_read = OwnedRead::new(self.position_source.clone());
|
||||
position_read.advance(offset_num_bytes as usize);
|
||||
let mut skip_read = OwnedRead::new(self.skip_source.clone());
|
||||
skip_read.advance(long_skip_id * LONG_SKIP_IN_BLOCKS);
|
||||
let mut position_reader = PositionReader {
|
||||
PositionReader {
|
||||
bit_packer: self.bit_packer,
|
||||
skip_read,
|
||||
position_read,
|
||||
inner_offset: 0,
|
||||
buffer: Box::new([0u32; 128]),
|
||||
ahead: None,
|
||||
};
|
||||
position_reader.skip(small_skip);
|
||||
position_reader
|
||||
block_offset: std::i64::MAX as u64,
|
||||
anchor_offset: (long_skip_id as u64) * LONG_SKIP_INTERVAL,
|
||||
abs_offset: offset,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -85,51 +82,12 @@ pub struct PositionReader {
|
||||
skip_read: OwnedRead,
|
||||
position_read: OwnedRead,
|
||||
bit_packer: BitPacker4x,
|
||||
inner_offset: usize,
|
||||
buffer: Box<[u32; 128]>,
|
||||
ahead: Option<usize>, // if None, no block is loaded.
|
||||
// if Some(num_blocks), the block currently loaded is num_blocks ahead
|
||||
// of the block of the next int to read.
|
||||
}
|
||||
buffer: Box<[u32; COMPRESSION_BLOCK_SIZE]>,
|
||||
|
||||
// `ahead` represents the offset of the block currently loaded
|
||||
// compared to the cursor of the actual stream.
|
||||
//
|
||||
// By contract, when this function is called, the current block has to be
|
||||
// decompressed.
|
||||
//
|
||||
// If the requested number of els ends exactly at a given block, the next
|
||||
// block is not decompressed.
|
||||
fn read_impl(
|
||||
bit_packer: BitPacker4x,
|
||||
mut position: &[u8],
|
||||
buffer: &mut [u32; 128],
|
||||
mut inner_offset: usize,
|
||||
num_bits: &[u8],
|
||||
output: &mut [u32],
|
||||
) -> usize {
|
||||
let mut output_start = 0;
|
||||
let mut output_len = output.len();
|
||||
let mut ahead = 0;
|
||||
loop {
|
||||
let available_len = COMPRESSION_BLOCK_SIZE - inner_offset;
|
||||
// We have enough elements in the current block.
|
||||
// Let's copy the requested elements in the output buffer,
|
||||
// and return.
|
||||
if output_len <= available_len {
|
||||
output[output_start..].copy_from_slice(&buffer[inner_offset..][..output_len]);
|
||||
return ahead;
|
||||
}
|
||||
output[output_start..][..available_len].copy_from_slice(&buffer[inner_offset..]);
|
||||
output_len -= available_len;
|
||||
output_start += available_len;
|
||||
inner_offset = 0;
|
||||
let num_bits = num_bits[ahead];
|
||||
bit_packer.decompress(position, &mut buffer[..], num_bits);
|
||||
let block_len = compressed_block_size(num_bits);
|
||||
position = &position[block_len..];
|
||||
ahead += 1;
|
||||
}
|
||||
block_offset: u64,
|
||||
anchor_offset: u64,
|
||||
|
||||
abs_offset: u64,
|
||||
}
|
||||
|
||||
impl PositionReader {
|
||||
@@ -141,57 +99,65 @@ impl PositionReader {
|
||||
Positions::new(position_source, skip_source).reader(offset)
|
||||
}
|
||||
|
||||
/// Fills a buffer with the next `output.len()` integers.
|
||||
/// This does not consume / advance the stream.
|
||||
pub fn read(&mut self, output: &mut [u32]) {
|
||||
let skip_data = self.skip_read.as_ref();
|
||||
let position_data = self.position_read.as_ref();
|
||||
let num_bits = self.skip_read.get(0);
|
||||
if self.ahead != Some(0) {
|
||||
// the block currently available is not the block
|
||||
// for the current position
|
||||
fn advance_num_blocks(&mut self, num_blocks: usize) {
|
||||
let num_bits: usize = self.skip_read.as_ref()[..num_blocks]
|
||||
.iter()
|
||||
.cloned()
|
||||
.map(|num_bits| num_bits as usize)
|
||||
.sum();
|
||||
let num_bytes_to_skip = num_bits * COMPRESSION_BLOCK_SIZE / 8;
|
||||
self.skip_read.advance(num_blocks as usize);
|
||||
self.position_read.advance(num_bytes_to_skip);
|
||||
}
|
||||
|
||||
/// Fills a buffer with the integers at positions `[offset..offset+output.len())`.
|
||||
///
|
||||
/// `offset` is required to have a value >= to the offsets given in previous calls
|
||||
/// for the given `PositionReader` instance.
|
||||
pub fn read(&mut self, mut offset: u64, mut output: &mut [u32]) {
|
||||
offset += self.abs_offset;
|
||||
assert!(
|
||||
offset >= self.anchor_offset,
|
||||
"offset arguments should be increasing."
|
||||
);
|
||||
let delta_to_block_offset = offset as i64 - self.block_offset as i64;
|
||||
if delta_to_block_offset < 0 || delta_to_block_offset >= 128 {
|
||||
// The first position is not within the first block.
|
||||
// We need to decompress the first block.
|
||||
let delta_to_anchor_offset = offset - self.anchor_offset;
|
||||
let num_blocks_to_skip =
|
||||
(delta_to_anchor_offset / (COMPRESSION_BLOCK_SIZE as u64)) as usize;
|
||||
self.advance_num_blocks(num_blocks_to_skip);
|
||||
self.anchor_offset = offset - (offset % COMPRESSION_BLOCK_SIZE as u64);
|
||||
self.block_offset = self.anchor_offset;
|
||||
let num_bits = self.skip_read.get(0);
|
||||
self.bit_packer
|
||||
.decompress(self.position_read.as_ref(), self.buffer.as_mut(), num_bits);
|
||||
} else {
|
||||
let num_blocks_to_skip =
|
||||
((self.block_offset - self.anchor_offset) / COMPRESSION_BLOCK_SIZE as u64) as usize;
|
||||
self.advance_num_blocks(num_blocks_to_skip);
|
||||
self.anchor_offset = self.block_offset;
|
||||
}
|
||||
|
||||
let mut num_bits = self.skip_read.get(0);
|
||||
let mut position_data = self.position_read.as_ref();
|
||||
|
||||
for i in 1.. {
|
||||
let offset_in_block = (offset as usize) % COMPRESSION_BLOCK_SIZE;
|
||||
let remaining_in_block = COMPRESSION_BLOCK_SIZE - offset_in_block;
|
||||
if remaining_in_block >= output.len() {
|
||||
output.copy_from_slice(&self.buffer[offset_in_block..][..output.len()]);
|
||||
break;
|
||||
}
|
||||
output[..remaining_in_block].copy_from_slice(&self.buffer[offset_in_block..]);
|
||||
output = &mut output[remaining_in_block..];
|
||||
offset += remaining_in_block as u64;
|
||||
position_data = &position_data[(num_bits as usize * COMPRESSION_BLOCK_SIZE / 8)..];
|
||||
num_bits = self.skip_read.get(i);
|
||||
self.bit_packer
|
||||
.decompress(position_data, self.buffer.as_mut(), num_bits);
|
||||
self.ahead = Some(0);
|
||||
self.block_offset += COMPRESSION_BLOCK_SIZE as u64;
|
||||
}
|
||||
let block_len = compressed_block_size(num_bits);
|
||||
self.ahead = Some(read_impl(
|
||||
self.bit_packer,
|
||||
&position_data[block_len..],
|
||||
self.buffer.as_mut(),
|
||||
self.inner_offset,
|
||||
&skip_data[1..],
|
||||
output,
|
||||
));
|
||||
}
|
||||
|
||||
/// Skip the next `skip_len` integer.
|
||||
///
|
||||
/// If a full block is skipped, calling
|
||||
/// `.skip(...)` will avoid decompressing it.
|
||||
///
|
||||
/// May panic if the end of the stream is reached.
|
||||
pub fn skip(&mut self, skip_len: usize) {
|
||||
let skip_len_plus_inner_offset = skip_len + self.inner_offset;
|
||||
|
||||
let num_blocks_to_advance = skip_len_plus_inner_offset / COMPRESSION_BLOCK_SIZE;
|
||||
self.inner_offset = skip_len_plus_inner_offset % COMPRESSION_BLOCK_SIZE;
|
||||
|
||||
self.ahead = self.ahead.and_then(|num_blocks| {
|
||||
if num_blocks >= num_blocks_to_advance {
|
||||
Some(num_blocks - num_blocks_to_advance)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
});
|
||||
|
||||
let skip_len_in_bits = self.skip_read.as_ref()[..num_blocks_to_advance]
|
||||
.iter()
|
||||
.map(|num_bits| *num_bits as usize)
|
||||
.sum::<usize>()
|
||||
* COMPRESSION_BLOCK_SIZE;
|
||||
let skip_len_in_bytes = skip_len_in_bits / 8;
|
||||
self.skip_read.advance(num_blocks_to_advance);
|
||||
self.position_read.advance(skip_len_in_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -87,6 +87,7 @@ fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
|
||||
(begin, end)
|
||||
}
|
||||
|
||||
#[inline(never)]
|
||||
fn galloping(block_docs: &[u32], target: u32) -> usize {
|
||||
let (start, end) = exponential_search(&block_docs, target);
|
||||
start + linear_search(&block_docs[start..end], target)
|
||||
@@ -129,23 +130,18 @@ impl BlockSearcher {
|
||||
///
|
||||
/// If SSE2 instructions are available in the `(platform, running CPU)`,
|
||||
/// then we use a different implementation that does an exhaustive linear search over
|
||||
/// the full block whenever the block is full (`len == 128`). It is surprisingly faster, most likely because of the lack
|
||||
/// of branch.
|
||||
pub(crate) fn search_in_block(
|
||||
self,
|
||||
block_docs: &AlignedBuffer,
|
||||
len: usize,
|
||||
start: usize,
|
||||
target: u32,
|
||||
) -> usize {
|
||||
/// the block regardless of whether the block is full or not.
|
||||
///
|
||||
/// Indeed, if the block is not full, the remaining items are TERMINATED.
|
||||
/// It is surprisingly faster, most likely because of the lack of branch misprediction.
|
||||
pub(crate) fn search_in_block(self, block_docs: &AlignedBuffer, target: u32) -> usize {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
{
|
||||
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
|
||||
if self == BlockSearcher::SSE2 && len == COMPRESSION_BLOCK_SIZE {
|
||||
if self == BlockSearcher::SSE2 {
|
||||
return sse2::linear_search_sse2_128(block_docs, target);
|
||||
}
|
||||
}
|
||||
start + galloping(&block_docs.0[start..len], target)
|
||||
galloping(&block_docs.0[..], target)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -166,6 +162,7 @@ mod tests {
|
||||
use super::exponential_search;
|
||||
use super::linear_search;
|
||||
use super::BlockSearcher;
|
||||
use crate::docset::TERMINATED;
|
||||
use crate::postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
|
||||
|
||||
#[test]
|
||||
@@ -196,19 +193,12 @@ mod tests {
|
||||
fn util_test_search_in_block(block_searcher: BlockSearcher, block: &[u32], target: u32) {
|
||||
let cursor = search_in_block_trivial_but_slow(block, target);
|
||||
assert!(block.len() < COMPRESSION_BLOCK_SIZE);
|
||||
let mut output_buffer = [u32::max_value(); COMPRESSION_BLOCK_SIZE];
|
||||
let mut output_buffer = [TERMINATED; COMPRESSION_BLOCK_SIZE];
|
||||
output_buffer[..block.len()].copy_from_slice(block);
|
||||
for i in 0..cursor {
|
||||
assert_eq!(
|
||||
block_searcher.search_in_block(
|
||||
&AlignedBuffer(output_buffer),
|
||||
block.len(),
|
||||
i,
|
||||
target
|
||||
),
|
||||
cursor
|
||||
);
|
||||
}
|
||||
assert_eq!(
|
||||
block_searcher.search_in_block(&AlignedBuffer(output_buffer), target),
|
||||
cursor
|
||||
);
|
||||
}
|
||||
|
||||
fn util_test_search_in_block_all(block_searcher: BlockSearcher, block: &[u32]) {
|
||||
|
||||
427
src/postings/block_segment_postings.rs
Normal file
427
src/postings/block_segment_postings.rs
Normal file
@@ -0,0 +1,427 @@
|
||||
use crate::common::{BinarySerializable, VInt};
|
||||
use crate::directory::ReadOnlySource;
|
||||
use crate::postings::compression::{
|
||||
AlignedBuffer, BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE,
|
||||
};
|
||||
use crate::postings::{BlockInfo, FreqReadingOption, SkipReader};
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::{DocId, TERMINATED};
|
||||
|
||||
/// `BlockSegmentPostings` is a cursor iterating over blocks
/// of documents.
///
/// # Warning
///
/// While it is useful for some very specific high-performance
/// use cases, you should prefer using `SegmentPostings` for most usage.
pub struct BlockSegmentPostings {
    // Decoded doc ids of the current block. The constructors fill it with
    // `TERMINATED`.
    pub(crate) doc_decoder: BlockDecoder,
    // Byte offset (within `data`) of the block currently decoded, or
    // `std::usize::MAX` when no block has been decoded yet.
    loaded_offset: usize,
    // Decoded term frequencies of the current block. The constructors fill
    // it with `1`.
    freq_decoder: BlockDecoder,
    // Whether term frequencies are absent, skipped, or decoded.
    freq_reading_option: FreqReadingOption,

    // Number of documents in the posting list (deleted documents included).
    doc_freq: usize,

    // Postings bytes, with the skip data (if any) already split off.
    data: ReadOnlySource,
    // Cursor over the per-block skip information.
    skip_reader: SkipReader,
}
|
||||
|
||||
fn decode_bitpacked_block(
|
||||
doc_decoder: &mut BlockDecoder,
|
||||
freq_decoder_opt: Option<&mut BlockDecoder>,
|
||||
data: &[u8],
|
||||
doc_offset: DocId,
|
||||
doc_num_bits: u8,
|
||||
tf_num_bits: u8,
|
||||
) {
|
||||
let num_consumed_bytes = doc_decoder.uncompress_block_sorted(data, doc_offset, doc_num_bits);
|
||||
if let Some(freq_decoder) = freq_decoder_opt {
|
||||
freq_decoder.uncompress_block_unsorted(&data[num_consumed_bytes..], tf_num_bits);
|
||||
}
|
||||
}
|
||||
|
||||
fn decode_vint_block(
|
||||
doc_decoder: &mut BlockDecoder,
|
||||
freq_decoder_opt: Option<&mut BlockDecoder>,
|
||||
data: &[u8],
|
||||
doc_offset: DocId,
|
||||
num_vint_docs: usize,
|
||||
) {
|
||||
doc_decoder.clear();
|
||||
let num_consumed_bytes = doc_decoder.uncompress_vint_sorted(data, doc_offset, num_vint_docs);
|
||||
if let Some(freq_decoder) = freq_decoder_opt {
|
||||
freq_decoder.uncompress_vint_unsorted(&data[num_consumed_bytes..], num_vint_docs);
|
||||
}
|
||||
}
|
||||
|
||||
fn split_into_skips_and_postings(
|
||||
doc_freq: u32,
|
||||
data: ReadOnlySource,
|
||||
) -> (Option<ReadOnlySource>, ReadOnlySource) {
|
||||
if doc_freq < COMPRESSION_BLOCK_SIZE as u32 {
|
||||
return (None, data);
|
||||
}
|
||||
let mut data_byte_arr = data.as_slice();
|
||||
let skip_len = VInt::deserialize(&mut data_byte_arr)
|
||||
.expect("Data corrupted")
|
||||
.0 as usize;
|
||||
let vint_len = data.len() - data_byte_arr.len();
|
||||
let (skip_data, postings_data) = data.slice_from(vint_len).split(skip_len);
|
||||
(Some(skip_data), postings_data)
|
||||
}
|
||||
|
||||
impl BlockSegmentPostings {
|
||||
pub(crate) fn from_data(
|
||||
doc_freq: u32,
|
||||
data: ReadOnlySource,
|
||||
record_option: IndexRecordOption,
|
||||
requested_option: IndexRecordOption,
|
||||
) -> BlockSegmentPostings {
|
||||
let freq_reading_option = match (record_option, requested_option) {
|
||||
(IndexRecordOption::Basic, _) => FreqReadingOption::NoFreq,
|
||||
(_, IndexRecordOption::Basic) => FreqReadingOption::SkipFreq,
|
||||
(_, _) => FreqReadingOption::ReadFreq,
|
||||
};
|
||||
|
||||
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, data);
|
||||
let skip_reader = match skip_data_opt {
|
||||
Some(skip_data) => SkipReader::new(skip_data, doc_freq, record_option),
|
||||
None => SkipReader::new(ReadOnlySource::empty(), doc_freq, record_option),
|
||||
};
|
||||
|
||||
let doc_freq = doc_freq as usize;
|
||||
let mut block_segment_postings = BlockSegmentPostings {
|
||||
doc_decoder: BlockDecoder::with_val(TERMINATED),
|
||||
loaded_offset: std::usize::MAX,
|
||||
freq_decoder: BlockDecoder::with_val(1),
|
||||
freq_reading_option,
|
||||
doc_freq,
|
||||
data: postings_data,
|
||||
skip_reader,
|
||||
};
|
||||
block_segment_postings.advance();
|
||||
block_segment_postings
|
||||
}
|
||||
|
||||
// Resets the block segment postings on another position
|
||||
// in the postings file.
|
||||
//
|
||||
// This is useful for enumerating through a list of terms,
|
||||
// and consuming the associated posting lists while avoiding
|
||||
// reallocating a `BlockSegmentPostings`.
|
||||
//
|
||||
// # Warning
|
||||
//
|
||||
// This does not reset the positions list.
|
||||
pub(crate) fn reset(&mut self, doc_freq: u32, postings_data: ReadOnlySource) {
|
||||
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, postings_data);
|
||||
self.data = ReadOnlySource::new(postings_data);
|
||||
self.loaded_offset = std::usize::MAX;
|
||||
self.loaded_offset = std::usize::MAX;
|
||||
if let Some(skip_data) = skip_data_opt {
|
||||
self.skip_reader.reset(skip_data, doc_freq);
|
||||
} else {
|
||||
self.skip_reader.reset(ReadOnlySource::empty(), doc_freq);
|
||||
}
|
||||
self.doc_freq = doc_freq as usize;
|
||||
}
|
||||
|
||||
/// Returns the document frequency associated to this block postings.
|
||||
///
|
||||
/// This `doc_freq` is simply the sum of the length of all of the blocks
|
||||
/// length, and it does not take in account deleted documents.
|
||||
pub fn doc_freq(&self) -> usize {
|
||||
self.doc_freq
|
||||
}
|
||||
|
||||
/// Returns the array of docs in the current block.
|
||||
///
|
||||
/// Before the first call to `.advance()`, the block
|
||||
/// returned by `.docs()` is empty.
|
||||
#[inline]
|
||||
pub fn docs(&self) -> &[DocId] {
|
||||
self.doc_decoder.output_array()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub(crate) fn docs_aligned(&self) -> &AlignedBuffer {
|
||||
self.doc_decoder.output_aligned()
|
||||
}
|
||||
|
||||
/// Return the document at index `idx` of the block.
|
||||
#[inline(always)]
|
||||
pub fn doc(&self, idx: usize) -> u32 {
|
||||
self.doc_decoder.output(idx)
|
||||
}
|
||||
|
||||
/// Return the array of `term freq` in the block.
|
||||
#[inline]
|
||||
pub fn freqs(&self) -> &[u32] {
|
||||
self.freq_decoder.output_array()
|
||||
}
|
||||
|
||||
/// Return the frequency at index `idx` of the block.
|
||||
#[inline]
|
||||
pub fn freq(&self, idx: usize) -> u32 {
|
||||
self.freq_decoder.output(idx)
|
||||
}
|
||||
|
||||
/// Returns the length of the current block.
|
||||
///
|
||||
/// All blocks have a length of `NUM_DOCS_PER_BLOCK`,
|
||||
/// except the last block that may have a length
|
||||
/// of any number between 1 and `NUM_DOCS_PER_BLOCK - 1`
|
||||
#[inline]
|
||||
pub fn block_len(&self) -> usize {
|
||||
self.doc_decoder.output_len
|
||||
}
|
||||
|
||||
pub(crate) fn position_offset(&self) -> u64 {
|
||||
self.skip_reader.position_offset()
|
||||
}
|
||||
|
||||
/// Position on a block that may contains `target_doc`.
|
||||
///
|
||||
/// If all docs are smaller than target, the block loaded may be empty,
|
||||
/// or be the last an incomplete VInt block.
|
||||
pub fn seek(&mut self, target_doc: DocId) {
|
||||
self.skip_reader.seek(target_doc);
|
||||
self.load_block();
|
||||
}
|
||||
|
||||
fn load_block(&mut self) {
|
||||
let offset = self.skip_reader.byte_offset();
|
||||
if self.loaded_offset == offset {
|
||||
return;
|
||||
}
|
||||
self.loaded_offset = offset;
|
||||
match self.skip_reader.block_info() {
|
||||
BlockInfo::BitPacked {
|
||||
doc_num_bits,
|
||||
tf_num_bits,
|
||||
..
|
||||
} => {
|
||||
decode_bitpacked_block(
|
||||
&mut self.doc_decoder,
|
||||
if let FreqReadingOption::ReadFreq = self.freq_reading_option {
|
||||
Some(&mut self.freq_decoder)
|
||||
} else {
|
||||
None
|
||||
},
|
||||
&self.data.as_slice()[offset..],
|
||||
self.skip_reader.last_doc_in_previous_block,
|
||||
doc_num_bits,
|
||||
tf_num_bits,
|
||||
);
|
||||
}
|
||||
BlockInfo::VInt(num_vint_docs) => {
|
||||
decode_vint_block(
|
||||
&mut self.doc_decoder,
|
||||
if let FreqReadingOption::ReadFreq = self.freq_reading_option {
|
||||
Some(&mut self.freq_decoder)
|
||||
} else {
|
||||
None
|
||||
},
|
||||
&self.data.as_slice()[offset..],
|
||||
self.skip_reader.last_doc_in_previous_block,
|
||||
num_vint_docs as usize,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Advance to the next block.
|
||||
///
|
||||
/// Returns false iff there was no remaining blocks.
|
||||
pub fn advance(&mut self) -> bool {
|
||||
if !self.skip_reader.advance() {
|
||||
return false;
|
||||
}
|
||||
self.load_block();
|
||||
true
|
||||
}
|
||||
|
||||
/// Returns an empty segment postings object
|
||||
pub fn empty() -> BlockSegmentPostings {
|
||||
BlockSegmentPostings {
|
||||
doc_decoder: BlockDecoder::with_val(TERMINATED),
|
||||
loaded_offset: std::usize::MAX,
|
||||
freq_decoder: BlockDecoder::with_val(1),
|
||||
freq_reading_option: FreqReadingOption::NoFreq,
|
||||
doc_freq: 0,
|
||||
data: ReadOnlySource::new(vec![]),
|
||||
skip_reader: SkipReader::new(ReadOnlySource::new(vec![]), 0, IndexRecordOption::Basic),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::BlockSegmentPostings;
    use crate::common::HasLen;
    use crate::core::Index;
    use crate::docset::{DocSet, TERMINATED};
    use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
    use crate::postings::postings::Postings;
    use crate::postings::SegmentPostings;
    use crate::schema::IndexRecordOption;
    use crate::schema::Schema;
    use crate::schema::Term;
    use crate::schema::INDEXED;
    use crate::DocId;

    #[test]
    fn test_empty_segment_postings() {
        let mut postings = SegmentPostings::empty();
        assert_eq!(postings.advance(), TERMINATED);
        assert_eq!(postings.advance(), TERMINATED);
        assert_eq!(postings.len(), 0);
    }

    #[test]
    fn test_empty_postings_doc_returns_terminated() {
        let mut postings = SegmentPostings::empty();
        assert_eq!(postings.doc(), TERMINATED);
        assert_eq!(postings.advance(), TERMINATED);
    }

    // NOTE(review): despite the test name, the expected term frequency of an
    // empty postings object is 1, not 0.
    #[test]
    fn test_empty_postings_doc_term_freq_returns_0() {
        let postings = SegmentPostings::empty();
        assert_eq!(postings.term_freq(), 1);
    }

    #[test]
    fn test_empty_block_segment_postings() {
        let mut postings = BlockSegmentPostings::empty();
        assert!(!postings.advance());
        assert_eq!(postings.doc_freq(), 0);
    }

    #[test]
    fn test_block_segment_postings() {
        let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
        let mut offset: u32 = 0u32;
        // checking that the `doc_freq` is correct
        assert_eq!(block_segments.doc_freq(), 100_000);
        loop {
            let block = block_segments.docs();
            for (i, doc) in block.iter().cloned().enumerate() {
                assert_eq!(offset + (i as u32), doc);
            }
            offset += block.len() as u32;
            // `advance()` returns false once there is no block left: that is
            // when the loop must stop, so that every block gets checked.
            if !block_segments.advance() {
                break;
            }
        }
        // Every document must have been visited exactly once.
        assert_eq!(offset, 100_000);
    }

    #[test]
    fn test_skip_right_at_new_block() {
        let mut doc_ids = (0..128).collect::<Vec<u32>>();
        // 128 is missing
        doc_ids.push(129);
        doc_ids.push(130);
        {
            let block_segments = build_block_postings(&doc_ids);
            let mut docset = SegmentPostings::from_block_postings(block_segments, None);
            assert_eq!(docset.seek(128), 129);
            assert_eq!(docset.doc(), 129);
            assert_eq!(docset.advance(), 130);
            assert_eq!(docset.doc(), 130);
            assert_eq!(docset.advance(), TERMINATED);
        }
        {
            let block_segments = build_block_postings(&doc_ids);
            let mut docset = SegmentPostings::from_block_postings(block_segments, None);
            assert_eq!(docset.seek(129), 129);
            assert_eq!(docset.doc(), 129);
            assert_eq!(docset.advance(), 130);
            assert_eq!(docset.doc(), 130);
            assert_eq!(docset.advance(), TERMINATED);
        }
        {
            let block_segments = build_block_postings(&doc_ids);
            let mut docset = SegmentPostings::from_block_postings(block_segments, None);
            assert_eq!(docset.doc(), 0);
            assert_eq!(docset.seek(131), TERMINATED);
            assert_eq!(docset.doc(), TERMINATED);
        }
    }

    /// Builds an in-RAM index in which the term `0` of the `id` field is
    /// associated with exactly the documents listed in `docs`, and returns
    /// the block postings of that term.
    ///
    /// The gaps between the requested doc ids are filled with documents
    /// holding the value `1`.
    fn build_block_postings(docs: &[DocId]) -> BlockSegmentPostings {
        let mut schema_builder = Schema::builder();
        let int_field = schema_builder.add_u64_field("id", INDEXED);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
        let mut last_doc = 0u32;
        for &doc in docs {
            for _ in last_doc..doc {
                index_writer.add_document(doc!(int_field=>1u64));
            }
            index_writer.add_document(doc!(int_field=>0u64));
            last_doc = doc + 1;
        }
        index_writer.commit().unwrap();
        let searcher = index.reader().unwrap().searcher();
        let segment_reader = searcher.segment_reader(0);
        let inverted_index = segment_reader.inverted_index(int_field);
        let term = Term::from_field_u64(int_field, 0u64);
        let term_info = inverted_index.get_term_info(&term).unwrap();
        inverted_index.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)
    }

    #[test]
    fn test_block_segment_postings_skip2() {
        let mut docs = vec![0];
        for i in 0..1300 {
            docs.push((i * i / 100) + i);
        }
        let mut block_postings = build_block_postings(&docs[..]);
        for i in [0u32, 424, 10000].iter().cloned() {
            block_postings.seek(i);
            let docs = block_postings.docs();
            assert!(docs[0] <= i);
            assert!(docs.last().cloned().unwrap_or(0u32) >= i);
        }
        block_postings.seek(100_000);
        assert_eq!(block_postings.doc(COMPRESSION_BLOCK_SIZE - 1), TERMINATED);
    }

    #[test]
    fn test_reset_block_segment_postings() {
        let mut schema_builder = Schema::builder();
        let int_field = schema_builder.add_u64_field("id", INDEXED);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
        // create two postings lists, one containing even numbers,
        // the other containing odd numbers.
        for i in 0..6 {
            let doc = doc!(int_field=> (i % 2) as u64);
            index_writer.add_document(doc);
        }
        index_writer.commit().unwrap();
        let searcher = index.reader().unwrap().searcher();
        let segment_reader = searcher.segment_reader(0);

        let mut block_segments;
        {
            let term = Term::from_field_u64(int_field, 0u64);
            let inverted_index = segment_reader.inverted_index(int_field);
            let term_info = inverted_index.get_term_info(&term).unwrap();
            block_segments = inverted_index
                .read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic);
        }
        assert_eq!(block_segments.docs(), &[0, 2, 4]);
        {
            let term = Term::from_field_u64(int_field, 1u64);
            let inverted_index = segment_reader.inverted_index(int_field);
            let term_info = inverted_index.get_term_info(&term).unwrap();
            inverted_index.reset_block_postings_from_terminfo(&term_info, &mut block_segments);
        }
        // After a reset, the first block only becomes available once `advance()` is called.
        assert!(block_segments.advance());
        assert_eq!(block_segments.docs(), &[1, 3, 5]);
    }
}
|
||||
@@ -1,4 +1,5 @@
|
||||
use crate::common::FixedSize;
|
||||
use crate::docset::TERMINATED;
|
||||
use bitpacking::{BitPacker, BitPacker4x};
|
||||
|
||||
pub const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN;
|
||||
@@ -17,6 +18,12 @@ pub struct BlockEncoder {
|
||||
pub output_len: usize,
|
||||
}
|
||||
|
||||
impl Default for BlockEncoder {
|
||||
fn default() -> Self {
|
||||
BlockEncoder::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockEncoder {
|
||||
pub fn new() -> BlockEncoder {
|
||||
BlockEncoder {
|
||||
@@ -54,11 +61,13 @@ pub struct BlockDecoder {
|
||||
pub output_len: usize,
|
||||
}
|
||||
|
||||
impl BlockDecoder {
|
||||
pub fn new() -> BlockDecoder {
|
||||
impl Default for BlockDecoder {
|
||||
fn default() -> Self {
|
||||
BlockDecoder::with_val(0u32)
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockDecoder {
|
||||
pub fn with_val(val: u32) -> BlockDecoder {
|
||||
BlockDecoder {
|
||||
bitpacker: BitPacker4x::new(),
|
||||
@@ -90,14 +99,18 @@ impl BlockDecoder {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn output_aligned(&self) -> (&AlignedBuffer, usize) {
|
||||
(&self.output, self.output_len)
|
||||
pub(crate) fn output_aligned(&self) -> &AlignedBuffer {
|
||||
&self.output
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn output(&self, idx: usize) -> u32 {
|
||||
self.output.0[idx]
|
||||
}
|
||||
|
||||
pub fn clear(&mut self) {
|
||||
self.output.0.iter_mut().for_each(|el| *el = TERMINATED);
|
||||
}
|
||||
}
|
||||
|
||||
pub trait VIntEncoder {
|
||||
@@ -134,9 +147,9 @@ pub trait VIntDecoder {
|
||||
/// For instance, if delta encoded are `1, 3, 9`, and the
|
||||
/// `offset` is 5, then the output will be:
|
||||
/// `5 + 1 = 6, 6 + 3= 9, 9 + 9 = 18`
|
||||
fn uncompress_vint_sorted<'a>(
|
||||
fn uncompress_vint_sorted(
|
||||
&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
compressed_data: &[u8],
|
||||
offset: u32,
|
||||
num_els: usize,
|
||||
) -> usize;
|
||||
@@ -146,7 +159,7 @@ pub trait VIntDecoder {
|
||||
///
|
||||
/// The method takes a number of int to decompress, and returns
|
||||
/// the amount of bytes that were read to decompress them.
|
||||
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize;
|
||||
fn uncompress_vint_unsorted(&mut self, compressed_data: &[u8], num_els: usize) -> usize;
|
||||
}
|
||||
|
||||
impl VIntEncoder for BlockEncoder {
|
||||
@@ -160,9 +173,9 @@ impl VIntEncoder for BlockEncoder {
|
||||
}
|
||||
|
||||
impl VIntDecoder for BlockDecoder {
|
||||
fn uncompress_vint_sorted<'a>(
|
||||
fn uncompress_vint_sorted(
|
||||
&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
compressed_data: &[u8],
|
||||
offset: u32,
|
||||
num_els: usize,
|
||||
) -> usize {
|
||||
@@ -170,7 +183,7 @@ impl VIntDecoder for BlockDecoder {
|
||||
vint::uncompress_sorted(compressed_data, &mut self.output.0[..num_els], offset)
|
||||
}
|
||||
|
||||
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize {
|
||||
fn uncompress_vint_unsorted(&mut self, compressed_data: &[u8], num_els: usize) -> usize {
|
||||
self.output_len = num_els;
|
||||
vint::uncompress_unsorted(compressed_data, &mut self.output.0[..num_els])
|
||||
}
|
||||
@@ -186,7 +199,7 @@ pub mod tests {
|
||||
let vals: Vec<u32> = (0u32..128u32).map(|i| i * 7).collect();
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let (num_bits, compressed_data) = encoder.compress_block_sorted(&vals, 0);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
let mut decoder = BlockDecoder::default();
|
||||
{
|
||||
let consumed_num_bytes = decoder.uncompress_block_sorted(compressed_data, 0, num_bits);
|
||||
assert_eq!(consumed_num_bytes, compressed_data.len());
|
||||
@@ -199,9 +212,9 @@ pub mod tests {
|
||||
#[test]
|
||||
fn test_encode_sorted_block_with_offset() {
|
||||
let vals: Vec<u32> = (0u32..128u32).map(|i| 11 + i * 7).collect();
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let mut encoder = BlockEncoder::default();
|
||||
let (num_bits, compressed_data) = encoder.compress_block_sorted(&vals, 10);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
let mut decoder = BlockDecoder::default();
|
||||
{
|
||||
let consumed_num_bytes = decoder.uncompress_block_sorted(compressed_data, 10, num_bits);
|
||||
assert_eq!(consumed_num_bytes, compressed_data.len());
|
||||
@@ -216,11 +229,11 @@ pub mod tests {
|
||||
let mut compressed: Vec<u8> = Vec::new();
|
||||
let n = 128;
|
||||
let vals: Vec<u32> = (0..n).map(|i| 11u32 + (i as u32) * 7u32).collect();
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let mut encoder = BlockEncoder::default();
|
||||
let (num_bits, compressed_data) = encoder.compress_block_sorted(&vals, 10);
|
||||
compressed.extend_from_slice(compressed_data);
|
||||
compressed.push(173u8);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
let mut decoder = BlockDecoder::default();
|
||||
{
|
||||
let consumed_num_bytes = decoder.uncompress_block_sorted(&compressed, 10, num_bits);
|
||||
assert_eq!(consumed_num_bytes, compressed.len() - 1);
|
||||
@@ -236,11 +249,11 @@ pub mod tests {
|
||||
let mut compressed: Vec<u8> = Vec::new();
|
||||
let n = 128;
|
||||
let vals: Vec<u32> = (0..n).map(|i| 11u32 + (i as u32) * 7u32 % 12).collect();
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let mut encoder = BlockEncoder::default();
|
||||
let (num_bits, compressed_data) = encoder.compress_block_unsorted(&vals);
|
||||
compressed.extend_from_slice(compressed_data);
|
||||
compressed.push(173u8);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
let mut decoder = BlockDecoder::default();
|
||||
{
|
||||
let consumed_num_bytes = decoder.uncompress_block_unsorted(&compressed, num_bits);
|
||||
assert_eq!(consumed_num_bytes + 1, compressed.len());
|
||||
@@ -251,6 +264,11 @@ pub mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
/// A decoder built with `with_val(v)` exposes `v` at index 0 of its output.
#[test]
fn test_block_decoder_initialization() {
    let decoder = BlockDecoder::with_val(TERMINATED);
    let first_value = decoder.output(0);
    assert_eq!(first_value, TERMINATED);
}
|
||||
#[test]
|
||||
fn test_encode_vint() {
|
||||
{
|
||||
@@ -260,7 +278,7 @@ pub mod tests {
|
||||
for offset in &[0u32, 1u32, 2u32] {
|
||||
let encoded_data = encoder.compress_vint_sorted(&input, *offset);
|
||||
assert!(encoded_data.len() <= expected_length);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
let mut decoder = BlockDecoder::default();
|
||||
let consumed_num_bytes =
|
||||
decoder.uncompress_vint_sorted(&encoded_data, *offset, input.len());
|
||||
assert_eq!(consumed_num_bytes, encoded_data.len());
|
||||
|
||||
@@ -42,7 +42,7 @@ pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn uncompress_sorted<'a>(compressed_data: &'a [u8], output: &mut [u32], offset: u32) -> usize {
|
||||
pub fn uncompress_sorted(compressed_data: &[u8], output: &mut [u32], offset: u32) -> usize {
|
||||
let mut read_byte = 0;
|
||||
let mut result = offset;
|
||||
for output_mut in output.iter_mut() {
|
||||
|
||||
@@ -3,11 +3,8 @@ Postings module (also called inverted index)
|
||||
*/
|
||||
|
||||
mod block_search;
|
||||
mod block_segment_postings;
|
||||
pub(crate) mod compression;
|
||||
/// Postings module
|
||||
///
|
||||
/// Postings, also called inverted lists, is the key datastructure
|
||||
/// to full-text search.
|
||||
mod postings;
|
||||
mod postings_writer;
|
||||
mod recorder;
|
||||
@@ -22,18 +19,17 @@ pub(crate) use self::block_search::BlockSearcher;
|
||||
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
|
||||
pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
|
||||
|
||||
use self::compression::COMPRESSION_BLOCK_SIZE;
|
||||
pub use self::postings::Postings;
|
||||
pub(crate) use self::skip::SkipReader;
|
||||
pub(crate) use self::skip::{BlockInfo, SkipReader};
|
||||
pub use self::term_info::TermInfo;
|
||||
|
||||
pub use self::segment_postings::{BlockSegmentPostings, SegmentPostings};
|
||||
pub use self::block_segment_postings::BlockSegmentPostings;
|
||||
pub use self::segment_postings::SegmentPostings;
|
||||
|
||||
pub(crate) use self::stacker::compute_table_size;
|
||||
|
||||
pub use crate::common::HasLen;
|
||||
|
||||
pub(crate) const USE_SKIP_INFO_LIMIT: u32 = COMPRESSION_BLOCK_SIZE as u32;
|
||||
pub(crate) type UnorderedTermId = u64;
|
||||
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::enum_variant_names))]
|
||||
@@ -51,7 +47,7 @@ pub mod tests {
|
||||
use crate::core::Index;
|
||||
use crate::core::SegmentComponent;
|
||||
use crate::core::SegmentReader;
|
||||
use crate::docset::{DocSet, SkipResult};
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::indexer::operation::AddOperation;
|
||||
use crate::indexer::SegmentWriter;
|
||||
@@ -115,29 +111,12 @@ pub mod tests {
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap();
|
||||
postings.advance();
|
||||
assert_eq!(postings.doc(), 0);
|
||||
postings.positions(&mut positions);
|
||||
assert_eq!(&[0, 1, 2], &positions[..]);
|
||||
postings.positions(&mut positions);
|
||||
assert_eq!(&[0, 1, 2], &positions[..]);
|
||||
postings.advance();
|
||||
postings.positions(&mut positions);
|
||||
assert_eq!(&[0, 5], &positions[..]);
|
||||
}
|
||||
{
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap();
|
||||
postings.advance();
|
||||
postings.advance();
|
||||
postings.positions(&mut positions);
|
||||
assert_eq!(&[0, 5], &positions[..]);
|
||||
}
|
||||
{
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap();
|
||||
assert_eq!(postings.skip_next(1), SkipResult::Reached);
|
||||
assert_eq!(postings.advance(), 1);
|
||||
assert_eq!(postings.doc(), 1);
|
||||
postings.positions(&mut positions);
|
||||
assert_eq!(&[0, 5], &positions[..]);
|
||||
@@ -146,7 +125,25 @@ pub mod tests {
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap();
|
||||
assert_eq!(postings.skip_next(1002), SkipResult::Reached);
|
||||
assert_eq!(postings.doc(), 0);
|
||||
assert_eq!(postings.advance(), 1);
|
||||
postings.positions(&mut positions);
|
||||
assert_eq!(&[0, 5], &positions[..]);
|
||||
}
|
||||
{
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap();
|
||||
assert_eq!(postings.seek(1), 1);
|
||||
assert_eq!(postings.doc(), 1);
|
||||
postings.positions(&mut positions);
|
||||
assert_eq!(&[0, 5], &positions[..]);
|
||||
}
|
||||
{
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap();
|
||||
assert_eq!(postings.seek(1002), 1002);
|
||||
assert_eq!(postings.doc(), 1002);
|
||||
postings.positions(&mut positions);
|
||||
assert_eq!(&[0, 5], &positions[..]);
|
||||
@@ -155,8 +152,8 @@ pub mod tests {
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap();
|
||||
assert_eq!(postings.skip_next(100), SkipResult::Reached);
|
||||
assert_eq!(postings.skip_next(1002), SkipResult::Reached);
|
||||
assert_eq!(postings.seek(100), 100);
|
||||
assert_eq!(postings.seek(1002), 1002);
|
||||
assert_eq!(postings.doc(), 1002);
|
||||
postings.positions(&mut positions);
|
||||
assert_eq!(&[0, 5], &positions[..]);
|
||||
@@ -281,22 +278,21 @@ pub mod tests {
|
||||
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap();
|
||||
assert_eq!(postings_a.len(), 1000);
|
||||
assert!(postings_a.advance());
|
||||
assert_eq!(postings_a.doc(), 0);
|
||||
assert_eq!(postings_a.term_freq(), 6);
|
||||
postings_a.positions(&mut positions);
|
||||
assert_eq!(&positions[..], [0, 2, 4, 6, 7, 13]);
|
||||
assert!(postings_a.advance());
|
||||
assert_eq!(postings_a.advance(), 1u32);
|
||||
assert_eq!(postings_a.doc(), 1u32);
|
||||
assert_eq!(postings_a.term_freq(), 1);
|
||||
for i in 2u32..1000u32 {
|
||||
assert!(postings_a.advance());
|
||||
assert_eq!(postings_a.advance(), i);
|
||||
assert_eq!(postings_a.term_freq(), 1);
|
||||
postings_a.positions(&mut positions);
|
||||
assert_eq!(&positions[..], [i]);
|
||||
assert_eq!(postings_a.doc(), i);
|
||||
}
|
||||
assert!(!postings_a.advance());
|
||||
assert_eq!(postings_a.advance(), TERMINATED);
|
||||
}
|
||||
{
|
||||
let term_e = Term::from_field_text(text_field, "e");
|
||||
@@ -306,7 +302,6 @@ pub mod tests {
|
||||
.unwrap();
|
||||
assert_eq!(postings_e.len(), 1000 - 2);
|
||||
for i in 2u32..1000u32 {
|
||||
assert!(postings_e.advance());
|
||||
assert_eq!(postings_e.term_freq(), i);
|
||||
postings_e.positions(&mut positions);
|
||||
assert_eq!(positions.len(), i as usize);
|
||||
@@ -314,8 +309,9 @@ pub mod tests {
|
||||
assert_eq!(positions[j], (j as u32));
|
||||
}
|
||||
assert_eq!(postings_e.doc(), i);
|
||||
postings_e.advance();
|
||||
}
|
||||
assert!(!postings_e.advance());
|
||||
assert_eq!(postings_e.doc(), TERMINATED);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -329,16 +325,8 @@ pub mod tests {
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "g b b d c g c");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "g a b b a d c g c");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
index_writer.add_document(doc!(text_field => "g b b d c g c"));
|
||||
index_writer.add_document(doc!(text_field => "g a b b a d c g c"));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
let term_a = Term::from_field_text(text_field, "a");
|
||||
@@ -348,7 +336,6 @@ pub mod tests {
|
||||
.inverted_index(text_field)
|
||||
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 1u32);
|
||||
postings.positions(&mut positions);
|
||||
assert_eq!(&positions[..], &[1u32, 4]);
|
||||
@@ -370,11 +357,8 @@ pub mod tests {
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
for i in 0..num_docs {
|
||||
let mut doc = Document::default();
|
||||
doc.add_u64(value_field, 2);
|
||||
doc.add_u64(value_field, (i % 2) as u64);
|
||||
|
||||
for i in 0u64..num_docs as u64 {
|
||||
let doc = doc!(value_field => 2u64, value_field => i % 2u64);
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
assert!(index_writer.commit().is_ok());
|
||||
@@ -391,11 +375,10 @@ pub mod tests {
|
||||
.inverted_index(term_2.field())
|
||||
.read_postings(&term_2, IndexRecordOption::Basic)
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(segment_postings.skip_next(i), SkipResult::Reached);
|
||||
assert_eq!(segment_postings.seek(i), i);
|
||||
assert_eq!(segment_postings.doc(), i);
|
||||
|
||||
assert_eq!(segment_postings.skip_next(j), SkipResult::Reached);
|
||||
assert_eq!(segment_postings.seek(j), j);
|
||||
assert_eq!(segment_postings.doc(), j);
|
||||
}
|
||||
}
|
||||
@@ -407,17 +390,16 @@ pub mod tests {
|
||||
.unwrap();
|
||||
|
||||
// check that `skip_next` advances the iterator
|
||||
assert!(segment_postings.advance());
|
||||
assert_eq!(segment_postings.doc(), 0);
|
||||
|
||||
assert_eq!(segment_postings.skip_next(1), SkipResult::Reached);
|
||||
assert_eq!(segment_postings.seek(1), 1);
|
||||
assert_eq!(segment_postings.doc(), 1);
|
||||
|
||||
assert_eq!(segment_postings.skip_next(1), SkipResult::OverStep);
|
||||
assert_eq!(segment_postings.doc(), 2);
|
||||
assert_eq!(segment_postings.seek(1), 1);
|
||||
assert_eq!(segment_postings.doc(), 1);
|
||||
|
||||
// check that going beyond the end is handled
|
||||
assert_eq!(segment_postings.skip_next(num_docs), SkipResult::End);
|
||||
assert_eq!(segment_postings.seek(num_docs), TERMINATED);
|
||||
}
|
||||
|
||||
// check that filtering works
|
||||
@@ -428,7 +410,7 @@ pub mod tests {
|
||||
.unwrap();
|
||||
|
||||
for i in 0..num_docs / 2 {
|
||||
assert_eq!(segment_postings.skip_next(i * 2), SkipResult::Reached);
|
||||
assert_eq!(segment_postings.seek(i * 2), i * 2);
|
||||
assert_eq!(segment_postings.doc(), i * 2);
|
||||
}
|
||||
|
||||
@@ -438,7 +420,7 @@ pub mod tests {
|
||||
.unwrap();
|
||||
|
||||
for i in 0..num_docs / 2 - 1 {
|
||||
assert_eq!(segment_postings.skip_next(i * 2 + 1), SkipResult::OverStep);
|
||||
assert!(segment_postings.seek(i * 2 + 1) > (i * 1) * 2);
|
||||
assert_eq!(segment_postings.doc(), (i + 1) * 2);
|
||||
}
|
||||
}
|
||||
@@ -450,6 +432,7 @@ pub mod tests {
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
|
||||
// make sure seeking still works
|
||||
@@ -460,11 +443,11 @@ pub mod tests {
|
||||
.unwrap();
|
||||
|
||||
if i % 2 == 0 {
|
||||
assert_eq!(segment_postings.skip_next(i), SkipResult::Reached);
|
||||
assert_eq!(segment_postings.seek(i), i);
|
||||
assert_eq!(segment_postings.doc(), i);
|
||||
assert!(segment_reader.is_deleted(i));
|
||||
} else {
|
||||
assert_eq!(segment_postings.skip_next(i), SkipResult::Reached);
|
||||
assert_eq!(segment_postings.seek(i), i);
|
||||
assert_eq!(segment_postings.doc(), i);
|
||||
}
|
||||
}
|
||||
@@ -479,12 +462,16 @@ pub mod tests {
|
||||
let mut last = 2; // start from 5 to avoid seeking to 3 twice
|
||||
let mut cur = 3;
|
||||
loop {
|
||||
match segment_postings.skip_next(cur) {
|
||||
SkipResult::End => break,
|
||||
SkipResult::Reached => assert_eq!(segment_postings.doc(), cur),
|
||||
SkipResult::OverStep => assert_eq!(segment_postings.doc(), cur + 1),
|
||||
let seek = segment_postings.seek(cur);
|
||||
if seek == TERMINATED {
|
||||
break;
|
||||
}
|
||||
assert_eq!(seek, segment_postings.doc());
|
||||
if seek == cur {
|
||||
assert_eq!(segment_postings.doc(), cur);
|
||||
} else {
|
||||
assert_eq!(segment_postings.doc(), cur + 1);
|
||||
}
|
||||
|
||||
let next = cur + last;
|
||||
last = cur;
|
||||
cur = next;
|
||||
@@ -570,7 +557,7 @@ pub mod tests {
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> DocSet for UnoptimizedDocSet<TDocSet> {
|
||||
fn advance(&mut self) -> bool {
|
||||
fn advance(&mut self) -> DocId {
|
||||
self.0.advance()
|
||||
}
|
||||
|
||||
@@ -596,30 +583,22 @@ pub mod tests {
|
||||
for target in targets {
|
||||
let mut postings_opt = postings_factory();
|
||||
let mut postings_unopt = UnoptimizedDocSet::wrap(postings_factory());
|
||||
let skip_result_opt = postings_opt.skip_next(target);
|
||||
let skip_result_unopt = postings_unopt.skip_next(target);
|
||||
let skip_result_opt = postings_opt.seek(target);
|
||||
let skip_result_unopt = postings_unopt.seek(target);
|
||||
assert_eq!(
|
||||
skip_result_unopt, skip_result_opt,
|
||||
"Failed while skipping to {}",
|
||||
target
|
||||
);
|
||||
match skip_result_opt {
|
||||
SkipResult::Reached => assert_eq!(postings_opt.doc(), target),
|
||||
SkipResult::OverStep => assert!(postings_opt.doc() > target),
|
||||
SkipResult::End => {
|
||||
return;
|
||||
}
|
||||
assert!(skip_result_opt >= target);
|
||||
assert_eq!(skip_result_opt, postings_opt.doc());
|
||||
if skip_result_opt == TERMINATED {
|
||||
return;
|
||||
}
|
||||
while postings_opt.advance() {
|
||||
assert!(postings_unopt.advance());
|
||||
assert_eq!(
|
||||
postings_opt.doc(),
|
||||
postings_unopt.doc(),
|
||||
"Failed while skipping to {}",
|
||||
target
|
||||
);
|
||||
while postings_opt.doc() != TERMINATED {
|
||||
assert_eq!(postings_opt.doc(), postings_unopt.doc());
|
||||
assert_eq!(postings_opt.advance(), postings_unopt.advance());
|
||||
}
|
||||
assert!(!postings_unopt.advance());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -628,7 +607,7 @@ pub mod tests {
|
||||
mod bench {
|
||||
|
||||
use super::tests::*;
|
||||
use crate::docset::SkipResult;
|
||||
use crate::docset::TERMINATED;
|
||||
use crate::query::Intersection;
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::tests;
|
||||
@@ -646,7 +625,7 @@ mod bench {
|
||||
.inverted_index(TERM_A.field())
|
||||
.read_postings(&*TERM_A, IndexRecordOption::Basic)
|
||||
.unwrap();
|
||||
while segment_postings.advance() {}
|
||||
while segment_postings.advance() != TERMINATED {}
|
||||
});
|
||||
}
|
||||
|
||||
@@ -678,7 +657,7 @@ mod bench {
|
||||
segment_postings_c,
|
||||
segment_postings_d,
|
||||
]);
|
||||
while intersection.advance() {}
|
||||
while intersection.advance() != TERMINATED {}
|
||||
});
|
||||
}
|
||||
|
||||
@@ -694,11 +673,10 @@ mod bench {
|
||||
.unwrap();
|
||||
|
||||
let mut existing_docs = Vec::new();
|
||||
segment_postings.advance();
|
||||
for doc in &docs {
|
||||
if *doc >= segment_postings.doc() {
|
||||
existing_docs.push(*doc);
|
||||
if segment_postings.skip_next(*doc) == SkipResult::End {
|
||||
if segment_postings.seek(*doc) == TERMINATED {
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -710,7 +688,7 @@ mod bench {
|
||||
.read_postings(&*TERM_A, IndexRecordOption::Basic)
|
||||
.unwrap();
|
||||
for doc in &existing_docs {
|
||||
if segment_postings.skip_next(*doc) == SkipResult::End {
|
||||
if segment_postings.seek(*doc) == TERMINATED {
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -749,8 +727,9 @@ mod bench {
|
||||
.read_postings(&*TERM_A, IndexRecordOption::Basic)
|
||||
.unwrap();
|
||||
let mut s = 0u32;
|
||||
while segment_postings.advance() {
|
||||
while segment_postings.doc() != TERMINATED {
|
||||
s += (segment_postings.doc() & n) % 1024;
|
||||
segment_postings.advance()
|
||||
}
|
||||
s
|
||||
});
|
||||
|
||||
@@ -1,56 +1,19 @@
|
||||
use crate::common::BitSet;
|
||||
use crate::common::HasLen;
|
||||
use crate::common::{BinarySerializable, VInt};
|
||||
use crate::docset::{DocSet, SkipResult};
|
||||
|
||||
use crate::docset::DocSet;
|
||||
use crate::positions::PositionReader;
|
||||
use crate::postings::compression::{compressed_block_size, AlignedBuffer};
|
||||
use crate::postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
|
||||
|
||||
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
|
||||
use crate::postings::serializer::PostingsSerializer;
|
||||
use crate::postings::BlockSearcher;
|
||||
use crate::postings::FreqReadingOption;
|
||||
|
||||
use crate::postings::Postings;
|
||||
use crate::postings::SkipReader;
|
||||
use crate::postings::USE_SKIP_INFO_LIMIT;
|
||||
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::DocId;
|
||||
use owned_read::OwnedRead;
|
||||
use std::cmp::Ordering;
|
||||
use tantivy_fst::Streamer;
|
||||
|
||||
struct PositionComputer {
|
||||
// store the amount of position int
|
||||
// before reading positions.
|
||||
//
|
||||
// if none, position are already loaded in
|
||||
// the positions vec.
|
||||
position_to_skip: usize,
|
||||
position_reader: PositionReader,
|
||||
}
|
||||
|
||||
impl PositionComputer {
|
||||
pub fn new(position_reader: PositionReader) -> PositionComputer {
|
||||
PositionComputer {
|
||||
position_to_skip: 0,
|
||||
position_reader,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_skip(&mut self, num_skip: usize) {
|
||||
self.position_to_skip += num_skip;
|
||||
}
|
||||
|
||||
// Positions can only be read once.
|
||||
pub fn positions_with_offset(&mut self, offset: u32, output: &mut [u32]) {
|
||||
self.position_reader.skip(self.position_to_skip);
|
||||
self.position_to_skip = 0;
|
||||
self.position_reader.read(output);
|
||||
let mut cum = offset;
|
||||
for output_mut in output.iter_mut() {
|
||||
cum += *output_mut;
|
||||
*output_mut = cum;
|
||||
}
|
||||
}
|
||||
}
|
||||
use crate::directory::ReadOnlySource;
|
||||
use crate::postings::BlockSegmentPostings;
|
||||
|
||||
/// `SegmentPostings` represents the inverted list or postings associated to
|
||||
/// a term in a `Segment`.
|
||||
@@ -60,18 +23,17 @@ impl PositionComputer {
|
||||
pub struct SegmentPostings {
|
||||
block_cursor: BlockSegmentPostings,
|
||||
cur: usize,
|
||||
position_computer: Option<PositionComputer>,
|
||||
position_reader: Option<PositionReader>,
|
||||
block_searcher: BlockSearcher,
|
||||
}
|
||||
|
||||
impl SegmentPostings {
|
||||
/// Returns an empty segment postings object
|
||||
pub fn empty() -> Self {
|
||||
let empty_block_cursor = BlockSegmentPostings::empty();
|
||||
SegmentPostings {
|
||||
block_cursor: empty_block_cursor,
|
||||
cur: COMPRESSION_BLOCK_SIZE,
|
||||
position_computer: None,
|
||||
block_cursor: BlockSegmentPostings::empty(),
|
||||
cur: 0,
|
||||
position_reader: None,
|
||||
block_searcher: BlockSearcher::default(),
|
||||
}
|
||||
}
|
||||
@@ -97,15 +59,13 @@ impl SegmentPostings {
|
||||
}
|
||||
let block_segment_postings = BlockSegmentPostings::from_data(
|
||||
docs.len() as u32,
|
||||
OwnedRead::new(buffer),
|
||||
ReadOnlySource::from(buffer),
|
||||
IndexRecordOption::Basic,
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
SegmentPostings::from_block_postings(block_segment_postings, None)
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentPostings {
|
||||
/// Reads a Segment postings from an &[u8]
|
||||
///
|
||||
/// * `len` - number of document in the posting lists.
|
||||
@@ -114,12 +74,12 @@ impl SegmentPostings {
|
||||
/// frequencies and/or positions
|
||||
pub(crate) fn from_block_postings(
|
||||
segment_block_postings: BlockSegmentPostings,
|
||||
positions_stream_opt: Option<PositionReader>,
|
||||
position_reader: Option<PositionReader>,
|
||||
) -> SegmentPostings {
|
||||
SegmentPostings {
|
||||
block_cursor: segment_block_postings,
|
||||
cur: COMPRESSION_BLOCK_SIZE, // cursor within the block
|
||||
position_computer: positions_stream_opt.map(PositionComputer::new),
|
||||
cur: 0, // cursor within the block
|
||||
position_reader,
|
||||
block_searcher: BlockSearcher::default(),
|
||||
}
|
||||
}
|
||||
@@ -129,134 +89,52 @@ impl DocSet for SegmentPostings {
|
||||
// goes to the next element.
|
||||
// next needs to be called a first time to point to the correct element.
|
||||
#[inline]
|
||||
fn advance(&mut self) -> bool {
|
||||
if self.position_computer.is_some() && self.cur < COMPRESSION_BLOCK_SIZE {
|
||||
let term_freq = self.term_freq() as usize;
|
||||
if let Some(position_computer) = self.position_computer.as_mut() {
|
||||
position_computer.add_skip(term_freq);
|
||||
}
|
||||
}
|
||||
self.cur += 1;
|
||||
if self.cur >= self.block_cursor.block_len() {
|
||||
fn advance(&mut self) -> DocId {
|
||||
if self.cur == COMPRESSION_BLOCK_SIZE - 1 {
|
||||
self.cur = 0;
|
||||
if !self.block_cursor.advance() {
|
||||
self.cur = COMPRESSION_BLOCK_SIZE;
|
||||
return false;
|
||||
}
|
||||
self.block_cursor.advance();
|
||||
} else {
|
||||
self.cur += 1;
|
||||
}
|
||||
true
|
||||
self.doc()
|
||||
}
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
if !self.advance() {
|
||||
return SkipResult::End;
|
||||
}
|
||||
match self.doc().cmp(&target) {
|
||||
Ordering::Equal => {
|
||||
return SkipResult::Reached;
|
||||
}
|
||||
Ordering::Greater => {
|
||||
return SkipResult::OverStep;
|
||||
}
|
||||
_ => {
|
||||
// ...
|
||||
}
|
||||
fn seek(&mut self, target: DocId) -> DocId {
|
||||
if self.doc() == target {
|
||||
return target;
|
||||
}
|
||||
self.block_cursor.seek(target);
|
||||
|
||||
// In the following, thanks to the call to advance above,
|
||||
// we know that the position is not loaded and we need
|
||||
// to skip every doc_freq we cross.
|
||||
// At this point we are on the block, that might contain our document.
|
||||
let output = self.block_cursor.docs_aligned();
|
||||
|
||||
// skip blocks until one that might contain the target
|
||||
// check if we need to go to the next block
|
||||
let mut sum_freqs_skipped: u32 = 0;
|
||||
if !self
|
||||
.block_cursor
|
||||
.docs()
|
||||
.last()
|
||||
.map(|doc| *doc >= target)
|
||||
.unwrap_or(false)
|
||||
// there should always be at least a document in the block
|
||||
// since advance returned.
|
||||
{
|
||||
// we are not in the right block.
|
||||
//
|
||||
// First compute all of the freqs skipped from the current block.
|
||||
if self.position_computer.is_some() {
|
||||
sum_freqs_skipped = self.block_cursor.freqs()[self.cur..].iter().sum();
|
||||
match self.block_cursor.skip_to(target) {
|
||||
BlockSegmentPostingsSkipResult::Success(block_skip_freqs) => {
|
||||
sum_freqs_skipped += block_skip_freqs;
|
||||
}
|
||||
BlockSegmentPostingsSkipResult::Terminated => {
|
||||
return SkipResult::End;
|
||||
}
|
||||
}
|
||||
} else if self.block_cursor.skip_to(target)
|
||||
== BlockSegmentPostingsSkipResult::Terminated
|
||||
{
|
||||
// no positions needed. no need to sum freqs.
|
||||
return SkipResult::End;
|
||||
}
|
||||
self.cur = 0;
|
||||
}
|
||||
self.cur = self.block_searcher.search_in_block(&output, target);
|
||||
|
||||
let cur = self.cur;
|
||||
|
||||
// we're in the right block now, start with an exponential search
|
||||
let (output, len) = self.block_cursor.docs_aligned();
|
||||
let new_cur = self
|
||||
.block_searcher
|
||||
.search_in_block(&output, len, cur, target);
|
||||
if let Some(position_computer) = self.position_computer.as_mut() {
|
||||
sum_freqs_skipped += self.block_cursor.freqs()[cur..new_cur].iter().sum::<u32>();
|
||||
position_computer.add_skip(sum_freqs_skipped as usize);
|
||||
}
|
||||
self.cur = new_cur;
|
||||
// The last block is not full and padded with the value TERMINATED,
|
||||
// so that we are guaranteed to have at least doc in the block (a real one or the padding)
|
||||
// that is greater or equal to the target.
|
||||
debug_assert!(self.cur < COMPRESSION_BLOCK_SIZE);
|
||||
|
||||
// `doc` is now the first element >= `target`
|
||||
let doc = output.0[new_cur];
|
||||
|
||||
// If all docs are smaller than target the current block should be incomplemented and padded
|
||||
// with the value `TERMINATED`.
|
||||
//
|
||||
// After the search, the cursor should point to the first value of TERMINATED.
|
||||
let doc = output.0[self.cur];
|
||||
debug_assert!(doc >= target);
|
||||
if doc == target {
|
||||
SkipResult::Reached
|
||||
} else {
|
||||
SkipResult::OverStep
|
||||
}
|
||||
doc
|
||||
}
|
||||
|
||||
/// Return the current document's `DocId`.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// Will panics if called without having called advance before.
|
||||
#[inline]
|
||||
fn doc(&self) -> DocId {
|
||||
let docs = self.block_cursor.docs();
|
||||
debug_assert!(
|
||||
self.cur < docs.len(),
|
||||
"Have you forgotten to call `.advance()` at least once before calling `.doc()` ."
|
||||
);
|
||||
docs[self.cur]
|
||||
self.block_cursor.doc(self.cur)
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.len() as u32
|
||||
}
|
||||
|
||||
fn append_to_bitset(&mut self, bitset: &mut BitSet) {
|
||||
// finish the current block
|
||||
if self.advance() {
|
||||
for &doc in &self.block_cursor.docs()[self.cur..] {
|
||||
bitset.insert(doc);
|
||||
}
|
||||
// ... iterate through the remaining blocks.
|
||||
while self.block_cursor.advance() {
|
||||
for &doc in self.block_cursor.docs() {
|
||||
bitset.insert(doc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl HasLen for SegmentPostings {
|
||||
@@ -290,515 +168,52 @@ impl Postings for SegmentPostings {
|
||||
|
||||
fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
|
||||
let term_freq = self.term_freq() as usize;
|
||||
if let Some(position_comp) = self.position_computer.as_mut() {
|
||||
if let Some(position_reader) = self.position_reader.as_mut() {
|
||||
let read_offset = self.block_cursor.position_offset()
|
||||
+ (self.block_cursor.freqs()[..self.cur]
|
||||
.iter()
|
||||
.cloned()
|
||||
.sum::<u32>() as u64);
|
||||
output.resize(term_freq, 0u32);
|
||||
position_comp.positions_with_offset(offset, &mut output[..]);
|
||||
position_reader.read(read_offset, &mut output[..]);
|
||||
let mut cum = offset;
|
||||
for output_mut in output.iter_mut() {
|
||||
cum += *output_mut;
|
||||
*output_mut = cum;
|
||||
}
|
||||
} else {
|
||||
output.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// `BlockSegmentPostings` is a cursor iterating over blocks
|
||||
/// of documents.
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// While it is useful for some very specific high-performance
|
||||
/// use cases, you should prefer using `SegmentPostings` for most usage.
|
||||
pub struct BlockSegmentPostings {
|
||||
doc_decoder: BlockDecoder,
|
||||
freq_decoder: BlockDecoder,
|
||||
freq_reading_option: FreqReadingOption,
|
||||
|
||||
doc_freq: usize,
|
||||
doc_offset: DocId,
|
||||
|
||||
num_vint_docs: usize,
|
||||
|
||||
remaining_data: OwnedRead,
|
||||
skip_reader: SkipReader,
|
||||
}
|
||||
|
||||
fn split_into_skips_and_postings(
|
||||
doc_freq: u32,
|
||||
mut data: OwnedRead,
|
||||
) -> (Option<OwnedRead>, OwnedRead) {
|
||||
if doc_freq >= USE_SKIP_INFO_LIMIT {
|
||||
let skip_len = VInt::deserialize(&mut data).expect("Data corrupted").0 as usize;
|
||||
let mut postings_data = data.clone();
|
||||
postings_data.advance(skip_len);
|
||||
data.clip(skip_len);
|
||||
(Some(data), postings_data)
|
||||
} else {
|
||||
(None, data)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Eq, PartialEq)]
|
||||
pub enum BlockSegmentPostingsSkipResult {
|
||||
Terminated,
|
||||
Success(u32), //< number of term freqs to skip
|
||||
}
|
||||
|
||||
impl BlockSegmentPostings {
|
||||
pub(crate) fn from_data(
|
||||
doc_freq: u32,
|
||||
data: OwnedRead,
|
||||
record_option: IndexRecordOption,
|
||||
requested_option: IndexRecordOption,
|
||||
) -> BlockSegmentPostings {
|
||||
let freq_reading_option = match (record_option, requested_option) {
|
||||
(IndexRecordOption::Basic, _) => FreqReadingOption::NoFreq,
|
||||
(_, IndexRecordOption::Basic) => FreqReadingOption::SkipFreq,
|
||||
(_, _) => FreqReadingOption::ReadFreq,
|
||||
};
|
||||
|
||||
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, data);
|
||||
let skip_reader = match skip_data_opt {
|
||||
Some(skip_data) => SkipReader::new(skip_data, record_option),
|
||||
None => SkipReader::new(OwnedRead::new(&[][..]), record_option),
|
||||
};
|
||||
let doc_freq = doc_freq as usize;
|
||||
let num_vint_docs = doc_freq % COMPRESSION_BLOCK_SIZE;
|
||||
BlockSegmentPostings {
|
||||
num_vint_docs,
|
||||
doc_decoder: BlockDecoder::new(),
|
||||
freq_decoder: BlockDecoder::with_val(1),
|
||||
freq_reading_option,
|
||||
doc_offset: 0,
|
||||
doc_freq,
|
||||
remaining_data: postings_data,
|
||||
skip_reader,
|
||||
}
|
||||
}
|
||||
|
||||
// Resets the block segment postings on another position
|
||||
// in the postings file.
|
||||
//
|
||||
// This is useful for enumerating through a list of terms,
|
||||
// and consuming the associated posting lists while avoiding
|
||||
// reallocating a `BlockSegmentPostings`.
|
||||
//
|
||||
// # Warning
|
||||
//
|
||||
// This does not reset the positions list.
|
||||
pub(crate) fn reset(&mut self, doc_freq: u32, postings_data: OwnedRead) {
|
||||
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, postings_data);
|
||||
let num_vint_docs = (doc_freq as usize) & (COMPRESSION_BLOCK_SIZE - 1);
|
||||
self.num_vint_docs = num_vint_docs;
|
||||
self.remaining_data = postings_data;
|
||||
if let Some(skip_data) = skip_data_opt {
|
||||
self.skip_reader.reset(skip_data);
|
||||
} else {
|
||||
self.skip_reader.reset(OwnedRead::new(&[][..]))
|
||||
}
|
||||
self.doc_offset = 0;
|
||||
self.doc_freq = doc_freq as usize;
|
||||
}
|
||||
|
||||
/// Returns the document frequency associated to this block postings.
|
||||
///
|
||||
/// This `doc_freq` is simply the sum of the length of all of the blocks
|
||||
/// length, and it does not take in account deleted documents.
|
||||
pub fn doc_freq(&self) -> usize {
|
||||
self.doc_freq
|
||||
}
|
||||
|
||||
/// Returns the array of docs in the current block.
|
||||
///
|
||||
/// Before the first call to `.advance()`, the block
|
||||
/// returned by `.docs()` is empty.
|
||||
#[inline]
|
||||
pub fn docs(&self) -> &[DocId] {
|
||||
self.doc_decoder.output_array()
|
||||
}
|
||||
|
||||
pub(crate) fn docs_aligned(&self) -> (&AlignedBuffer, usize) {
|
||||
self.doc_decoder.output_aligned()
|
||||
}
|
||||
|
||||
/// Return the document at index `idx` of the block.
|
||||
#[inline]
|
||||
pub fn doc(&self, idx: usize) -> u32 {
|
||||
self.doc_decoder.output(idx)
|
||||
}
|
||||
|
||||
/// Return the array of `term freq` in the block.
|
||||
#[inline]
|
||||
pub fn freqs(&self) -> &[u32] {
|
||||
self.freq_decoder.output_array()
|
||||
}
|
||||
|
||||
/// Return the frequency at index `idx` of the block.
|
||||
#[inline]
|
||||
pub fn freq(&self, idx: usize) -> u32 {
|
||||
self.freq_decoder.output(idx)
|
||||
}
|
||||
|
||||
/// Returns the length of the current block.
|
||||
///
|
||||
/// All blocks have a length of `NUM_DOCS_PER_BLOCK`,
|
||||
/// except the last block that may have a length
|
||||
/// of any number between 1 and `NUM_DOCS_PER_BLOCK - 1`
|
||||
#[inline]
|
||||
fn block_len(&self) -> usize {
|
||||
self.doc_decoder.output_len
|
||||
}
|
||||
|
||||
/// position on a block that may contains `doc_id`.
|
||||
/// Always advance the current block.
|
||||
///
|
||||
/// Returns true if a block that has an element greater or equal to the target is found.
|
||||
/// Returning true does not guarantee that the smallest element of the block is smaller
|
||||
/// than the target. It only guarantees that the last element is greater or equal.
|
||||
///
|
||||
/// Returns false iff all of the document remaining are smaller than
|
||||
/// `doc_id`. In that case, all of these document are consumed.
|
||||
///
|
||||
pub fn skip_to(&mut self, target_doc: DocId) -> BlockSegmentPostingsSkipResult {
|
||||
let mut skip_freqs = 0u32;
|
||||
while self.skip_reader.advance() {
|
||||
if self.skip_reader.doc() >= target_doc {
|
||||
// the last document of the current block is larger
|
||||
// than the target.
|
||||
//
|
||||
// We found our block!
|
||||
let num_bits = self.skip_reader.doc_num_bits();
|
||||
let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted(
|
||||
self.remaining_data.as_ref(),
|
||||
self.doc_offset,
|
||||
num_bits,
|
||||
);
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
let tf_num_bits = self.skip_reader.tf_num_bits();
|
||||
match self.freq_reading_option {
|
||||
FreqReadingOption::NoFreq => {}
|
||||
FreqReadingOption::SkipFreq => {
|
||||
let num_bytes_to_skip = compressed_block_size(tf_num_bits);
|
||||
self.remaining_data.advance(num_bytes_to_skip);
|
||||
}
|
||||
FreqReadingOption::ReadFreq => {
|
||||
let num_consumed_bytes = self
|
||||
.freq_decoder
|
||||
.uncompress_block_unsorted(self.remaining_data.as_ref(), tf_num_bits);
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
}
|
||||
}
|
||||
self.doc_offset = self.skip_reader.doc();
|
||||
return BlockSegmentPostingsSkipResult::Success(skip_freqs);
|
||||
} else {
|
||||
skip_freqs += self.skip_reader.tf_sum();
|
||||
let advance_len = self.skip_reader.total_block_len();
|
||||
self.doc_offset = self.skip_reader.doc();
|
||||
self.remaining_data.advance(advance_len);
|
||||
}
|
||||
}
|
||||
|
||||
// we are now on the last, incomplete, variable encoded block.
|
||||
if self.num_vint_docs > 0 {
|
||||
let num_compressed_bytes = self.doc_decoder.uncompress_vint_sorted(
|
||||
self.remaining_data.as_ref(),
|
||||
self.doc_offset,
|
||||
self.num_vint_docs,
|
||||
);
|
||||
self.remaining_data.advance(num_compressed_bytes);
|
||||
match self.freq_reading_option {
|
||||
FreqReadingOption::NoFreq | FreqReadingOption::SkipFreq => {}
|
||||
FreqReadingOption::ReadFreq => {
|
||||
self.freq_decoder
|
||||
.uncompress_vint_unsorted(self.remaining_data.as_ref(), self.num_vint_docs);
|
||||
}
|
||||
}
|
||||
self.num_vint_docs = 0;
|
||||
return self
|
||||
.docs()
|
||||
.last()
|
||||
.map(|last_doc| {
|
||||
if *last_doc >= target_doc {
|
||||
BlockSegmentPostingsSkipResult::Success(skip_freqs)
|
||||
} else {
|
||||
BlockSegmentPostingsSkipResult::Terminated
|
||||
}
|
||||
})
|
||||
.unwrap_or(BlockSegmentPostingsSkipResult::Terminated);
|
||||
}
|
||||
BlockSegmentPostingsSkipResult::Terminated
|
||||
}
|
||||
|
||||
/// Advance to the next block.
|
||||
///
|
||||
/// Returns false iff there was no remaining blocks.
|
||||
pub fn advance(&mut self) -> bool {
|
||||
if self.skip_reader.advance() {
|
||||
let num_bits = self.skip_reader.doc_num_bits();
|
||||
let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted(
|
||||
self.remaining_data.as_ref(),
|
||||
self.doc_offset,
|
||||
num_bits,
|
||||
);
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
let tf_num_bits = self.skip_reader.tf_num_bits();
|
||||
match self.freq_reading_option {
|
||||
FreqReadingOption::NoFreq => {}
|
||||
FreqReadingOption::SkipFreq => {
|
||||
let num_bytes_to_skip = compressed_block_size(tf_num_bits);
|
||||
self.remaining_data.advance(num_bytes_to_skip);
|
||||
}
|
||||
FreqReadingOption::ReadFreq => {
|
||||
let num_consumed_bytes = self
|
||||
.freq_decoder
|
||||
.uncompress_block_unsorted(self.remaining_data.as_ref(), tf_num_bits);
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
}
|
||||
}
|
||||
// it will be used as the next offset.
|
||||
self.doc_offset = self.doc_decoder.output(COMPRESSION_BLOCK_SIZE - 1);
|
||||
true
|
||||
} else if self.num_vint_docs > 0 {
|
||||
let num_compressed_bytes = self.doc_decoder.uncompress_vint_sorted(
|
||||
self.remaining_data.as_ref(),
|
||||
self.doc_offset,
|
||||
self.num_vint_docs,
|
||||
);
|
||||
self.remaining_data.advance(num_compressed_bytes);
|
||||
match self.freq_reading_option {
|
||||
FreqReadingOption::NoFreq | FreqReadingOption::SkipFreq => {}
|
||||
FreqReadingOption::ReadFreq => {
|
||||
self.freq_decoder
|
||||
.uncompress_vint_unsorted(self.remaining_data.as_ref(), self.num_vint_docs);
|
||||
}
|
||||
}
|
||||
self.num_vint_docs = 0;
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an empty segment postings object
|
||||
pub fn empty() -> BlockSegmentPostings {
|
||||
BlockSegmentPostings {
|
||||
num_vint_docs: 0,
|
||||
|
||||
doc_decoder: BlockDecoder::new(),
|
||||
freq_decoder: BlockDecoder::with_val(1),
|
||||
freq_reading_option: FreqReadingOption::NoFreq,
|
||||
|
||||
doc_offset: 0,
|
||||
doc_freq: 0,
|
||||
|
||||
remaining_data: OwnedRead::new(vec![]),
|
||||
skip_reader: SkipReader::new(OwnedRead::new(vec![]), IndexRecordOption::Basic),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'b> Streamer<'b> for BlockSegmentPostings {
|
||||
type Item = &'b [DocId];
|
||||
|
||||
fn next(&'b mut self) -> Option<&'b [DocId]> {
|
||||
if self.advance() {
|
||||
Some(self.docs())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::BlockSegmentPostings;
|
||||
use super::BlockSegmentPostingsSkipResult;
|
||||
|
||||
use super::SegmentPostings;
|
||||
use crate::common::HasLen;
|
||||
use crate::core::Index;
|
||||
use crate::docset::DocSet;
|
||||
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::postings::postings::Postings;
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::schema::Schema;
|
||||
use crate::schema::Term;
|
||||
use crate::schema::INDEXED;
|
||||
use crate::DocId;
|
||||
use crate::SkipResult;
|
||||
use tantivy_fst::Streamer;
|
||||
|
||||
#[test]
|
||||
fn test_empty_segment_postings() {
|
||||
let mut postings = SegmentPostings::empty();
|
||||
assert!(!postings.advance());
|
||||
assert!(!postings.advance());
|
||||
assert_eq!(postings.advance(), TERMINATED);
|
||||
assert_eq!(postings.advance(), TERMINATED);
|
||||
assert_eq!(postings.len(), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic(expected = "Have you forgotten to call `.advance()`")]
|
||||
fn test_panic_if_doc_called_before_advance() {
|
||||
SegmentPostings::empty().doc();
|
||||
fn test_empty_postings_doc_returns_terminated() {
|
||||
let mut postings = SegmentPostings::empty();
|
||||
assert_eq!(postings.doc(), TERMINATED);
|
||||
assert_eq!(postings.advance(), TERMINATED);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic(expected = "Have you forgotten to call `.advance()`")]
|
||||
fn test_panic_if_freq_called_before_advance() {
|
||||
SegmentPostings::empty().term_freq();
|
||||
}
|
||||
|
||||
#[test]
fn test_empty_block_segment_postings() {
    // An empty postings object has no block to advance to and no document.
    let mut empty_postings = BlockSegmentPostings::empty();
    let has_block = empty_postings.advance();
    assert!(!has_block);
    assert_eq!(empty_postings.doc_freq(), 0);
}
|
||||
|
||||
#[test]
fn test_block_segment_postings() {
    let doc_ids: Vec<u32> = (0..100_000).collect();
    let mut block_postings = build_block_postings(&doc_ids);
    // Before the first call to advance, the block is empty.
    assert!(block_postings.docs().is_empty());
    // The `doc_freq` must be known upfront.
    assert_eq!(block_postings.doc_freq(), 100_000);
    // Going through the blocks must yield 0, 1, 2, ... without any gap.
    let mut expected_doc: u32 = 0u32;
    while let Some(block) = block_postings.next() {
        for &doc in block {
            assert_eq!(expected_doc, doc);
            expected_doc += 1;
        }
    }
}
|
||||
|
||||
#[test]
fn test_skip_right_at_new_block() {
    // Docs 0..=127, then 129 and 130.
    // NOTE(review): the first 128 docs presumably fill exactly one
    // bit-packed block -- confirm against COMPRESSION_BLOCK_SIZE.
    let doc_ids: Vec<u32> = (0..128).chain(129..=130).collect();
    let new_docset =
        || SegmentPostings::from_block_postings(build_block_postings(&doc_ids), None);

    // 128 is absent: we overstep to 129.
    {
        let mut docset = new_docset();
        assert_eq!(docset.skip_next(128), SkipResult::OverStep);
        assert_eq!(docset.doc(), 129);
        assert!(docset.advance());
        assert_eq!(docset.doc(), 130);
        assert!(!docset.advance());
    }

    // 129 is present: it is reached exactly.
    {
        let mut docset = new_docset();
        assert_eq!(docset.skip_next(129), SkipResult::Reached);
        assert_eq!(docset.doc(), 129);
        assert!(docset.advance());
        assert_eq!(docset.doc(), 130);
        assert!(!docset.advance());
    }

    // 131 is beyond the last doc: the docset is exhausted.
    {
        let mut docset = new_docset();
        assert_eq!(docset.skip_next(131), SkipResult::End);
    }
}
|
||||
|
||||
fn build_block_postings(docs: &[DocId]) -> BlockSegmentPostings {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let int_field = schema_builder.add_u64_field("id", INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
let mut last_doc = 0u32;
|
||||
for &doc in docs {
|
||||
for _ in last_doc..doc {
|
||||
index_writer.add_document(doc!(int_field=>1u64));
|
||||
}
|
||||
index_writer.add_document(doc!(int_field=>0u64));
|
||||
last_doc = doc + 1;
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let inverted_index = segment_reader.inverted_index(int_field);
|
||||
let term = Term::from_field_u64(int_field, 0u64);
|
||||
let term_info = inverted_index.get_term_info(&term).unwrap();
|
||||
inverted_index.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)
|
||||
}
|
||||
|
||||
#[test]
fn test_block_segment_postings_skip() {
    // The postings hold the single doc 3. Any target up to 3 finds its
    // block on the first call, and nothing is left for a second call.
    for target in 0..4 {
        let mut block_postings = build_block_postings(&[3]);
        let first_skip = block_postings.skip_to(target);
        assert_eq!(first_skip, BlockSegmentPostingsSkipResult::Success(0u32));
        let second_skip = block_postings.skip_to(target);
        assert_eq!(second_skip, BlockSegmentPostingsSkipResult::Terminated);
    }

    // A target beyond the last doc terminates right away.
    let mut block_postings = build_block_postings(&[3]);
    let beyond_last_doc = block_postings.skip_to(4u32);
    assert_eq!(beyond_last_doc, BlockSegmentPostingsSkipResult::Terminated);
}
|
||||
|
||||
#[test]
fn test_block_segment_postings_skip2() {
    let mut docs = vec![0];
    docs.extend((0..1300).map(|i| (i * i / 100) + i));
    let mut block_postings = build_block_postings(&docs[..]);

    // Each target must land in a block that straddles it.
    for &target in [0, 424, 10000].iter() {
        assert_eq!(
            block_postings.skip_to(target),
            BlockSegmentPostingsSkipResult::Success(0u32)
        );
        let block_docs = block_postings.docs();
        assert!(block_docs[0] <= target);
        assert!(block_docs.last().cloned().unwrap_or(0u32) >= target);
    }

    // Targets beyond the last doc exhaust the postings, and keep doing so.
    for &target in [100_000, 101_000].iter() {
        assert_eq!(
            block_postings.skip_to(target),
            BlockSegmentPostingsSkipResult::Terminated
        );
    }
}
|
||||
|
||||
#[test]
|
||||
fn test_reset_block_segment_postings() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let int_field = schema_builder.add_u64_field("id", INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
// create two postings list, one containg even number,
|
||||
// the other containing odd numbers.
|
||||
for i in 0..6 {
|
||||
let doc = doc!(int_field=> (i % 2) as u64);
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
|
||||
let mut block_segments;
|
||||
{
|
||||
let term = Term::from_field_u64(int_field, 0u64);
|
||||
let inverted_index = segment_reader.inverted_index(int_field);
|
||||
let term_info = inverted_index.get_term_info(&term).unwrap();
|
||||
block_segments = inverted_index
|
||||
.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic);
|
||||
}
|
||||
assert!(block_segments.advance());
|
||||
assert_eq!(block_segments.docs(), &[0, 2, 4]);
|
||||
{
|
||||
let term = Term::from_field_u64(int_field, 1u64);
|
||||
let inverted_index = segment_reader.inverted_index(int_field);
|
||||
let term_info = inverted_index.get_term_info(&term).unwrap();
|
||||
inverted_index.reset_block_postings_from_terminfo(&term_info, &mut block_segments);
|
||||
}
|
||||
assert!(block_segments.advance());
|
||||
assert_eq!(block_segments.docs(), &[1, 3, 5]);
|
||||
fn test_empty_postings_doc_term_freq_returns_0() {
|
||||
let postings = SegmentPostings::empty();
|
||||
assert_eq!(postings.term_freq(), 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,7 +6,6 @@ use crate::directory::WritePtr;
|
||||
use crate::positions::PositionSerializer;
|
||||
use crate::postings::compression::{BlockEncoder, VIntEncoder, COMPRESSION_BLOCK_SIZE};
|
||||
use crate::postings::skip::SkipSerializer;
|
||||
use crate::postings::USE_SKIP_INFO_LIMIT;
|
||||
use crate::schema::Schema;
|
||||
use crate::schema::{Field, FieldEntry, FieldType};
|
||||
use crate::termdict::{TermDictionaryBuilder, TermOrdinal};
|
||||
@@ -391,7 +390,7 @@ impl<W: Write> PostingsSerializer<W> {
|
||||
}
|
||||
self.block.clear();
|
||||
}
|
||||
if doc_freq >= USE_SKIP_INFO_LIMIT {
|
||||
if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 {
|
||||
let skip_data = self.skip_write.data();
|
||||
VInt(skip_data.len() as u64).serialize(&mut self.output_write)?;
|
||||
self.output_write.write_all(skip_data)?;
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
use crate::common::BinarySerializable;
|
||||
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
|
||||
use crate::directory::ReadOnlySource;
|
||||
use crate::postings::compression::{compressed_block_size, COMPRESSION_BLOCK_SIZE};
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::DocId;
|
||||
use crate::{DocId, TERMINATED};
|
||||
use owned_read::OwnedRead;
|
||||
|
||||
pub struct SkipSerializer {
|
||||
@@ -50,80 +51,143 @@ impl SkipSerializer {
|
||||
}
|
||||
|
||||
pub(crate) struct SkipReader {
|
||||
doc: DocId,
|
||||
last_doc_in_block: DocId,
|
||||
pub(crate) last_doc_in_previous_block: DocId,
|
||||
owned_read: OwnedRead,
|
||||
doc_num_bits: u8,
|
||||
tf_num_bits: u8,
|
||||
tf_sum: u32,
|
||||
skip_info: IndexRecordOption,
|
||||
byte_offset: usize,
|
||||
remaining_docs: u32, // number of docs remaining, including the
|
||||
// documents in the current block.
|
||||
block_info: BlockInfo,
|
||||
|
||||
position_offset: u64,
|
||||
}
|
||||
|
||||
/// Description of a block of a posting list, as exposed by the skip reader.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum BlockInfo {
    /// A bit-packed block.
    BitPacked {
        /// Number of bits used for the doc ids of the block.
        doc_num_bits: u8,
        /// Number of bits used for the term frequencies of the block
        /// (0 when the skip data carries no term frequency information).
        tf_num_bits: u8,
        /// Sum of the term frequencies of the block
        /// (0 when the skip data does not store it).
        tf_sum: u32,
    },
    /// The trailing variable-int encoded block. The payload is the number of
    /// documents it holds.
    VInt(u32),
}
|
||||
|
||||
impl Default for BlockInfo {
|
||||
fn default() -> Self {
|
||||
BlockInfo::VInt(0)
|
||||
}
|
||||
}
|
||||
|
||||
impl SkipReader {
|
||||
pub fn new(data: OwnedRead, skip_info: IndexRecordOption) -> SkipReader {
|
||||
pub fn new(data: ReadOnlySource, doc_freq: u32, skip_info: IndexRecordOption) -> SkipReader {
|
||||
SkipReader {
|
||||
doc: 0u32,
|
||||
owned_read: data,
|
||||
last_doc_in_block: 0u32,
|
||||
last_doc_in_previous_block: 0u32,
|
||||
owned_read: OwnedRead::new(data),
|
||||
skip_info,
|
||||
doc_num_bits: 0u8,
|
||||
tf_num_bits: 0u8,
|
||||
tf_sum: 0u32,
|
||||
block_info: BlockInfo::default(),
|
||||
byte_offset: 0,
|
||||
remaining_docs: doc_freq,
|
||||
position_offset: 0u64,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn reset(&mut self, data: OwnedRead) {
|
||||
self.doc = 0u32;
|
||||
self.owned_read = data;
|
||||
self.doc_num_bits = 0u8;
|
||||
self.tf_num_bits = 0u8;
|
||||
self.tf_sum = 0u32;
|
||||
pub fn reset(&mut self, data: ReadOnlySource, doc_freq: u32) {
|
||||
self.last_doc_in_block = 0u32;
|
||||
self.last_doc_in_previous_block = 0u32;
|
||||
self.owned_read = OwnedRead::new(data);
|
||||
self.block_info = BlockInfo::default();
|
||||
self.byte_offset = 0;
|
||||
self.remaining_docs = doc_freq;
|
||||
}
|
||||
|
||||
pub fn total_block_len(&self) -> usize {
|
||||
(self.doc_num_bits + self.tf_num_bits) as usize * COMPRESSION_BLOCK_SIZE / 8
|
||||
#[cfg(test)]
|
||||
#[inline(always)]
|
||||
pub(crate) fn last_doc_in_block(&self) -> DocId {
|
||||
self.last_doc_in_block
|
||||
}
|
||||
|
||||
pub fn doc(&self) -> DocId {
|
||||
self.doc
|
||||
pub fn position_offset(&self) -> u64 {
|
||||
self.position_offset
|
||||
}
|
||||
|
||||
pub fn doc_num_bits(&self) -> u8 {
|
||||
self.doc_num_bits
|
||||
pub fn byte_offset(&self) -> usize {
|
||||
self.byte_offset
|
||||
}
|
||||
|
||||
/// Number of bits used to encode term frequencies
|
||||
fn read_block_info(&mut self) {
|
||||
let doc_delta = u32::deserialize(&mut self.owned_read).expect("Skip data corrupted");
|
||||
self.last_doc_in_block += doc_delta as DocId;
|
||||
let doc_num_bits = self.owned_read.get(0);
|
||||
match self.skip_info {
|
||||
IndexRecordOption::Basic => {
|
||||
self.owned_read.advance(1);
|
||||
self.block_info = BlockInfo::BitPacked {
|
||||
doc_num_bits,
|
||||
tf_num_bits: 0,
|
||||
tf_sum: 0,
|
||||
};
|
||||
}
|
||||
IndexRecordOption::WithFreqs => {
|
||||
let tf_num_bits = self.owned_read.get(1);
|
||||
self.block_info = BlockInfo::BitPacked {
|
||||
doc_num_bits,
|
||||
tf_num_bits,
|
||||
tf_sum: 0,
|
||||
};
|
||||
self.owned_read.advance(2);
|
||||
}
|
||||
IndexRecordOption::WithFreqsAndPositions => {
|
||||
let tf_num_bits = self.owned_read.get(1);
|
||||
self.owned_read.advance(2);
|
||||
let tf_sum = u32::deserialize(&mut self.owned_read).expect("Failed reading tf_sum");
|
||||
self.block_info = BlockInfo::BitPacked {
|
||||
doc_num_bits,
|
||||
tf_num_bits,
|
||||
tf_sum,
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn block_info(&self) -> BlockInfo {
|
||||
self.block_info
|
||||
}
|
||||
|
||||
/// Advance the skip reader to the block that may contain the target.
|
||||
///
|
||||
/// 0 if term frequencies are not enabled.
|
||||
pub fn tf_num_bits(&self) -> u8 {
|
||||
self.tf_num_bits
|
||||
}
|
||||
|
||||
pub fn tf_sum(&self) -> u32 {
|
||||
self.tf_sum
|
||||
/// If the target is larger than all documents, the skip_reader
|
||||
/// then advance to the last Variable In block.
|
||||
pub fn seek(&mut self, target: DocId) {
    loop {
        // Stop on the first block whose last doc is not below the target.
        if self.last_doc_in_block >= target {
            break;
        }
        self.advance();
    }
}
|
||||
|
||||
pub fn advance(&mut self) -> bool {
|
||||
if self.owned_read.as_ref().is_empty() {
|
||||
false
|
||||
} else {
|
||||
let doc_delta = u32::deserialize(&mut self.owned_read).expect("Skip data corrupted");
|
||||
self.doc += doc_delta as DocId;
|
||||
self.doc_num_bits = self.owned_read.get(0);
|
||||
match self.skip_info {
|
||||
IndexRecordOption::Basic => {
|
||||
self.owned_read.advance(1);
|
||||
}
|
||||
IndexRecordOption::WithFreqs => {
|
||||
self.tf_num_bits = self.owned_read.get(1);
|
||||
self.owned_read.advance(2);
|
||||
}
|
||||
IndexRecordOption::WithFreqsAndPositions => {
|
||||
self.tf_num_bits = self.owned_read.get(1);
|
||||
self.owned_read.advance(2);
|
||||
self.tf_sum =
|
||||
u32::deserialize(&mut self.owned_read).expect("Failed reading tf_sum");
|
||||
}
|
||||
match self.block_info {
|
||||
BlockInfo::BitPacked {
|
||||
doc_num_bits,
|
||||
tf_num_bits,
|
||||
tf_sum,
|
||||
} => {
|
||||
self.remaining_docs -= COMPRESSION_BLOCK_SIZE as u32;
|
||||
self.byte_offset += compressed_block_size(doc_num_bits + tf_num_bits);
|
||||
self.position_offset += tf_sum as u64;
|
||||
}
|
||||
BlockInfo::VInt(num_vint_docs) => {
|
||||
self.remaining_docs -= num_vint_docs;
|
||||
}
|
||||
}
|
||||
self.last_doc_in_previous_block = self.last_doc_in_block;
|
||||
if self.remaining_docs >= COMPRESSION_BLOCK_SIZE as u32 {
|
||||
self.read_block_info();
|
||||
true
|
||||
} else {
|
||||
self.last_doc_in_block = TERMINATED;
|
||||
self.block_info = BlockInfo::VInt(self.remaining_docs);
|
||||
self.remaining_docs > 0
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -131,9 +195,11 @@ impl SkipReader {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::BlockInfo;
|
||||
use super::IndexRecordOption;
|
||||
use super::{SkipReader, SkipSerializer};
|
||||
use owned_read::OwnedRead;
|
||||
use crate::directory::ReadOnlySource;
|
||||
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
|
||||
|
||||
#[test]
|
||||
fn test_skip_with_freq() {
|
||||
@@ -145,15 +211,34 @@ mod tests {
|
||||
skip_serializer.write_term_freq(2u8);
|
||||
skip_serializer.data().to_owned()
|
||||
};
|
||||
let mut skip_reader = SkipReader::new(OwnedRead::new(buf), IndexRecordOption::WithFreqs);
|
||||
let doc_freq = 3u32 + (COMPRESSION_BLOCK_SIZE * 2) as u32;
|
||||
let mut skip_reader = SkipReader::new(
|
||||
ReadOnlySource::new(buf),
|
||||
doc_freq,
|
||||
IndexRecordOption::WithFreqs,
|
||||
);
|
||||
assert!(skip_reader.advance());
|
||||
assert_eq!(skip_reader.doc(), 1u32);
|
||||
assert_eq!(skip_reader.doc_num_bits(), 2u8);
|
||||
assert_eq!(skip_reader.tf_num_bits(), 3u8);
|
||||
assert_eq!(skip_reader.last_doc_in_block(), 1u32);
|
||||
assert_eq!(
|
||||
skip_reader.block_info(),
|
||||
BlockInfo::BitPacked {
|
||||
doc_num_bits: 2u8,
|
||||
tf_num_bits: 3u8,
|
||||
tf_sum: 0
|
||||
}
|
||||
);
|
||||
assert!(skip_reader.advance());
|
||||
assert_eq!(skip_reader.doc(), 5u32);
|
||||
assert_eq!(skip_reader.doc_num_bits(), 5u8);
|
||||
assert_eq!(skip_reader.tf_num_bits(), 2u8);
|
||||
assert_eq!(skip_reader.last_doc_in_block(), 5u32);
|
||||
assert_eq!(
|
||||
skip_reader.block_info(),
|
||||
BlockInfo::BitPacked {
|
||||
doc_num_bits: 5u8,
|
||||
tf_num_bits: 2u8,
|
||||
tf_sum: 0
|
||||
}
|
||||
);
|
||||
assert!(skip_reader.advance());
|
||||
assert_eq!(skip_reader.block_info(), BlockInfo::VInt(3u32));
|
||||
assert!(!skip_reader.advance());
|
||||
}
|
||||
|
||||
@@ -165,13 +250,60 @@ mod tests {
|
||||
skip_serializer.write_doc(5u32, 5u8);
|
||||
skip_serializer.data().to_owned()
|
||||
};
|
||||
let mut skip_reader = SkipReader::new(OwnedRead::new(buf), IndexRecordOption::Basic);
|
||||
let doc_freq = 3u32 + (COMPRESSION_BLOCK_SIZE * 2) as u32;
|
||||
let mut skip_reader = SkipReader::new(
|
||||
ReadOnlySource::from(buf),
|
||||
doc_freq,
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
assert!(skip_reader.advance());
|
||||
assert_eq!(skip_reader.doc(), 1u32);
|
||||
assert_eq!(skip_reader.doc_num_bits(), 2u8);
|
||||
assert_eq!(skip_reader.last_doc_in_block(), 1u32);
|
||||
assert_eq!(
|
||||
skip_reader.block_info(),
|
||||
BlockInfo::BitPacked {
|
||||
doc_num_bits: 2u8,
|
||||
tf_num_bits: 0,
|
||||
tf_sum: 0u32
|
||||
}
|
||||
);
|
||||
assert!(skip_reader.advance());
|
||||
assert_eq!(skip_reader.doc(), 5u32);
|
||||
assert_eq!(skip_reader.doc_num_bits(), 5u8);
|
||||
assert_eq!(skip_reader.last_doc_in_block(), 5u32);
|
||||
assert_eq!(
|
||||
skip_reader.block_info(),
|
||||
BlockInfo::BitPacked {
|
||||
doc_num_bits: 5u8,
|
||||
tf_num_bits: 0,
|
||||
tf_sum: 0u32
|
||||
}
|
||||
);
|
||||
assert!(skip_reader.advance());
|
||||
assert_eq!(skip_reader.block_info(), BlockInfo::VInt(3u32));
|
||||
assert!(!skip_reader.advance());
|
||||
}
|
||||
|
||||
#[test]
fn test_skip_multiple_of_block_size() {
    // Skip data describing a single block.
    let mut skip_serializer = SkipSerializer::new();
    skip_serializer.write_doc(1u32, 2u8);
    let skip_data = skip_serializer.data().to_owned();

    // The doc freq is exactly one block: no trailing vint block is expected.
    let mut skip_reader = SkipReader::new(
        ReadOnlySource::from(skip_data),
        COMPRESSION_BLOCK_SIZE as u32,
        IndexRecordOption::Basic,
    );

    assert!(skip_reader.advance());
    assert_eq!(skip_reader.last_doc_in_block(), 1u32);
    let expected_block_info = BlockInfo::BitPacked {
        doc_num_bits: 2u8,
        tf_num_bits: 0,
        tf_sum: 0u32,
    };
    assert_eq!(skip_reader.block_info(), expected_block_info);
    assert!(!skip_reader.advance());
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
use murmurhash32;
|
||||
|
||||
use self::murmurhash32::murmurhash2;
|
||||
use murmurhash32::murmurhash2;
|
||||
|
||||
use super::{Addr, MemoryArena};
|
||||
use crate::postings::stacker::memory_arena::store;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use crate::core::Searcher;
|
||||
use crate::core::SegmentReader;
|
||||
use crate::docset::DocSet;
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::query::boost_query::BoostScorer;
|
||||
use crate::query::explanation::does_not_match;
|
||||
use crate::query::{Explanation, Query, Scorer, Weight};
|
||||
@@ -25,7 +25,6 @@ pub struct AllWeight;
|
||||
impl Weight for AllWeight {
|
||||
fn scorer(&self, reader: &SegmentReader, boost: f32) -> crate::Result<Box<dyn Scorer>> {
|
||||
let all_scorer = AllScorer {
|
||||
state: State::NotStarted,
|
||||
doc: 0u32,
|
||||
max_doc: reader.max_doc(),
|
||||
};
|
||||
@@ -40,39 +39,20 @@ impl Weight for AllWeight {
|
||||
}
|
||||
}
|
||||
|
||||
enum State {
|
||||
NotStarted,
|
||||
Started,
|
||||
Finished,
|
||||
}
|
||||
|
||||
/// Scorer associated to the `AllQuery` query.
|
||||
pub struct AllScorer {
|
||||
state: State,
|
||||
doc: DocId,
|
||||
max_doc: DocId,
|
||||
}
|
||||
|
||||
impl DocSet for AllScorer {
|
||||
fn advance(&mut self) -> bool {
|
||||
match self.state {
|
||||
State::NotStarted => {
|
||||
self.state = State::Started;
|
||||
self.doc = 0;
|
||||
}
|
||||
State::Started => {
|
||||
self.doc += 1u32;
|
||||
}
|
||||
State::Finished => {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if self.doc < self.max_doc {
|
||||
true
|
||||
} else {
|
||||
self.state = State::Finished;
|
||||
false
|
||||
fn advance(&mut self) -> DocId {
|
||||
if self.doc + 1 >= self.max_doc {
|
||||
self.doc = TERMINATED;
|
||||
return TERMINATED;
|
||||
}
|
||||
self.doc += 1;
|
||||
self.doc
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
@@ -93,6 +73,7 @@ impl Scorer for AllScorer {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::AllQuery;
|
||||
use crate::docset::TERMINATED;
|
||||
use crate::query::Query;
|
||||
use crate::schema::{Schema, TEXT};
|
||||
use crate::Index;
|
||||
@@ -120,18 +101,16 @@ mod tests {
|
||||
{
|
||||
let reader = searcher.segment_reader(0);
|
||||
let mut scorer = weight.scorer(reader, 1.0f32).unwrap();
|
||||
assert!(scorer.advance());
|
||||
assert_eq!(scorer.doc(), 0u32);
|
||||
assert!(scorer.advance());
|
||||
assert_eq!(scorer.advance(), 1u32);
|
||||
assert_eq!(scorer.doc(), 1u32);
|
||||
assert!(!scorer.advance());
|
||||
assert_eq!(scorer.advance(), TERMINATED);
|
||||
}
|
||||
{
|
||||
let reader = searcher.segment_reader(1);
|
||||
let mut scorer = weight.scorer(reader, 1.0f32).unwrap();
|
||||
assert!(scorer.advance());
|
||||
assert_eq!(scorer.doc(), 0u32);
|
||||
assert!(!scorer.advance());
|
||||
assert_eq!(scorer.advance(), TERMINATED);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -144,13 +123,11 @@ mod tests {
|
||||
let reader = searcher.segment_reader(0);
|
||||
{
|
||||
let mut scorer = weight.scorer(reader, 2.0f32).unwrap();
|
||||
assert!(scorer.advance());
|
||||
assert_eq!(scorer.doc(), 0u32);
|
||||
assert_eq!(scorer.score(), 2.0f32);
|
||||
}
|
||||
{
|
||||
let mut scorer = weight.scorer(reader, 1.5f32).unwrap();
|
||||
assert!(scorer.advance());
|
||||
assert_eq!(scorer.doc(), 0u32);
|
||||
assert_eq!(scorer.score(), 1.5f32);
|
||||
}
|
||||
|
||||
@@ -6,8 +6,8 @@ use crate::query::{Scorer, Weight};
|
||||
use crate::schema::{Field, IndexRecordOption};
|
||||
use crate::termdict::{TermDictionary, TermStreamer};
|
||||
use crate::DocId;
|
||||
use crate::Result;
|
||||
use crate::TantivyError;
|
||||
use crate::{Result, SkipResult};
|
||||
use std::sync::Arc;
|
||||
use tantivy_fst::Automaton;
|
||||
|
||||
@@ -51,10 +51,13 @@ where
|
||||
let term_info = term_stream.value();
|
||||
let mut block_segment_postings = inverted_index
|
||||
.read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic);
|
||||
while block_segment_postings.advance() {
|
||||
loop {
|
||||
for &doc in block_segment_postings.docs() {
|
||||
doc_bitset.insert(doc);
|
||||
}
|
||||
if !block_segment_postings.advance() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
let doc_bitset = BitSetDocSet::from(doc_bitset);
|
||||
@@ -64,7 +67,7 @@ where
|
||||
|
||||
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
|
||||
let mut scorer = self.scorer(reader, 1.0f32)?;
|
||||
if scorer.skip_next(doc) == SkipResult::Reached {
|
||||
if scorer.seek(doc) == doc {
|
||||
Ok(Explanation::new("AutomatonScorer", 1.0f32))
|
||||
} else {
|
||||
Err(TantivyError::InvalidArgument(
|
||||
@@ -77,6 +80,7 @@ where
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::AutomatonWeight;
|
||||
use crate::docset::TERMINATED;
|
||||
use crate::query::Weight;
|
||||
use crate::schema::{Schema, STRING};
|
||||
use crate::Index;
|
||||
@@ -141,13 +145,12 @@ mod tests {
|
||||
let mut scorer = automaton_weight
|
||||
.scorer(searcher.segment_reader(0u32), 1.0f32)
|
||||
.unwrap();
|
||||
assert!(scorer.advance());
|
||||
assert_eq!(scorer.doc(), 0u32);
|
||||
assert_eq!(scorer.score(), 1.0f32);
|
||||
assert!(scorer.advance());
|
||||
assert_eq!(scorer.advance(), 2u32);
|
||||
assert_eq!(scorer.doc(), 2u32);
|
||||
assert_eq!(scorer.score(), 1.0f32);
|
||||
assert!(!scorer.advance());
|
||||
assert_eq!(scorer.advance(), TERMINATED);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -160,7 +163,6 @@ mod tests {
|
||||
let mut scorer = automaton_weight
|
||||
.scorer(searcher.segment_reader(0u32), 1.32f32)
|
||||
.unwrap();
|
||||
assert!(scorer.advance());
|
||||
assert_eq!(scorer.doc(), 0u32);
|
||||
assert_eq!(scorer.score(), 1.32f32);
|
||||
}
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
use crate::common::{BitSet, TinySet};
|
||||
use crate::docset::{DocSet, SkipResult};
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::DocId;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
/// A `BitSetDocSet` makes it possible to iterate through a bitset as if it was a `DocSet`.
|
||||
///
|
||||
@@ -33,75 +32,50 @@ impl From<BitSet> for BitSetDocSet {
|
||||
} else {
|
||||
docs.tinyset(0)
|
||||
};
|
||||
BitSetDocSet {
|
||||
let mut docset = BitSetDocSet {
|
||||
docs,
|
||||
cursor_bucket: 0,
|
||||
cursor_tinybitset: first_tiny_bitset,
|
||||
doc: 0u32,
|
||||
}
|
||||
};
|
||||
docset.advance();
|
||||
docset
|
||||
}
|
||||
}
|
||||
|
||||
impl DocSet for BitSetDocSet {
|
||||
fn advance(&mut self) -> bool {
|
||||
fn advance(&mut self) -> DocId {
|
||||
if let Some(lower) = self.cursor_tinybitset.pop_lowest() {
|
||||
self.doc = (self.cursor_bucket as u32 * 64u32) | lower;
|
||||
return true;
|
||||
return self.doc;
|
||||
}
|
||||
if let Some(cursor_bucket) = self.docs.first_non_empty_bucket(self.cursor_bucket + 1) {
|
||||
self.go_to_bucket(cursor_bucket);
|
||||
let lower = self.cursor_tinybitset.pop_lowest().unwrap();
|
||||
self.doc = (cursor_bucket * 64u32) | lower;
|
||||
true
|
||||
self.doc
|
||||
} else {
|
||||
false
|
||||
self.doc = TERMINATED;
|
||||
TERMINATED
|
||||
}
|
||||
}
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
// skip is required to advance.
|
||||
if !self.advance() {
|
||||
return SkipResult::End;
|
||||
}
|
||||
fn seek(&mut self, target: DocId) -> DocId {
|
||||
let target_bucket = target / 64u32;
|
||||
|
||||
// Mask for all of the bits greater or equal
|
||||
// to our target document.
|
||||
match target_bucket.cmp(&self.cursor_bucket) {
|
||||
Ordering::Greater => {
|
||||
self.go_to_bucket(target_bucket);
|
||||
let greater_filter: TinySet = TinySet::range_greater_or_equal(target);
|
||||
self.cursor_tinybitset = self.cursor_tinybitset.intersect(greater_filter);
|
||||
if !self.advance() {
|
||||
SkipResult::End
|
||||
} else if self.doc() == target {
|
||||
SkipResult::Reached
|
||||
} else {
|
||||
debug_assert!(self.doc() > target);
|
||||
SkipResult::OverStep
|
||||
}
|
||||
}
|
||||
Ordering::Equal => loop {
|
||||
match self.doc().cmp(&target) {
|
||||
Ordering::Less => {
|
||||
if !self.advance() {
|
||||
return SkipResult::End;
|
||||
}
|
||||
}
|
||||
Ordering::Equal => {
|
||||
return SkipResult::Reached;
|
||||
}
|
||||
Ordering::Greater => {
|
||||
debug_assert!(self.doc() > target);
|
||||
return SkipResult::OverStep;
|
||||
}
|
||||
}
|
||||
},
|
||||
Ordering::Less => {
|
||||
debug_assert!(self.doc() > target);
|
||||
SkipResult::OverStep
|
||||
}
|
||||
if target_bucket > self.cursor_bucket {
|
||||
self.go_to_bucket(target_bucket);
|
||||
let greater_filter: TinySet = TinySet::range_greater_or_equal(target);
|
||||
self.cursor_tinybitset = self.cursor_tinybitset.intersect(greater_filter);
|
||||
self.advance();
|
||||
}
|
||||
let mut doc = self.doc();
|
||||
while doc < target {
|
||||
doc = self.advance();
|
||||
}
|
||||
doc
|
||||
}
|
||||
|
||||
/// Returns the current document
|
||||
@@ -122,7 +96,7 @@ impl DocSet for BitSetDocSet {
|
||||
mod tests {
|
||||
use super::BitSetDocSet;
|
||||
use crate::common::BitSet;
|
||||
use crate::docset::{DocSet, SkipResult};
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::DocId;
|
||||
|
||||
fn create_docbitset(docs: &[DocId], max_doc: DocId) -> BitSetDocSet {
|
||||
@@ -133,19 +107,24 @@ mod tests {
|
||||
BitSetDocSet::from(docset)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_empty() {
|
||||
let bitset = BitSet::with_max_value(1000);
|
||||
let mut empty = BitSetDocSet::from(bitset);
|
||||
assert_eq!(empty.advance(), TERMINATED)
|
||||
}
|
||||
|
||||
fn test_go_through_sequential(docs: &[DocId]) {
|
||||
let mut docset = create_docbitset(docs, 1_000u32);
|
||||
for &doc in docs {
|
||||
assert!(docset.advance());
|
||||
assert_eq!(doc, docset.doc());
|
||||
docset.advance();
|
||||
}
|
||||
assert!(!docset.advance());
|
||||
assert!(!docset.advance());
|
||||
assert_eq!(docset.advance(), TERMINATED);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_docbitset_sequential() {
|
||||
test_go_through_sequential(&[]);
|
||||
test_go_through_sequential(&[1, 2, 3]);
|
||||
test_go_through_sequential(&[1, 2, 3, 4, 5, 63, 64, 65]);
|
||||
test_go_through_sequential(&[63, 64, 65]);
|
||||
@@ -156,64 +135,64 @@ mod tests {
|
||||
fn test_docbitset_skip() {
|
||||
{
|
||||
let mut docset = create_docbitset(&[1, 5, 6, 7, 5112], 10_000);
|
||||
assert_eq!(docset.skip_next(7), SkipResult::Reached);
|
||||
assert_eq!(docset.seek(7), 7);
|
||||
assert_eq!(docset.doc(), 7);
|
||||
assert!(docset.advance(), 7);
|
||||
assert_eq!(docset.advance(), 5112);
|
||||
assert_eq!(docset.doc(), 5112);
|
||||
assert!(!docset.advance());
|
||||
assert_eq!(docset.advance(), TERMINATED);
|
||||
}
|
||||
{
|
||||
let mut docset = create_docbitset(&[1, 5, 6, 7, 5112], 10_000);
|
||||
assert_eq!(docset.skip_next(3), SkipResult::OverStep);
|
||||
assert_eq!(docset.seek(3), 5);
|
||||
assert_eq!(docset.doc(), 5);
|
||||
assert!(docset.advance());
|
||||
assert_eq!(docset.advance(), 6);
|
||||
}
|
||||
{
|
||||
let mut docset = create_docbitset(&[5112], 10_000);
|
||||
assert_eq!(docset.skip_next(5112), SkipResult::Reached);
|
||||
assert_eq!(docset.seek(5112), 5112);
|
||||
assert_eq!(docset.doc(), 5112);
|
||||
assert!(!docset.advance());
|
||||
assert_eq!(docset.advance(), TERMINATED);
|
||||
}
|
||||
{
|
||||
let mut docset = create_docbitset(&[5112], 10_000);
|
||||
assert_eq!(docset.skip_next(5113), SkipResult::End);
|
||||
assert!(!docset.advance());
|
||||
assert_eq!(docset.seek(5113), TERMINATED);
|
||||
assert_eq!(docset.advance(), TERMINATED);
|
||||
}
|
||||
{
|
||||
let mut docset = create_docbitset(&[5112], 10_000);
|
||||
assert_eq!(docset.skip_next(5111), SkipResult::OverStep);
|
||||
assert_eq!(docset.seek(5111), 5112);
|
||||
assert_eq!(docset.doc(), 5112);
|
||||
assert!(!docset.advance());
|
||||
assert_eq!(docset.advance(), TERMINATED);
|
||||
}
|
||||
{
|
||||
let mut docset = create_docbitset(&[1, 5, 6, 7, 5112, 5500, 6666], 10_000);
|
||||
assert_eq!(docset.skip_next(5112), SkipResult::Reached);
|
||||
assert_eq!(docset.seek(5112), 5112);
|
||||
assert_eq!(docset.doc(), 5112);
|
||||
assert!(docset.advance());
|
||||
assert_eq!(docset.advance(), 5500);
|
||||
assert_eq!(docset.doc(), 5500);
|
||||
assert!(docset.advance());
|
||||
assert_eq!(docset.advance(), 6666);
|
||||
assert_eq!(docset.doc(), 6666);
|
||||
assert!(!docset.advance());
|
||||
assert_eq!(docset.advance(), TERMINATED);
|
||||
}
|
||||
{
|
||||
let mut docset = create_docbitset(&[1, 5, 6, 7, 5112, 5500, 6666], 10_000);
|
||||
assert_eq!(docset.skip_next(5111), SkipResult::OverStep);
|
||||
assert_eq!(docset.seek(5111), 5112);
|
||||
assert_eq!(docset.doc(), 5112);
|
||||
assert!(docset.advance());
|
||||
assert_eq!(docset.advance(), 5500);
|
||||
assert_eq!(docset.doc(), 5500);
|
||||
assert!(docset.advance());
|
||||
assert_eq!(docset.advance(), 6666);
|
||||
assert_eq!(docset.doc(), 6666);
|
||||
assert!(!docset.advance());
|
||||
assert_eq!(docset.advance(), TERMINATED);
|
||||
}
|
||||
{
|
||||
let mut docset = create_docbitset(&[1, 5, 6, 7, 5112, 5513, 6666], 10_000);
|
||||
assert_eq!(docset.skip_next(5111), SkipResult::OverStep);
|
||||
assert_eq!(docset.seek(5111), 5112);
|
||||
assert_eq!(docset.doc(), 5112);
|
||||
assert!(docset.advance());
|
||||
assert_eq!(docset.advance(), 5513);
|
||||
assert_eq!(docset.doc(), 5513);
|
||||
assert!(docset.advance());
|
||||
assert_eq!(docset.advance(), 6666);
|
||||
assert_eq!(docset.doc(), 6666);
|
||||
assert!(!docset.advance());
|
||||
assert_eq!(docset.advance(), TERMINATED);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -223,6 +202,7 @@ mod bench {
|
||||
|
||||
use super::BitSet;
|
||||
use super::BitSetDocSet;
|
||||
use crate::docset::TERMINATED;
|
||||
use crate::test;
|
||||
use crate::tests;
|
||||
use crate::DocSet;
|
||||
@@ -257,7 +237,7 @@ mod bench {
|
||||
}
|
||||
b.iter(|| {
|
||||
let mut docset = BitSetDocSet::from(bitset.clone());
|
||||
while docset.advance() {}
|
||||
while docset.advance() != TERMINATED {}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@ use crate::core::SegmentReader;
|
||||
use crate::query::explanation::does_not_match;
|
||||
use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner, SumWithCoordsCombiner};
|
||||
use crate::query::term_query::TermScorer;
|
||||
use crate::query::weight::{for_each_pruning_scorer, for_each_scorer};
|
||||
use crate::query::EmptyScorer;
|
||||
use crate::query::Exclude;
|
||||
use crate::query::Occur;
|
||||
@@ -10,16 +11,21 @@ use crate::query::Scorer;
|
||||
use crate::query::Union;
|
||||
use crate::query::Weight;
|
||||
use crate::query::{intersect_scorers, Explanation};
|
||||
use crate::{DocId, SkipResult};
|
||||
use crate::{DocId, Score};
|
||||
use std::collections::HashMap;
|
||||
|
||||
fn scorer_union<TScoreCombiner>(scorers: Vec<Box<dyn Scorer>>) -> Box<dyn Scorer>
|
||||
enum SpecializedScorer<TScoreCombiner: ScoreCombiner> {
|
||||
TermUnion(Union<TermScorer, TScoreCombiner>),
|
||||
Other(Box<dyn Scorer>),
|
||||
}
|
||||
|
||||
fn scorer_union<TScoreCombiner>(scorers: Vec<Box<dyn Scorer>>) -> SpecializedScorer<TScoreCombiner>
|
||||
where
|
||||
TScoreCombiner: ScoreCombiner,
|
||||
{
|
||||
assert!(!scorers.is_empty());
|
||||
if scorers.len() == 1 {
|
||||
return scorers.into_iter().next().unwrap(); //< we checked the size beforehands
|
||||
return SpecializedScorer::Other(scorers.into_iter().next().unwrap()); //< we checked the size beforehand
|
||||
}
|
||||
|
||||
{
|
||||
@@ -29,14 +35,21 @@ where
|
||||
.into_iter()
|
||||
.map(|scorer| *(scorer.downcast::<TermScorer>().map_err(|_| ()).unwrap()))
|
||||
.collect();
|
||||
let scorer: Box<dyn Scorer> =
|
||||
Box::new(Union::<TermScorer, TScoreCombiner>::from(scorers));
|
||||
return scorer;
|
||||
return SpecializedScorer::TermUnion(Union::<TermScorer, TScoreCombiner>::from(
|
||||
scorers,
|
||||
));
|
||||
}
|
||||
}
|
||||
SpecializedScorer::Other(Box::new(Union::<_, TScoreCombiner>::from(scorers)))
|
||||
}
|
||||
|
||||
let scorer: Box<dyn Scorer> = Box::new(Union::<_, TScoreCombiner>::from(scorers));
|
||||
scorer
|
||||
impl<TScoreCombiner: ScoreCombiner> Into<Box<dyn Scorer>> for SpecializedScorer<TScoreCombiner> {
|
||||
fn into(self) -> Box<dyn Scorer> {
|
||||
match self {
|
||||
Self::TermUnion(union) => Box::new(union),
|
||||
Self::Other(scorer) => scorer,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct BooleanWeight {
|
||||
@@ -72,41 +85,50 @@ impl BooleanWeight {
|
||||
&self,
|
||||
reader: &SegmentReader,
|
||||
boost: f32,
|
||||
) -> crate::Result<Box<dyn Scorer>> {
|
||||
) -> crate::Result<SpecializedScorer<TScoreCombiner>> {
|
||||
let mut per_occur_scorers = self.per_occur_scorers(reader, boost)?;
|
||||
|
||||
let should_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers
|
||||
let should_scorer_opt: Option<SpecializedScorer<TScoreCombiner>> = per_occur_scorers
|
||||
.remove(&Occur::Should)
|
||||
.map(scorer_union::<TScoreCombiner>);
|
||||
|
||||
let exclude_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers
|
||||
.remove(&Occur::MustNot)
|
||||
.map(scorer_union::<TScoreCombiner>);
|
||||
.map(scorer_union::<TScoreCombiner>)
|
||||
.map(Into::into);
|
||||
|
||||
let must_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers
|
||||
.remove(&Occur::Must)
|
||||
.map(intersect_scorers);
|
||||
|
||||
let positive_scorer: Box<dyn Scorer> = match (should_scorer_opt, must_scorer_opt) {
|
||||
(Some(should_scorer), Some(must_scorer)) => {
|
||||
if self.scoring_enabled {
|
||||
Box::new(RequiredOptionalScorer::<_, _, TScoreCombiner>::new(
|
||||
must_scorer,
|
||||
should_scorer,
|
||||
))
|
||||
} else {
|
||||
must_scorer
|
||||
let positive_scorer: SpecializedScorer<TScoreCombiner> =
|
||||
match (should_scorer_opt, must_scorer_opt) {
|
||||
(Some(should_scorer), Some(must_scorer)) => {
|
||||
if self.scoring_enabled {
|
||||
SpecializedScorer::Other(Box::new(RequiredOptionalScorer::<
|
||||
Box<dyn Scorer>,
|
||||
Box<dyn Scorer>,
|
||||
TScoreCombiner,
|
||||
>::new(
|
||||
must_scorer, should_scorer.into()
|
||||
)))
|
||||
} else {
|
||||
SpecializedScorer::Other(must_scorer)
|
||||
}
|
||||
}
|
||||
}
|
||||
(None, Some(must_scorer)) => must_scorer,
|
||||
(Some(should_scorer), None) => should_scorer,
|
||||
(None, None) => {
|
||||
return Ok(Box::new(EmptyScorer));
|
||||
}
|
||||
};
|
||||
(None, Some(must_scorer)) => SpecializedScorer::Other(must_scorer),
|
||||
(Some(should_scorer), None) => should_scorer,
|
||||
(None, None) => {
|
||||
return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)));
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(exclude_scorer) = exclude_scorer_opt {
|
||||
Ok(Box::new(Exclude::new(positive_scorer, exclude_scorer)))
|
||||
let positive_scorer_boxed: Box<dyn Scorer> = positive_scorer.into();
|
||||
Ok(SpecializedScorer::Other(Box::new(Exclude::new(
|
||||
positive_scorer_boxed,
|
||||
exclude_scorer,
|
||||
))))
|
||||
} else {
|
||||
Ok(positive_scorer)
|
||||
}
|
||||
@@ -126,14 +148,16 @@ impl Weight for BooleanWeight {
|
||||
}
|
||||
} else if self.scoring_enabled {
|
||||
self.complex_scorer::<SumWithCoordsCombiner>(reader, boost)
|
||||
.map(Into::into)
|
||||
} else {
|
||||
self.complex_scorer::<DoNothingCombiner>(reader, boost)
|
||||
.map(Into::into)
|
||||
}
|
||||
}
|
||||
|
||||
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
|
||||
let mut scorer = self.scorer(reader, 1.0f32)?;
|
||||
if scorer.skip_next(doc) != SkipResult::Reached {
|
||||
if scorer.seek(doc) != doc {
|
||||
return Err(does_not_match(doc));
|
||||
}
|
||||
if !self.scoring_enabled {
|
||||
@@ -150,6 +174,51 @@ impl Weight for BooleanWeight {
|
||||
}
|
||||
Ok(explanation)
|
||||
}
|
||||
|
||||
fn for_each(
|
||||
&self,
|
||||
reader: &SegmentReader,
|
||||
callback: &mut dyn FnMut(DocId, Score),
|
||||
) -> crate::Result<()> {
|
||||
let scorer = self.complex_scorer::<SumWithCoordsCombiner>(reader, 1.0f32)?;
|
||||
match scorer {
|
||||
SpecializedScorer::TermUnion(mut union_scorer) => {
|
||||
for_each_scorer(&mut union_scorer, callback);
|
||||
}
|
||||
SpecializedScorer::Other(mut scorer) => {
|
||||
for_each_scorer(scorer.as_mut(), callback);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Calls `callback` with all of the `(doc, score)` for which score
|
||||
/// exceeds a given threshold.
|
||||
///
|
||||
/// This method is useful for the TopDocs collector.
|
||||
/// For all docsets, the blanket implementation has the benefit
|
||||
/// of prefiltering (doc, score) pairs, avoiding the
|
||||
/// virtual dispatch cost.
|
||||
///
|
||||
/// More importantly, it makes it possible for scorers to implement
|
||||
/// important optimizations (e.g. BlockWAND for union).
|
||||
fn for_each_pruning(
|
||||
&self,
|
||||
threshold: f32,
|
||||
reader: &SegmentReader,
|
||||
callback: &mut dyn FnMut(DocId, Score) -> Score,
|
||||
) -> crate::Result<()> {
|
||||
let scorer = self.complex_scorer::<SumWithCoordsCombiner>(reader, 1.0f32)?;
|
||||
match scorer {
|
||||
SpecializedScorer::TermUnion(mut union_scorer) => {
|
||||
for_each_pruning_scorer(&mut union_scorer, threshold, callback);
|
||||
}
|
||||
SpecializedScorer::Other(mut scorer) => {
|
||||
for_each_pruning_scorer(scorer.as_mut(), threshold, callback);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn is_positive_occur(occur: Occur) -> bool {
|
||||
|
||||
@@ -207,7 +207,6 @@ mod tests {
|
||||
let mut boolean_scorer = boolean_weight
|
||||
.scorer(searcher.segment_reader(0u32), 1.0f32)
|
||||
.unwrap();
|
||||
assert!(boolean_scorer.advance());
|
||||
assert_eq!(boolean_scorer.doc(), 0u32);
|
||||
assert_nearly_equals(boolean_scorer.score(), 0.84163445f32);
|
||||
}
|
||||
@@ -215,7 +214,6 @@ mod tests {
|
||||
let mut boolean_scorer = boolean_weight
|
||||
.scorer(searcher.segment_reader(0u32), 2.0f32)
|
||||
.unwrap();
|
||||
assert!(boolean_scorer.advance());
|
||||
assert_eq!(boolean_scorer.doc(), 0u32);
|
||||
assert_nearly_equals(boolean_scorer.score(), 1.6832689f32);
|
||||
}
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
use crate::common::BitSet;
|
||||
use crate::fastfield::DeleteBitSet;
|
||||
use crate::query::explanation::does_not_match;
|
||||
use crate::query::{Explanation, Query, Scorer, Weight};
|
||||
use crate::{DocId, DocSet, Searcher, SegmentReader, SkipResult, Term};
|
||||
use crate::{DocId, DocSet, Searcher, SegmentReader, Term};
|
||||
use std::collections::BTreeSet;
|
||||
use std::fmt;
|
||||
|
||||
@@ -72,7 +71,7 @@ impl Weight for BoostWeight {
|
||||
|
||||
fn explain(&self, reader: &SegmentReader, doc: u32) -> crate::Result<Explanation> {
|
||||
let mut scorer = self.scorer(reader, 1.0f32)?;
|
||||
if scorer.skip_next(doc) != SkipResult::Reached {
|
||||
if scorer.seek(doc) != doc {
|
||||
return Err(does_not_match(doc));
|
||||
}
|
||||
let mut explanation =
|
||||
@@ -99,12 +98,12 @@ impl<S: Scorer> BoostScorer<S> {
|
||||
}
|
||||
|
||||
impl<S: Scorer> DocSet for BoostScorer<S> {
|
||||
fn advance(&mut self) -> bool {
|
||||
fn advance(&mut self) -> DocId {
|
||||
self.underlying.advance()
|
||||
}
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
self.underlying.skip_next(target)
|
||||
fn seek(&mut self, target: DocId) -> DocId {
|
||||
self.underlying.seek(target)
|
||||
}
|
||||
|
||||
fn fill_buffer(&mut self, buffer: &mut [DocId]) -> usize {
|
||||
@@ -119,10 +118,6 @@ impl<S: Scorer> DocSet for BoostScorer<S> {
|
||||
self.underlying.size_hint()
|
||||
}
|
||||
|
||||
fn append_to_bitset(&mut self, bitset: &mut BitSet) {
|
||||
self.underlying.append_to_bitset(bitset)
|
||||
}
|
||||
|
||||
fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
|
||||
self.underlying.count(delete_bitset)
|
||||
}
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use super::Scorer;
|
||||
use crate::docset::TERMINATED;
|
||||
use crate::query::explanation::does_not_match;
|
||||
use crate::query::Weight;
|
||||
use crate::query::{Explanation, Query};
|
||||
@@ -48,15 +49,12 @@ impl Weight for EmptyWeight {
|
||||
pub struct EmptyScorer;
|
||||
|
||||
impl DocSet for EmptyScorer {
|
||||
fn advance(&mut self) -> bool {
|
||||
false
|
||||
fn advance(&mut self) -> DocId {
|
||||
TERMINATED
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
panic!(
|
||||
"You may not call .doc() on a scorer \
|
||||
where the last call to advance() did not return true."
|
||||
);
|
||||
TERMINATED
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
@@ -72,18 +70,15 @@ impl Scorer for EmptyScorer {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::docset::TERMINATED;
|
||||
use crate::query::EmptyScorer;
|
||||
use crate::DocSet;
|
||||
|
||||
#[test]
|
||||
fn test_empty_scorer() {
|
||||
let mut empty_scorer = EmptyScorer;
|
||||
assert!(!empty_scorer.advance());
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn test_empty_scorer_panic_on_doc_call() {
|
||||
EmptyScorer.doc();
|
||||
assert_eq!(empty_scorer.doc(), TERMINATED);
|
||||
assert_eq!(empty_scorer.advance(), TERMINATED);
|
||||
assert_eq!(empty_scorer.doc(), TERMINATED);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,41 +1,37 @@
|
||||
use crate::docset::{DocSet, SkipResult};
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::query::Scorer;
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
enum State {
|
||||
ExcludeOne(DocId),
|
||||
Finished,
|
||||
}
|
||||
|
||||
/// Filters a given `DocSet` by removing the docs from a given `DocSet`.
|
||||
///
|
||||
/// The excluding docset has no impact on scoring.
|
||||
pub struct Exclude<TDocSet, TDocSetExclude> {
|
||||
underlying_docset: TDocSet,
|
||||
excluding_docset: TDocSetExclude,
|
||||
excluding_state: State,
|
||||
}
|
||||
|
||||
impl<TDocSet, TDocSetExclude> Exclude<TDocSet, TDocSetExclude>
|
||||
where
|
||||
TDocSet: DocSet,
|
||||
TDocSetExclude: DocSet,
|
||||
{
|
||||
/// Creates a new `Exclude` docset.
|
||||
pub fn new(
|
||||
underlying_docset: TDocSet,
|
||||
mut underlying_docset: TDocSet,
|
||||
mut excluding_docset: TDocSetExclude,
|
||||
) -> Exclude<TDocSet, TDocSetExclude> {
|
||||
let state = if excluding_docset.advance() {
|
||||
State::ExcludeOne(excluding_docset.doc())
|
||||
} else {
|
||||
State::Finished
|
||||
};
|
||||
while underlying_docset.doc() != TERMINATED {
|
||||
let target = underlying_docset.doc();
|
||||
if excluding_docset.seek(target) != target {
|
||||
// this document is not excluded.
|
||||
break;
|
||||
}
|
||||
underlying_docset.advance();
|
||||
}
|
||||
Exclude {
|
||||
underlying_docset,
|
||||
excluding_docset,
|
||||
excluding_state: state,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -51,28 +47,7 @@ where
|
||||
/// increasing `doc`.
|
||||
fn accept(&mut self) -> bool {
|
||||
let doc = self.underlying_docset.doc();
|
||||
match self.excluding_state {
|
||||
State::ExcludeOne(excluded_doc) => {
|
||||
if doc == excluded_doc {
|
||||
return false;
|
||||
}
|
||||
if excluded_doc > doc {
|
||||
return true;
|
||||
}
|
||||
match self.excluding_docset.skip_next(doc) {
|
||||
SkipResult::OverStep => {
|
||||
self.excluding_state = State::ExcludeOne(self.excluding_docset.doc());
|
||||
true
|
||||
}
|
||||
SkipResult::End => {
|
||||
self.excluding_state = State::Finished;
|
||||
true
|
||||
}
|
||||
SkipResult::Reached => false,
|
||||
}
|
||||
}
|
||||
State::Finished => true,
|
||||
}
|
||||
self.excluding_docset.seek(doc) != doc
|
||||
}
|
||||
}
|
||||
|
||||
@@ -81,27 +56,24 @@ where
|
||||
TDocSet: DocSet,
|
||||
TDocSetExclude: DocSet,
|
||||
{
|
||||
fn advance(&mut self) -> bool {
|
||||
while self.underlying_docset.advance() {
|
||||
fn advance(&mut self) -> DocId {
|
||||
while self.underlying_docset.advance() != TERMINATED {
|
||||
if self.accept() {
|
||||
return true;
|
||||
return self.doc();
|
||||
}
|
||||
}
|
||||
false
|
||||
TERMINATED
|
||||
}
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
let underlying_skip_result = self.underlying_docset.skip_next(target);
|
||||
if underlying_skip_result == SkipResult::End {
|
||||
return SkipResult::End;
|
||||
fn seek(&mut self, target: DocId) -> DocId {
|
||||
let underlying_seek_result = self.underlying_docset.seek(target);
|
||||
if underlying_seek_result == TERMINATED {
|
||||
return TERMINATED;
|
||||
}
|
||||
if self.accept() {
|
||||
underlying_skip_result
|
||||
} else if self.advance() {
|
||||
SkipResult::OverStep
|
||||
} else {
|
||||
SkipResult::End
|
||||
return underlying_seek_result;
|
||||
}
|
||||
self.advance()
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
@@ -141,8 +113,9 @@ mod tests {
|
||||
VecDocSet::from(vec![1, 2, 3, 10, 16, 24]),
|
||||
);
|
||||
let mut els = vec![];
|
||||
while exclude_scorer.advance() {
|
||||
while exclude_scorer.doc() != TERMINATED {
|
||||
els.push(exclude_scorer.doc());
|
||||
exclude_scorer.advance();
|
||||
}
|
||||
assert_eq!(els, vec![5, 8, 15]);
|
||||
}
|
||||
|
||||
@@ -117,7 +117,7 @@ impl FuzzyTermQuery {
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new Fuzzy Query that treats transpositions as cost one rather than two
|
||||
/// Creates a new Fuzzy Query of the Term prefix
|
||||
pub fn new_prefix(term: Term, distance: u8, transposition_cost_one: bool) -> FuzzyTermQuery {
|
||||
FuzzyTermQuery {
|
||||
term,
|
||||
@@ -188,6 +188,8 @@ mod test {
|
||||
}
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
|
||||
// passes because Levenshtein distance is 1 (substitute 'o' with 'a')
|
||||
{
|
||||
let term = Term::from_field_text(country_field, "japon");
|
||||
|
||||
@@ -200,6 +202,18 @@ mod test {
|
||||
assert_nearly_equals(1f32, score);
|
||||
}
|
||||
|
||||
// fails because non-prefix Levenshtein distance is more than 1 (add 'a' and 'n')
|
||||
{
|
||||
let term = Term::from_field_text(country_field, "jap");
|
||||
|
||||
let fuzzy_query = FuzzyTermQuery::new(term, 1, true);
|
||||
let top_docs = searcher
|
||||
.search(&fuzzy_query, &TopDocs::with_limit(2))
|
||||
.unwrap();
|
||||
assert_eq!(top_docs.len(), 0, "Expected no document");
|
||||
}
|
||||
|
||||
// passes because prefix Levenshtein distance is 0
|
||||
{
|
||||
let term = Term::from_field_text(country_field, "jap");
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::docset::{DocSet, SkipResult};
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::query::term_query::TermScorer;
|
||||
use crate::query::EmptyScorer;
|
||||
use crate::query::Scorer;
|
||||
@@ -20,12 +20,14 @@ pub fn intersect_scorers(mut scorers: Vec<Box<dyn Scorer>>) -> Box<dyn Scorer> {
|
||||
if scorers.len() == 1 {
|
||||
return scorers.pop().unwrap();
|
||||
}
|
||||
scorers.sort_by_key(|scorer| scorer.size_hint());
|
||||
let doc = go_to_first_doc(&mut scorers[..]);
|
||||
if doc == TERMINATED {
|
||||
return Box::new(EmptyScorer);
|
||||
}
|
||||
// We know that we have at least 2 elements.
|
||||
let num_docsets = scorers.len();
|
||||
scorers.sort_by(|left, right| right.size_hint().cmp(&left.size_hint()));
|
||||
let left = scorers.pop().unwrap();
|
||||
let right = scorers.pop().unwrap();
|
||||
scorers.reverse();
|
||||
let left = scorers.remove(0);
|
||||
let right = scorers.remove(0);
|
||||
let all_term_scorers = [&left, &right]
|
||||
.iter()
|
||||
.all(|&scorer| scorer.is::<TermScorer>());
|
||||
@@ -34,14 +36,12 @@ pub fn intersect_scorers(mut scorers: Vec<Box<dyn Scorer>>) -> Box<dyn Scorer> {
|
||||
left: *(left.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
|
||||
right: *(right.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
|
||||
others: scorers,
|
||||
num_docsets,
|
||||
});
|
||||
}
|
||||
Box::new(Intersection {
|
||||
left,
|
||||
right,
|
||||
others: scorers,
|
||||
num_docsets,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -50,22 +50,34 @@ pub struct Intersection<TDocSet: DocSet, TOtherDocSet: DocSet = Box<dyn Scorer>>
|
||||
left: TDocSet,
|
||||
right: TDocSet,
|
||||
others: Vec<TOtherDocSet>,
|
||||
num_docsets: usize,
|
||||
}
|
||||
|
||||
fn go_to_first_doc<TDocSet: DocSet>(docsets: &mut [TDocSet]) -> DocId {
|
||||
let mut candidate = 0;
|
||||
'outer: loop {
|
||||
for docset in docsets.iter_mut() {
|
||||
let seek_doc = docset.seek(candidate);
|
||||
if seek_doc > candidate {
|
||||
candidate = docset.doc();
|
||||
continue 'outer;
|
||||
}
|
||||
}
|
||||
return candidate;
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
|
||||
pub(crate) fn new(mut docsets: Vec<TDocSet>) -> Intersection<TDocSet, TDocSet> {
|
||||
let num_docsets = docsets.len();
|
||||
assert!(num_docsets >= 2);
|
||||
docsets.sort_by(|left, right| right.size_hint().cmp(&left.size_hint()));
|
||||
let left = docsets.pop().unwrap();
|
||||
let right = docsets.pop().unwrap();
|
||||
docsets.reverse();
|
||||
docsets.sort_by_key(|docset| docset.size_hint());
|
||||
go_to_first_doc(&mut docsets);
|
||||
let left = docsets.remove(0);
|
||||
let right = docsets.remove(0);
|
||||
Intersection {
|
||||
left,
|
||||
right,
|
||||
others: docsets,
|
||||
num_docsets,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -80,128 +92,44 @@ impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet, TOtherDocSet: DocSet> Intersection<TDocSet, TOtherDocSet> {
|
||||
pub(crate) fn docset_mut(&mut self, ord: usize) -> &mut dyn DocSet {
|
||||
match ord {
|
||||
0 => &mut self.left,
|
||||
1 => &mut self.right,
|
||||
n => &mut self.others[n - 2],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOtherDocSet> {
|
||||
fn advance(&mut self) -> bool {
|
||||
fn advance(&mut self) -> DocId {
|
||||
let (left, right) = (&mut self.left, &mut self.right);
|
||||
|
||||
if !left.advance() {
|
||||
return false;
|
||||
}
|
||||
|
||||
let mut candidate = left.doc();
|
||||
let mut other_candidate_ord: usize = usize::max_value();
|
||||
let mut candidate = left.advance();
|
||||
|
||||
'outer: loop {
|
||||
// In the first part we look for a document in the intersection
|
||||
// of the two rarest `DocSet` in the intersection.
|
||||
|
||||
loop {
|
||||
match right.skip_next(candidate) {
|
||||
SkipResult::Reached => {
|
||||
break;
|
||||
}
|
||||
SkipResult::OverStep => {
|
||||
candidate = right.doc();
|
||||
other_candidate_ord = usize::max_value();
|
||||
}
|
||||
SkipResult::End => {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
match left.skip_next(candidate) {
|
||||
SkipResult::Reached => {
|
||||
break;
|
||||
}
|
||||
SkipResult::OverStep => {
|
||||
candidate = left.doc();
|
||||
other_candidate_ord = usize::max_value();
|
||||
}
|
||||
SkipResult::End => {
|
||||
return false;
|
||||
}
|
||||
let right_doc = right.seek(candidate);
|
||||
candidate = left.seek(right_doc);
|
||||
if candidate == right_doc {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
debug_assert_eq!(left.doc(), right.doc());
|
||||
// test the remaining scorers;
|
||||
for (ord, docset) in self.others.iter_mut().enumerate() {
|
||||
if ord == other_candidate_ord {
|
||||
continue;
|
||||
}
|
||||
// `candidate_ord` is already at the
|
||||
// right position.
|
||||
//
|
||||
// Calling `skip_next` would advance this docset
|
||||
// and miss it.
|
||||
match docset.skip_next(candidate) {
|
||||
SkipResult::Reached => {}
|
||||
SkipResult::OverStep => {
|
||||
// this is not in the intersection,
|
||||
// let's update our candidate.
|
||||
candidate = docset.doc();
|
||||
match left.skip_next(candidate) {
|
||||
SkipResult::Reached => {
|
||||
other_candidate_ord = ord;
|
||||
}
|
||||
SkipResult::OverStep => {
|
||||
candidate = left.doc();
|
||||
other_candidate_ord = usize::max_value();
|
||||
}
|
||||
SkipResult::End => {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
continue 'outer;
|
||||
}
|
||||
SkipResult::End => {
|
||||
return false;
|
||||
}
|
||||
for docset in self.others.iter_mut() {
|
||||
let seek_doc = docset.seek(candidate);
|
||||
if seek_doc > candidate {
|
||||
candidate = left.seek(seek_doc);
|
||||
continue 'outer;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
|
||||
return candidate;
|
||||
}
|
||||
}
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
// We optimize skipping by skipping every single member
|
||||
// of the intersection to target.
|
||||
let mut current_target: DocId = target;
|
||||
let mut current_ord = self.num_docsets;
|
||||
|
||||
'outer: loop {
|
||||
for ord in 0..self.num_docsets {
|
||||
let docset = self.docset_mut(ord);
|
||||
if ord == current_ord {
|
||||
continue;
|
||||
}
|
||||
match docset.skip_next(current_target) {
|
||||
SkipResult::End => {
|
||||
return SkipResult::End;
|
||||
}
|
||||
SkipResult::OverStep => {
|
||||
// update the target
|
||||
// for the remaining members of the intersection.
|
||||
current_target = docset.doc();
|
||||
current_ord = ord;
|
||||
continue 'outer;
|
||||
}
|
||||
SkipResult::Reached => {}
|
||||
}
|
||||
}
|
||||
if target == current_target {
|
||||
return SkipResult::Reached;
|
||||
} else {
|
||||
assert!(current_target > target);
|
||||
return SkipResult::OverStep;
|
||||
}
|
||||
fn seek(&mut self, target: DocId) -> DocId {
|
||||
self.left.seek(target);
|
||||
let mut docsets: Vec<&mut dyn DocSet> = vec![&mut self.left, &mut self.right];
|
||||
for docset in &mut self.others {
|
||||
docsets.push(docset);
|
||||
}
|
||||
go_to_first_doc(&mut docsets[..])
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
@@ -228,7 +156,7 @@ where
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::Intersection;
|
||||
use crate::docset::{DocSet, SkipResult};
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::postings::tests::test_skip_against_unoptimized;
|
||||
use crate::query::VecDocSet;
|
||||
|
||||
@@ -238,20 +166,18 @@ mod tests {
|
||||
let left = VecDocSet::from(vec![1, 3, 9]);
|
||||
let right = VecDocSet::from(vec![3, 4, 9, 18]);
|
||||
let mut intersection = Intersection::new(vec![left, right]);
|
||||
assert!(intersection.advance());
|
||||
assert_eq!(intersection.doc(), 3);
|
||||
assert!(intersection.advance());
|
||||
assert_eq!(intersection.advance(), 9);
|
||||
assert_eq!(intersection.doc(), 9);
|
||||
assert!(!intersection.advance());
|
||||
assert_eq!(intersection.advance(), TERMINATED);
|
||||
}
|
||||
{
|
||||
let a = VecDocSet::from(vec![1, 3, 9]);
|
||||
let b = VecDocSet::from(vec![3, 4, 9, 18]);
|
||||
let c = VecDocSet::from(vec![1, 5, 9, 111]);
|
||||
let mut intersection = Intersection::new(vec![a, b, c]);
|
||||
assert!(intersection.advance());
|
||||
assert_eq!(intersection.doc(), 9);
|
||||
assert!(!intersection.advance());
|
||||
assert_eq!(intersection.advance(), TERMINATED);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -260,8 +186,8 @@ mod tests {
|
||||
let left = VecDocSet::from(vec![0]);
|
||||
let right = VecDocSet::from(vec![0]);
|
||||
let mut intersection = Intersection::new(vec![left, right]);
|
||||
assert!(intersection.advance());
|
||||
assert_eq!(intersection.doc(), 0);
|
||||
assert_eq!(intersection.advance(), TERMINATED);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -269,7 +195,7 @@ mod tests {
|
||||
let left = VecDocSet::from(vec![0, 1, 2, 4]);
|
||||
let right = VecDocSet::from(vec![2, 5]);
|
||||
let mut intersection = Intersection::new(vec![left, right]);
|
||||
assert_eq!(intersection.skip_next(2), SkipResult::Reached);
|
||||
assert_eq!(intersection.seek(2), 2);
|
||||
assert_eq!(intersection.doc(), 2);
|
||||
}
|
||||
|
||||
@@ -312,7 +238,7 @@ mod tests {
|
||||
let a = VecDocSet::from(vec![1, 3]);
|
||||
let b = VecDocSet::from(vec![1, 4]);
|
||||
let c = VecDocSet::from(vec![3, 9]);
|
||||
let mut intersection = Intersection::new(vec![a, b, c]);
|
||||
assert!(!intersection.advance());
|
||||
let intersection = Intersection::new(vec![a, b, c]);
|
||||
assert_eq!(intersection.doc(), TERMINATED);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
/*!
|
||||
Query
|
||||
*/
|
||||
/*! Query Module */
|
||||
|
||||
mod all_query;
|
||||
mod automaton_weight;
|
||||
@@ -24,7 +22,6 @@ mod term_query;
|
||||
mod union;
|
||||
mod weight;
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod vec_docset;
|
||||
|
||||
@@ -43,8 +40,9 @@ pub use self::boost_query::BoostQuery;
|
||||
pub use self::empty_query::{EmptyQuery, EmptyScorer, EmptyWeight};
|
||||
pub use self::exclude::Exclude;
|
||||
pub use self::explanation::Explanation;
|
||||
pub use self::fuzzy_query::FuzzyTermQuery;
|
||||
#[cfg(test)]
|
||||
pub(crate) use self::fuzzy_query::DFAWrapper;
|
||||
pub use self::fuzzy_query::FuzzyTermQuery;
|
||||
pub use self::intersection::intersect_scorers;
|
||||
pub use self::phrase_query::PhraseQuery;
|
||||
pub use self::query::Query;
|
||||
|
||||
@@ -60,8 +60,8 @@ pub mod tests {
|
||||
.map(|docaddr| docaddr.1)
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
assert_eq!(test_query(vec!["a", "b", "c"]), vec![2, 4]);
|
||||
assert_eq!(test_query(vec!["a", "b"]), vec![1, 2, 3, 4]);
|
||||
assert_eq!(test_query(vec!["a", "b", "c"]), vec![2, 4]);
|
||||
assert_eq!(test_query(vec!["b", "b"]), vec![0, 1]);
|
||||
assert!(test_query(vec!["g", "ewrwer"]).is_empty());
|
||||
assert!(test_query(vec!["g", "a"]).is_empty());
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::docset::{DocSet, SkipResult};
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::postings::Postings;
|
||||
use crate::query::bm25::BM25Weight;
|
||||
@@ -25,12 +25,12 @@ impl<TPostings: Postings> PostingsWithOffset<TPostings> {
|
||||
}
|
||||
|
||||
impl<TPostings: Postings> DocSet for PostingsWithOffset<TPostings> {
|
||||
fn advance(&mut self) -> bool {
|
||||
fn advance(&mut self) -> DocId {
|
||||
self.postings.advance()
|
||||
}
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
self.postings.skip_next(target)
|
||||
fn seek(&mut self, target: DocId) -> DocId {
|
||||
self.postings.seek(target)
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
@@ -149,7 +149,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
|
||||
PostingsWithOffset::new(postings, (max_offset - offset) as u32)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
PhraseScorer {
|
||||
let mut scorer = PhraseScorer {
|
||||
intersection_docset: Intersection::new(postings_with_offsets),
|
||||
num_terms: num_docsets,
|
||||
left: Vec::with_capacity(100),
|
||||
@@ -158,7 +158,11 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
|
||||
similarity_weight,
|
||||
fieldnorm_reader,
|
||||
score_needed,
|
||||
};
|
||||
if scorer.doc() != TERMINATED && !scorer.phrase_match() {
|
||||
scorer.advance();
|
||||
}
|
||||
scorer
|
||||
}
|
||||
|
||||
pub fn phrase_count(&self) -> u32 {
|
||||
@@ -225,31 +229,21 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
|
||||
}
|
||||
|
||||
impl<TPostings: Postings> DocSet for PhraseScorer<TPostings> {
|
||||
fn advance(&mut self) -> bool {
|
||||
while self.intersection_docset.advance() {
|
||||
if self.phrase_match() {
|
||||
return true;
|
||||
fn advance(&mut self) -> DocId {
|
||||
loop {
|
||||
let doc = self.intersection_docset.advance();
|
||||
if doc == TERMINATED || self.phrase_match() {
|
||||
return doc;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
if self.intersection_docset.skip_next(target) == SkipResult::End {
|
||||
return SkipResult::End;
|
||||
}
|
||||
if self.phrase_match() {
|
||||
if self.doc() == target {
|
||||
return SkipResult::Reached;
|
||||
} else {
|
||||
return SkipResult::OverStep;
|
||||
}
|
||||
}
|
||||
if self.advance() {
|
||||
SkipResult::OverStep
|
||||
} else {
|
||||
SkipResult::End
|
||||
fn seek(&mut self, target: DocId) -> DocId {
|
||||
let doc = self.intersection_docset.seek(target);
|
||||
if doc == TERMINATED || self.phrase_match() {
|
||||
return doc;
|
||||
}
|
||||
self.advance()
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
|
||||
@@ -9,8 +9,8 @@ use crate::query::Weight;
|
||||
use crate::query::{EmptyScorer, Explanation};
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::schema::Term;
|
||||
use crate::Result;
|
||||
use crate::{DocId, DocSet};
|
||||
use crate::{Result, SkipResult};
|
||||
|
||||
pub struct PhraseWeight {
|
||||
phrase_terms: Vec<(usize, Term)>,
|
||||
@@ -99,7 +99,7 @@ impl Weight for PhraseWeight {
|
||||
return Err(does_not_match(doc));
|
||||
}
|
||||
let mut scorer = scorer_opt.unwrap();
|
||||
if scorer.skip_next(doc) != SkipResult::Reached {
|
||||
if scorer.seek(doc) != doc {
|
||||
return Err(does_not_match(doc));
|
||||
}
|
||||
let fieldnorm_reader = self.fieldnorm_reader(reader);
|
||||
@@ -114,6 +114,7 @@ impl Weight for PhraseWeight {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::super::tests::create_index;
|
||||
use crate::docset::TERMINATED;
|
||||
use crate::query::PhraseQuery;
|
||||
use crate::{DocSet, Term};
|
||||
|
||||
@@ -132,12 +133,11 @@ mod tests {
|
||||
.phrase_scorer(searcher.segment_reader(0u32), 1.0f32)
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
assert!(phrase_scorer.advance());
|
||||
assert_eq!(phrase_scorer.doc(), 1);
|
||||
assert_eq!(phrase_scorer.phrase_count(), 2);
|
||||
assert!(phrase_scorer.advance());
|
||||
assert_eq!(phrase_scorer.advance(), 2);
|
||||
assert_eq!(phrase_scorer.doc(), 2);
|
||||
assert_eq!(phrase_scorer.phrase_count(), 1);
|
||||
assert!(!phrase_scorer.advance());
|
||||
assert_eq!(phrase_scorer.advance(), TERMINATED);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -113,8 +113,9 @@ fn trim_ast(logical_ast: LogicalAST) -> Option<LogicalAST> {
|
||||
/// The language covered by the current parser is extremely simple.
|
||||
///
|
||||
/// * simple terms: e.g. `Barack Obama` are simply tokenized using
|
||||
/// tantivy's `StandardTokenizer`, hence becoming `["barack", "obama"]`.
|
||||
/// The terms are then searched within the default terms of the query parser.
|
||||
/// tantivy's [`SimpleTokenizer`](tantivy::tokenizer::SimpleTokenizer), hence
|
||||
/// becoming `["barack", "obama"]`. The terms are then searched within
|
||||
/// the default terms of the query parser.
|
||||
///
|
||||
/// e.g. If `body` and `title` are default fields, our example terms are
|
||||
/// `["title:barack", "body:barack", "title:obama", "body:obama"]`.
|
||||
|
||||
@@ -10,7 +10,7 @@ use crate::schema::Type;
|
||||
use crate::schema::{Field, IndexRecordOption, Term};
|
||||
use crate::termdict::{TermDictionary, TermStreamer};
|
||||
use crate::DocId;
|
||||
use crate::{Result, SkipResult};
|
||||
use crate::Result;
|
||||
use std::collections::Bound;
|
||||
use std::ops::Range;
|
||||
|
||||
@@ -300,10 +300,13 @@ impl Weight for RangeWeight {
|
||||
let term_info = term_range.value();
|
||||
let mut block_segment_postings = inverted_index
|
||||
.read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic);
|
||||
while block_segment_postings.advance() {
|
||||
loop {
|
||||
for &doc in block_segment_postings.docs() {
|
||||
doc_bitset.insert(doc);
|
||||
}
|
||||
if !block_segment_postings.advance() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
let doc_bitset = BitSetDocSet::from(doc_bitset);
|
||||
@@ -312,7 +315,7 @@ impl Weight for RangeWeight {
|
||||
|
||||
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
|
||||
let mut scorer = self.scorer(reader, 1.0f32)?;
|
||||
if scorer.skip_next(doc) != SkipResult::Reached {
|
||||
if scorer.seek(doc) != doc {
|
||||
return Err(does_not_match(doc));
|
||||
}
|
||||
Ok(Explanation::new("RangeQuery", 1.0f32))
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
use crate::docset::{DocSet, SkipResult};
|
||||
use crate::docset::DocSet;
|
||||
use crate::query::score_combiner::ScoreCombiner;
|
||||
use crate::query::Scorer;
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
use std::cmp::Ordering;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
/// Given a required scorer and an optional scorer
|
||||
@@ -17,7 +16,6 @@ pub struct RequiredOptionalScorer<TReqScorer, TOptScorer, TScoreCombiner> {
|
||||
req_scorer: TReqScorer,
|
||||
opt_scorer: TOptScorer,
|
||||
score_cache: Option<Score>,
|
||||
opt_finished: bool,
|
||||
_phantom: PhantomData<TScoreCombiner>,
|
||||
}
|
||||
|
||||
@@ -29,14 +27,12 @@ where
|
||||
/// Creates a new `RequiredOptionalScorer`.
|
||||
pub fn new(
|
||||
req_scorer: TReqScorer,
|
||||
mut opt_scorer: TOptScorer,
|
||||
opt_scorer: TOptScorer,
|
||||
) -> RequiredOptionalScorer<TReqScorer, TOptScorer, TScoreCombiner> {
|
||||
let opt_finished = !opt_scorer.advance();
|
||||
RequiredOptionalScorer {
|
||||
req_scorer,
|
||||
opt_scorer,
|
||||
score_cache: None,
|
||||
opt_finished,
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
@@ -48,7 +44,7 @@ where
|
||||
TReqScorer: DocSet,
|
||||
TOptScorer: DocSet,
|
||||
{
|
||||
fn advance(&mut self) -> bool {
|
||||
fn advance(&mut self) -> DocId {
|
||||
self.score_cache = None;
|
||||
self.req_scorer.advance()
|
||||
}
|
||||
@@ -76,22 +72,8 @@ where
|
||||
let doc = self.doc();
|
||||
let mut score_combiner = TScoreCombiner::default();
|
||||
score_combiner.update(&mut self.req_scorer);
|
||||
if !self.opt_finished {
|
||||
match self.opt_scorer.doc().cmp(&doc) {
|
||||
Ordering::Greater => {}
|
||||
Ordering::Equal => {
|
||||
score_combiner.update(&mut self.opt_scorer);
|
||||
}
|
||||
Ordering::Less => match self.opt_scorer.skip_next(doc) {
|
||||
SkipResult::Reached => {
|
||||
score_combiner.update(&mut self.opt_scorer);
|
||||
}
|
||||
SkipResult::End => {
|
||||
self.opt_finished = true;
|
||||
}
|
||||
SkipResult::OverStep => {}
|
||||
},
|
||||
}
|
||||
if self.opt_scorer.seek(doc) == doc {
|
||||
score_combiner.update(&mut self.opt_scorer);
|
||||
}
|
||||
let score = score_combiner.score();
|
||||
self.score_cache = Some(score);
|
||||
@@ -102,7 +84,7 @@ where
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::RequiredOptionalScorer;
|
||||
use crate::docset::DocSet;
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::postings::tests::test_skip_against_unoptimized;
|
||||
use crate::query::score_combiner::{DoNothingCombiner, SumCombiner};
|
||||
use crate::query::ConstScorer;
|
||||
@@ -119,8 +101,9 @@ mod tests {
|
||||
ConstScorer::from(VecDocSet::from(vec![])),
|
||||
);
|
||||
let mut docs = vec![];
|
||||
while reqoptscorer.advance() {
|
||||
while reqoptscorer.doc() != TERMINATED {
|
||||
docs.push(reqoptscorer.doc());
|
||||
reqoptscorer.advance();
|
||||
}
|
||||
assert_eq!(docs, req);
|
||||
}
|
||||
@@ -133,46 +116,45 @@ mod tests {
|
||||
ConstScorer::new(VecDocSet::from(vec![1, 2, 7, 11, 12, 15]), 1.0f32),
|
||||
);
|
||||
{
|
||||
assert!(reqoptscorer.advance());
|
||||
assert_eq!(reqoptscorer.doc(), 1);
|
||||
assert_eq!(reqoptscorer.score(), 2f32);
|
||||
}
|
||||
{
|
||||
assert!(reqoptscorer.advance());
|
||||
assert_eq!(reqoptscorer.advance(), 3);
|
||||
assert_eq!(reqoptscorer.doc(), 3);
|
||||
assert_eq!(reqoptscorer.score(), 1f32);
|
||||
}
|
||||
{
|
||||
assert!(reqoptscorer.advance());
|
||||
assert_eq!(reqoptscorer.advance(), 7);
|
||||
assert_eq!(reqoptscorer.doc(), 7);
|
||||
assert_eq!(reqoptscorer.score(), 2f32);
|
||||
}
|
||||
{
|
||||
assert!(reqoptscorer.advance());
|
||||
assert_eq!(reqoptscorer.advance(), 8);
|
||||
assert_eq!(reqoptscorer.doc(), 8);
|
||||
assert_eq!(reqoptscorer.score(), 1f32);
|
||||
}
|
||||
{
|
||||
assert!(reqoptscorer.advance());
|
||||
assert_eq!(reqoptscorer.advance(), 9);
|
||||
assert_eq!(reqoptscorer.doc(), 9);
|
||||
assert_eq!(reqoptscorer.score(), 1f32);
|
||||
}
|
||||
{
|
||||
assert!(reqoptscorer.advance());
|
||||
assert_eq!(reqoptscorer.advance(), 10);
|
||||
assert_eq!(reqoptscorer.doc(), 10);
|
||||
assert_eq!(reqoptscorer.score(), 1f32);
|
||||
}
|
||||
{
|
||||
assert!(reqoptscorer.advance());
|
||||
assert_eq!(reqoptscorer.advance(), 13);
|
||||
assert_eq!(reqoptscorer.doc(), 13);
|
||||
assert_eq!(reqoptscorer.score(), 1f32);
|
||||
}
|
||||
{
|
||||
assert!(reqoptscorer.advance());
|
||||
assert_eq!(reqoptscorer.advance(), 15);
|
||||
assert_eq!(reqoptscorer.doc(), 15);
|
||||
assert_eq!(reqoptscorer.score(), 2f32);
|
||||
}
|
||||
assert!(!reqoptscorer.advance());
|
||||
assert_eq!(reqoptscorer.advance(), TERMINATED);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use crate::common::BitSet;
|
||||
use crate::docset::{DocSet, SkipResult};
|
||||
use crate::docset::DocSet;
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
use downcast_rs::impl_downcast;
|
||||
@@ -13,14 +12,6 @@ pub trait Scorer: downcast_rs::Downcast + DocSet + 'static {
|
||||
///
|
||||
/// This method will perform a bit of computation and is not cached.
|
||||
fn score(&mut self) -> Score;
|
||||
|
||||
/// Iterates through all of the document matched by the DocSet
|
||||
/// `DocSet` and push the scored documents to the collector.
|
||||
fn for_each(&mut self, callback: &mut dyn FnMut(DocId, Score)) {
|
||||
while self.advance() {
|
||||
callback(self.doc(), self.score());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl_downcast!(Scorer);
|
||||
@@ -29,11 +20,6 @@ impl Scorer for Box<dyn Scorer> {
|
||||
fn score(&mut self) -> Score {
|
||||
self.deref_mut().score()
|
||||
}
|
||||
|
||||
fn for_each(&mut self, callback: &mut dyn FnMut(DocId, Score)) {
|
||||
let scorer = self.deref_mut();
|
||||
scorer.for_each(callback);
|
||||
}
|
||||
}
|
||||
|
||||
/// Wraps a `DocSet` and simply returns a constant `Scorer`.
|
||||
@@ -61,12 +47,12 @@ impl<TDocSet: DocSet> From<TDocSet> for ConstScorer<TDocSet> {
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> DocSet for ConstScorer<TDocSet> {
|
||||
fn advance(&mut self) -> bool {
|
||||
fn advance(&mut self) -> DocId {
|
||||
self.docset.advance()
|
||||
}
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
self.docset.skip_next(target)
|
||||
fn seek(&mut self, target: DocId) -> DocId {
|
||||
self.docset.seek(target)
|
||||
}
|
||||
|
||||
fn fill_buffer(&mut self, buffer: &mut [DocId]) -> usize {
|
||||
@@ -80,10 +66,6 @@ impl<TDocSet: DocSet> DocSet for ConstScorer<TDocSet> {
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.docset.size_hint()
|
||||
}
|
||||
|
||||
fn append_to_bitset(&mut self, bitset: &mut BitSet) {
|
||||
self.docset.append_to_bitset(bitset);
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet + 'static> Scorer for ConstScorer<TDocSet> {
|
||||
|
||||
@@ -26,10 +26,8 @@ mod tests {
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
{
|
||||
let doc = doc!(text_field => "a");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
let doc = doc!(text_field => "a");
|
||||
index_writer.add_document(doc);
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
@@ -40,7 +38,6 @@ mod tests {
|
||||
let term_weight = term_query.weight(&searcher, true).unwrap();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let mut term_scorer = term_weight.scorer(segment_reader, 1.0f32).unwrap();
|
||||
assert!(term_scorer.advance());
|
||||
assert_eq!(term_scorer.doc(), 0);
|
||||
assert_eq!(term_scorer.score(), 0.28768212);
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::docset::{DocSet, SkipResult};
|
||||
use crate::docset::DocSet;
|
||||
use crate::query::{Explanation, Scorer};
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
@@ -45,12 +45,12 @@ impl TermScorer {
|
||||
}
|
||||
|
||||
impl DocSet for TermScorer {
|
||||
fn advance(&mut self) -> bool {
|
||||
fn advance(&mut self) -> DocId {
|
||||
self.postings.advance()
|
||||
}
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
self.postings.skip_next(target)
|
||||
fn seek(&mut self, target: DocId) -> DocId {
|
||||
self.postings.seek(target)
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
|
||||
@@ -4,12 +4,13 @@ use crate::docset::DocSet;
|
||||
use crate::postings::SegmentPostings;
|
||||
use crate::query::bm25::BM25Weight;
|
||||
use crate::query::explanation::does_not_match;
|
||||
use crate::query::weight::{for_each_pruning_scorer, for_each_scorer};
|
||||
use crate::query::Weight;
|
||||
use crate::query::{Explanation, Scorer};
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::DocId;
|
||||
use crate::Result;
|
||||
use crate::Term;
|
||||
use crate::{Result, SkipResult};
|
||||
use crate::{DocId, Score};
|
||||
|
||||
pub struct TermWeight {
|
||||
term: Term,
|
||||
@@ -25,7 +26,7 @@ impl Weight for TermWeight {
|
||||
|
||||
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
|
||||
let mut scorer = self.scorer_specialized(reader, 1.0f32)?;
|
||||
if scorer.skip_next(doc) != SkipResult::Reached {
|
||||
if scorer.seek(doc) != doc {
|
||||
return Err(does_not_match(doc));
|
||||
}
|
||||
Ok(scorer.explain())
|
||||
@@ -43,6 +44,39 @@ impl Weight for TermWeight {
|
||||
.unwrap_or(0))
|
||||
}
|
||||
}
|
||||
|
||||
/// Iterates through all of the documents matched by the
|
||||
/// `DocSet` and pushes the scored documents to the collector.
|
||||
fn for_each(
|
||||
&self,
|
||||
reader: &SegmentReader,
|
||||
callback: &mut dyn FnMut(DocId, Score),
|
||||
) -> crate::Result<()> {
|
||||
let mut scorer = self.scorer_specialized(reader, 1.0f32)?;
|
||||
for_each_scorer(&mut scorer, callback);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Calls `callback` with all of the `(doc, score)` for which score
|
||||
/// is exceeding a given threshold.
|
||||
///
|
||||
/// This method is useful for the TopDocs collector.
|
||||
/// For all docsets, the blanket implementation has the benefit
|
||||
/// of prefiltering (doc, score) pairs, avoiding the
|
||||
/// virtual dispatch cost.
|
||||
///
|
||||
/// More importantly, it makes it possible for scorers to implement
|
||||
/// important optimization (e.g. BlockWAND for union).
|
||||
fn for_each_pruning(
|
||||
&self,
|
||||
threshold: f32,
|
||||
reader: &SegmentReader,
|
||||
callback: &mut dyn FnMut(DocId, Score) -> Score,
|
||||
) -> crate::Result<()> {
|
||||
let mut scorer = self.scorer(reader, 1.0f32)?;
|
||||
for_each_pruning_scorer(&mut scorer, threshold, callback);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl TermWeight {
|
||||
|
||||
@@ -1,10 +1,9 @@
|
||||
use crate::common::TinySet;
|
||||
use crate::docset::{DocSet, SkipResult};
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
|
||||
use crate::query::Scorer;
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
const HORIZON_NUM_TINYBITSETS: usize = 64;
|
||||
const HORIZON: u32 = 64u32 * HORIZON_NUM_TINYBITSETS as u32;
|
||||
@@ -47,17 +46,9 @@ where
|
||||
fn from(docsets: Vec<TScorer>) -> Union<TScorer, TScoreCombiner> {
|
||||
let non_empty_docsets: Vec<TScorer> = docsets
|
||||
.into_iter()
|
||||
.flat_map(
|
||||
|mut docset| {
|
||||
if docset.advance() {
|
||||
Some(docset)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
},
|
||||
)
|
||||
.filter(|docset| docset.doc() != TERMINATED)
|
||||
.collect();
|
||||
Union {
|
||||
let mut union = Union {
|
||||
docsets: non_empty_docsets,
|
||||
bitsets: Box::new([TinySet::empty(); HORIZON_NUM_TINYBITSETS]),
|
||||
scores: Box::new([TScoreCombiner::default(); HORIZON as usize]),
|
||||
@@ -65,7 +56,13 @@ where
|
||||
offset: 0,
|
||||
doc: 0,
|
||||
score: 0f32,
|
||||
};
|
||||
if union.refill() {
|
||||
union.advance();
|
||||
} else {
|
||||
union.doc = TERMINATED;
|
||||
}
|
||||
union
|
||||
}
|
||||
}
|
||||
|
||||
@@ -86,7 +83,7 @@ fn refill<TScorer: Scorer, TScoreCombiner: ScoreCombiner>(
|
||||
let delta = doc - min_doc;
|
||||
bitsets[(delta / 64) as usize].insert_mut(delta % 64u32);
|
||||
score_combiner[delta as usize].update(scorer);
|
||||
if !scorer.advance() {
|
||||
if scorer.advance() == TERMINATED {
|
||||
// remove the docset, it has been entirely consumed.
|
||||
return true;
|
||||
}
|
||||
@@ -99,6 +96,7 @@ impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Union<TScorer, TScoreCombin
|
||||
if let Some(min_doc) = self.docsets.iter().map(DocSet::doc).min() {
|
||||
self.offset = min_doc;
|
||||
self.cursor = 0;
|
||||
self.doc = min_doc;
|
||||
refill(
|
||||
&mut self.docsets,
|
||||
&mut *self.bitsets,
|
||||
@@ -133,30 +131,23 @@ where
|
||||
TScorer: Scorer,
|
||||
TScoreCombiner: ScoreCombiner,
|
||||
{
|
||||
fn advance(&mut self) -> bool {
|
||||
fn advance(&mut self) -> DocId {
|
||||
if self.advance_buffered() {
|
||||
return true;
|
||||
return self.doc;
|
||||
}
|
||||
if self.refill() {
|
||||
self.advance();
|
||||
true
|
||||
} else {
|
||||
false
|
||||
if !self.refill() {
|
||||
self.doc = TERMINATED;
|
||||
return TERMINATED;
|
||||
}
|
||||
if !self.advance_buffered() {
|
||||
return TERMINATED;
|
||||
}
|
||||
self.doc
|
||||
}
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
if !self.advance() {
|
||||
return SkipResult::End;
|
||||
}
|
||||
match self.doc.cmp(&target) {
|
||||
Ordering::Equal => {
|
||||
return SkipResult::Reached;
|
||||
}
|
||||
Ordering::Greater => {
|
||||
return SkipResult::OverStep;
|
||||
}
|
||||
Ordering::Less => {}
|
||||
fn seek(&mut self, target: DocId) -> DocId {
|
||||
if self.doc >= target {
|
||||
return self.doc;
|
||||
}
|
||||
let gap = target - self.offset;
|
||||
if gap < HORIZON {
|
||||
@@ -174,18 +165,11 @@ where
|
||||
|
||||
// Advancing until we reach the end of the bucket
|
||||
// or we reach a doc greater or equal to the target.
|
||||
while self.advance() {
|
||||
match self.doc().cmp(&target) {
|
||||
Ordering::Equal => {
|
||||
return SkipResult::Reached;
|
||||
}
|
||||
Ordering::Greater => {
|
||||
return SkipResult::OverStep;
|
||||
}
|
||||
Ordering::Less => {}
|
||||
}
|
||||
let mut doc = self.doc();
|
||||
while doc < target {
|
||||
doc = self.advance();
|
||||
}
|
||||
SkipResult::End
|
||||
doc
|
||||
} else {
|
||||
// clear the buffered info.
|
||||
for obsolete_tinyset in self.bitsets.iter_mut() {
|
||||
@@ -199,45 +183,42 @@ where
|
||||
// advance all docsets to a doc >= to the target.
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::clippy::collapsible_if))]
|
||||
unordered_drain_filter(&mut self.docsets, |docset| {
|
||||
if docset.doc() < target {
|
||||
if docset.skip_next(target) == SkipResult::End {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
false
|
||||
docset.seek(target) == TERMINATED
|
||||
});
|
||||
|
||||
// at this point all of the docsets
|
||||
// are positioned on a doc >= to the target.
|
||||
if self.refill() {
|
||||
self.advance();
|
||||
if self.doc() == target {
|
||||
SkipResult::Reached
|
||||
} else {
|
||||
debug_assert!(self.doc() > target);
|
||||
SkipResult::OverStep
|
||||
}
|
||||
} else {
|
||||
SkipResult::End
|
||||
if !self.refill() {
|
||||
self.doc = TERMINATED;
|
||||
return TERMINATED;
|
||||
}
|
||||
self.advance()
|
||||
}
|
||||
}
|
||||
|
||||
// TODO implement `count` efficiently.
|
||||
// TODO Also implement `count` with deletes efficiently.
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.doc
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
0u32
|
||||
self.docsets
|
||||
.iter()
|
||||
.map(|docset| docset.size_hint())
|
||||
.max()
|
||||
.unwrap_or(0u32)
|
||||
}
|
||||
|
||||
fn count_including_deleted(&mut self) -> u32 {
|
||||
if self.doc == TERMINATED {
|
||||
return 0;
|
||||
}
|
||||
let mut count = self.bitsets[self.cursor..HORIZON_NUM_TINYBITSETS]
|
||||
.iter()
|
||||
.map(|bitset| bitset.len())
|
||||
.sum::<u32>();
|
||||
.sum::<u32>()
|
||||
+ 1;
|
||||
for bitset in self.bitsets.iter_mut() {
|
||||
bitset.clear();
|
||||
}
|
||||
@@ -267,7 +248,7 @@ mod tests {
|
||||
|
||||
use super::Union;
|
||||
use super::HORIZON;
|
||||
use crate::docset::{DocSet, SkipResult};
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::postings::tests::test_skip_against_unoptimized;
|
||||
use crate::query::score_combiner::DoNothingCombiner;
|
||||
use crate::query::ConstScorer;
|
||||
@@ -296,12 +277,12 @@ mod tests {
|
||||
};
|
||||
let mut union: Union<_, DoNothingCombiner> = make_union();
|
||||
let mut count = 0;
|
||||
while union.advance() {
|
||||
assert!(union_expected.advance());
|
||||
while union.doc() != TERMINATED {
|
||||
assert_eq!(union_expected.doc(), union.doc());
|
||||
assert_eq!(union_expected.advance(), union.advance());
|
||||
count += 1;
|
||||
}
|
||||
assert!(!union_expected.advance());
|
||||
assert_eq!(union_expected.advance(), TERMINATED);
|
||||
assert_eq!(count, make_union().count_including_deleted());
|
||||
}
|
||||
|
||||
@@ -329,9 +310,7 @@ mod tests {
|
||||
fn test_aux_union_skip(docs_list: &[Vec<DocId>], skip_targets: Vec<DocId>) {
|
||||
let mut btree_set = BTreeSet::new();
|
||||
for docs in docs_list {
|
||||
for &doc in docs.iter() {
|
||||
btree_set.insert(doc);
|
||||
}
|
||||
btree_set.extend(docs.iter().cloned());
|
||||
}
|
||||
let docset_factory = || {
|
||||
let res: Box<dyn DocSet> = Box::new(Union::<_, DoNothingCombiner>::from(
|
||||
@@ -346,10 +325,10 @@ mod tests {
|
||||
};
|
||||
let mut docset = docset_factory();
|
||||
for el in btree_set {
|
||||
assert!(docset.advance());
|
||||
assert_eq!(el, docset.doc());
|
||||
docset.advance();
|
||||
}
|
||||
assert!(!docset.advance());
|
||||
assert_eq!(docset.doc(), TERMINATED);
|
||||
test_skip_against_unoptimized(docset_factory, skip_targets);
|
||||
}
|
||||
|
||||
@@ -372,10 +351,10 @@ mod tests {
|
||||
ConstScorer::from(VecDocSet::from(vec![0u32, 5u32])),
|
||||
ConstScorer::from(VecDocSet::from(vec![1u32, 4u32])),
|
||||
]);
|
||||
assert!(docset.advance());
|
||||
assert_eq!(docset.doc(), 0u32);
|
||||
assert_eq!(docset.skip_next(0u32), SkipResult::OverStep);
|
||||
assert_eq!(docset.doc(), 1u32)
|
||||
assert_eq!(docset.seek(0u32), 0u32);
|
||||
assert_eq!(docset.seek(0u32), 0u32);
|
||||
assert_eq!(docset.doc(), 0u32)
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
#![allow(dead_code)]
|
||||
|
||||
use crate::common::HasLen;
|
||||
use crate::docset::DocSet;
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::DocId;
|
||||
use std::num::Wrapping;
|
||||
|
||||
/// Simulates a `Postings` object from a `VecPostings`.
|
||||
/// `VecPostings` only exists for testing purposes.
|
||||
@@ -12,26 +11,30 @@ use std::num::Wrapping;
|
||||
/// No positions are returned.
|
||||
pub struct VecDocSet {
|
||||
doc_ids: Vec<DocId>,
|
||||
cursor: Wrapping<usize>,
|
||||
cursor: usize,
|
||||
}
|
||||
|
||||
impl From<Vec<DocId>> for VecDocSet {
|
||||
fn from(doc_ids: Vec<DocId>) -> VecDocSet {
|
||||
VecDocSet {
|
||||
doc_ids,
|
||||
cursor: Wrapping(usize::max_value()),
|
||||
}
|
||||
VecDocSet { doc_ids, cursor: 0 }
|
||||
}
|
||||
}
|
||||
|
||||
impl DocSet for VecDocSet {
|
||||
fn advance(&mut self) -> bool {
|
||||
self.cursor += Wrapping(1);
|
||||
self.doc_ids.len() > self.cursor.0
|
||||
fn advance(&mut self) -> DocId {
|
||||
self.cursor += 1;
|
||||
if self.cursor >= self.doc_ids.len() {
|
||||
self.cursor = self.doc_ids.len();
|
||||
return TERMINATED;
|
||||
}
|
||||
self.doc()
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.doc_ids[self.cursor.0]
|
||||
if self.cursor == self.doc_ids.len() {
|
||||
return TERMINATED;
|
||||
}
|
||||
self.doc_ids[self.cursor]
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
@@ -49,22 +52,21 @@ impl HasLen for VecDocSet {
|
||||
pub mod tests {
|
||||
|
||||
use super::*;
|
||||
use crate::docset::{DocSet, SkipResult};
|
||||
use crate::docset::DocSet;
|
||||
use crate::DocId;
|
||||
|
||||
#[test]
|
||||
pub fn test_vec_postings() {
|
||||
let doc_ids: Vec<DocId> = (0u32..1024u32).map(|e| e * 3).collect();
|
||||
let mut postings = VecDocSet::from(doc_ids);
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 0u32);
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.advance(), 3u32);
|
||||
assert_eq!(postings.doc(), 3u32);
|
||||
assert_eq!(postings.skip_next(14u32), SkipResult::OverStep);
|
||||
assert_eq!(postings.seek(14u32), 15u32);
|
||||
assert_eq!(postings.doc(), 15u32);
|
||||
assert_eq!(postings.skip_next(300u32), SkipResult::Reached);
|
||||
assert_eq!(postings.seek(300u32), 300u32);
|
||||
assert_eq!(postings.doc(), 300u32);
|
||||
assert_eq!(postings.skip_next(6000u32), SkipResult::End);
|
||||
assert_eq!(postings.seek(6000u32), TERMINATED);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,7 +1,45 @@
|
||||
use super::Scorer;
|
||||
use crate::core::SegmentReader;
|
||||
use crate::query::Explanation;
|
||||
use crate::DocId;
|
||||
use crate::{DocId, Score, TERMINATED};
|
||||
|
||||
/// Iterates through all of the documents matched by the
|
||||
/// `DocSet` and pushes the scored documents to the collector.
|
||||
pub(crate) fn for_each_scorer<TScorer: Scorer + ?Sized>(
|
||||
scorer: &mut TScorer,
|
||||
callback: &mut dyn FnMut(DocId, Score),
|
||||
) {
|
||||
let mut doc = scorer.doc();
|
||||
while doc != TERMINATED {
|
||||
callback(doc, scorer.score());
|
||||
doc = scorer.advance();
|
||||
}
|
||||
}
|
||||
|
||||
/// Calls `callback` with all of the `(doc, score)` for which score
|
||||
/// is exceeding a given threshold.
|
||||
///
|
||||
/// This method is useful for the TopDocs collector.
|
||||
/// For all docsets, the blanket implementation has the benefit
|
||||
/// of prefiltering (doc, score) pairs, avoiding the
|
||||
/// virtual dispatch cost.
|
||||
///
|
||||
/// More importantly, it makes it possible for scorers to implement
|
||||
/// important optimization (e.g. BlockWAND for union).
|
||||
pub(crate) fn for_each_pruning_scorer<TScorer: Scorer + ?Sized>(
|
||||
scorer: &mut TScorer,
|
||||
mut threshold: f32,
|
||||
callback: &mut dyn FnMut(DocId, Score) -> Score,
|
||||
) {
|
||||
let mut doc = scorer.doc();
|
||||
while doc != TERMINATED {
|
||||
let score = scorer.score();
|
||||
if score > threshold {
|
||||
threshold = callback(doc, score);
|
||||
}
|
||||
doc = scorer.advance();
|
||||
}
|
||||
}
|
||||
|
||||
/// A Weight is the specialization of a Query
|
||||
/// for a given set of segments.
|
||||
@@ -27,4 +65,37 @@ pub trait Weight: Send + Sync + 'static {
|
||||
Ok(scorer.count_including_deleted())
|
||||
}
|
||||
}
|
||||
|
||||
/// Iterates through all of the documents matched by the
|
||||
/// `DocSet` and pushes the scored documents to the collector.
|
||||
fn for_each(
|
||||
&self,
|
||||
reader: &SegmentReader,
|
||||
callback: &mut dyn FnMut(DocId, Score),
|
||||
) -> crate::Result<()> {
|
||||
let mut scorer = self.scorer(reader, 1.0f32)?;
|
||||
for_each_scorer(scorer.as_mut(), callback);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Calls `callback` with all of the `(doc, score)` for which score
|
||||
/// is exceeding a given threshold.
|
||||
///
|
||||
/// This method is useful for the TopDocs collector.
|
||||
/// For all docsets, the blanket implementation has the benefit
|
||||
/// of prefiltering (doc, score) pairs, avoiding the
|
||||
/// virtual dispatch cost.
|
||||
///
|
||||
/// More importantly, it makes it possible for scorers to implement
|
||||
/// important optimization (e.g. BlockWAND for union).
|
||||
fn for_each_pruning(
|
||||
&self,
|
||||
threshold: f32,
|
||||
reader: &SegmentReader,
|
||||
callback: &mut dyn FnMut(DocId, Score) -> Score,
|
||||
) -> crate::Result<()> {
|
||||
let mut scorer = self.scorer(reader, 1.0f32)?;
|
||||
for_each_pruning_scorer(scorer.as_mut(), threshold, callback);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@ use crate::directory::META_LOCK;
|
||||
use crate::Index;
|
||||
use crate::Searcher;
|
||||
use crate::SegmentReader;
|
||||
use std::convert::TryInto;
|
||||
use std::sync::Arc;
|
||||
|
||||
/// Defines when a new version of the index should be reloaded.
|
||||
@@ -60,7 +61,6 @@ impl IndexReaderBuilder {
|
||||
/// Building the reader is a non-trivial operation that requires
|
||||
/// to open different segment readers. It may take hundreds of milliseconds
|
||||
/// of time and it may return an error.
|
||||
/// TODO(pmasurel) Use the `TryInto` trait once it is available in stable.
|
||||
pub fn try_into(self) -> crate::Result<IndexReader> {
|
||||
let inner_reader = InnerIndexReader {
|
||||
index: self.index,
|
||||
@@ -113,6 +113,14 @@ impl IndexReaderBuilder {
|
||||
}
|
||||
}
|
||||
|
||||
impl TryInto<IndexReader> for IndexReaderBuilder {
|
||||
type Error = crate::TantivyError;
|
||||
|
||||
fn try_into(self) -> crate::Result<IndexReader> {
|
||||
IndexReaderBuilder::try_into(self)
|
||||
}
|
||||
}
|
||||
|
||||
struct InnerIndexReader {
|
||||
num_searchers: usize,
|
||||
searcher_pool: Pool<Searcher>,
|
||||
|
||||
@@ -3,7 +3,6 @@ use crate::common::BinarySerializable;
|
||||
use crate::common::VInt;
|
||||
use crate::tokenizer::PreTokenizedString;
|
||||
use crate::DateTime;
|
||||
use serde;
|
||||
use std::io::{self, Read, Write};
|
||||
use std::mem;
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use crate::common::BinarySerializable;
|
||||
use serde;
|
||||
use std::io;
|
||||
use std::io::Read;
|
||||
use std::io::Write;
|
||||
@@ -13,13 +12,13 @@ pub struct Field(u32);
|
||||
|
||||
impl Field {
|
||||
/// Create a new field object for the given FieldId.
|
||||
pub fn from_field_id(field_id: u32) -> Field {
|
||||
pub const fn from_field_id(field_id: u32) -> Field {
|
||||
Field(field_id)
|
||||
}
|
||||
|
||||
/// Returns a u32 identifying uniquely a field within a schema.
|
||||
#[allow(clippy::trivially_copy_pass_by_ref)]
|
||||
pub fn field_id(&self) -> u32 {
|
||||
pub const fn field_id(&self) -> u32 {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,10 +1,7 @@
|
||||
use crate::common::BinarySerializable;
|
||||
use crate::schema::Field;
|
||||
use crate::schema::Value;
|
||||
use serde;
|
||||
use std::io;
|
||||
use std::io::Read;
|
||||
use std::io::Write;
|
||||
use std::io::{self, Read, Write};
|
||||
|
||||
/// `FieldValue` holds together a `Field` and its `Value`.
|
||||
#[derive(Debug, Clone, Ord, PartialEq, Eq, PartialOrd, serde::Serialize, serde::Deserialize)]
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
use snap;
|
||||
|
||||
use std::io::{self, Read, Write};
|
||||
|
||||
/// Name of the compression scheme used in the doc store.
|
||||
|
||||
@@ -434,8 +434,8 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_automaton_search() {
|
||||
use levenshtein_automata::LevenshteinAutomatonBuilder;
|
||||
use crate::query::DFAWrapper;
|
||||
use levenshtein_automata::LevenshteinAutomatonBuilder;
|
||||
|
||||
const COUNTRIES: [&'static str; 7] = [
|
||||
"San Marino",
|
||||
|
||||
@@ -7,7 +7,6 @@ use crate::postings::TermInfo;
|
||||
use crate::termdict::TermOrdinal;
|
||||
use once_cell::sync::Lazy;
|
||||
use std::io::{self, Write};
|
||||
use tantivy_fst;
|
||||
use tantivy_fst::raw::Fst;
|
||||
use tantivy_fst::Automaton;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user