Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2026-01-02 15:22:55 +00:00

Compare commits: elshize-bm ... cutt (1 commit)

| Author | SHA1 | Date |
|---|---|---|
|  | 95f1114870 |  |
@@ -1,8 +1,6 @@
Tantivy 0.13.0
======================
- Bugfix in `FuzzyTermQuery` not matching terms by prefix when it should (@Peachball)
- Relaxed constraints on the custom/tweak score functions. At the segment level, they can be `mut`, and they are not required to be `Sync + Send`.
- `MMapDirectory::open` does not return a `Result` anymore.

Tantivy 0.12.0
======================
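The `FuzzyTermQuery` prefix bugfix above is easy to exercise. A minimal sketch of the restored behavior (the schema, document, and writer budget are illustrative, not taken from this diff):

```rust
use tantivy::collector::Count;
use tantivy::query::FuzzyTermQuery;
use tantivy::schema::{Schema, Term, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let body = schema_builder.add_text_field("body", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer = index.writer(10_000_000)?;
    index_writer.add_document(doc!(body => "frankenstein"));
    index_writer.commit()?;

    let searcher = index.reader()?.searcher();
    // Prefix mode: "franken" is a prefix of "frankenstein", so with the fix
    // this matches even though the full terms are far apart in edit distance.
    let query = FuzzyTermQuery::new_prefix(Term::from_field_text(body, "franken"), 1, true);
    assert_eq!(searcher.search(&query, &Count)?, 1);
    Ok(())
}
```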
@@ -18,7 +18,7 @@ byteorder = "1.0"
crc32fast = "1.2.0"
once_cell = "1.0"
regex = {version = "1.3.0", default-features = false, features = ["std"]}
-tantivy-fst = "0.3"
+tantivy-fst = {path="../tantivy-fst", version="0.3"}
memmap = {version = "0.7", optional=true}
lz4 = {version="1.20", optional=true}
snap = "1"
@@ -45,12 +45,11 @@ fnv = "1.0.6"
owned-read = "0.4"
failure = "0.1"
htmlescape = "0.3.1"
-fail = "0.4"
+fail = "0.3"
murmurhash32 = "0.2"
chrono = "0.4"
smallvec = "1.0"
rayon = "1"
# ordered-float = "1"

[target.'cfg(windows)'.dependencies]
winapi = "0.3"
@@ -59,11 +58,9 @@ winapi = "0.3"
rand = "0.7"
maplit = "1"
matches = "0.1.8"
proptest = "0.9"
float-cmp = "0.6"

[dev-dependencies.fail]
-version = "0.4"
+version = "0.3"
features = ["failpoints"]

[profile.release]
README.md (10 changed lines)

@@ -31,16 +31,12 @@ Tantivy is, in fact, strongly inspired by Lucene's design.

# Benchmark

Tantivy is typically faster than Lucene, but the results depend on
the nature of the queries in your workload.

The following [benchmark](https://tantivy-search.github.io/bench/) breaks down
performance for different types of queries / collections.

In general, Tantivy tends to be
- slower than Lucene on unions with a top-K collector, due to Lucene's Block-WAND optimization.
- faster than Lucene on intersection and phrase queries.

Your mileage WILL vary depending on the nature of queries and their load.

# Features

- Full-text search
@@ -1,98 +0,0 @@
use std::collections::HashSet;
use tantivy::collector::TopDocs;
use tantivy::doc;
use tantivy::query::BooleanQuery;
use tantivy::schema::*;
use tantivy::{DocId, Index, Score, SegmentReader};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();

    let title = schema_builder.add_text_field("title", STORED);
    let ingredient = schema_builder.add_facet_field("ingredient");

    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema.clone());

    let mut index_writer = index.writer(30_000_000)?;

    index_writer.add_document(doc!(
        title => "Fried egg",
        ingredient => Facet::from("/ingredient/egg"),
        ingredient => Facet::from("/ingredient/oil"),
    ));
    index_writer.add_document(doc!(
        title => "Scrambled egg",
        ingredient => Facet::from("/ingredient/egg"),
        ingredient => Facet::from("/ingredient/butter"),
        ingredient => Facet::from("/ingredient/milk"),
        ingredient => Facet::from("/ingredient/salt"),
    ));
    index_writer.add_document(doc!(
        title => "Egg rolls",
        ingredient => Facet::from("/ingredient/egg"),
        ingredient => Facet::from("/ingredient/garlic"),
        ingredient => Facet::from("/ingredient/salt"),
        ingredient => Facet::from("/ingredient/oil"),
        ingredient => Facet::from("/ingredient/tortilla-wrap"),
        ingredient => Facet::from("/ingredient/mushroom"),
    ));
    index_writer.commit()?;

    let reader = index.reader()?;
    let searcher = reader.searcher();
    {
        let facets = vec![
            Facet::from("/ingredient/egg"),
            Facet::from("/ingredient/oil"),
            Facet::from("/ingredient/garlic"),
            Facet::from("/ingredient/mushroom"),
        ];
        let query = BooleanQuery::new_multiterms_query(
            facets
                .iter()
                .map(|key| Term::from_facet(ingredient, &key))
                .collect(),
        );
        let top_docs_by_custom_score =
            TopDocs::with_limit(2).tweak_score(move |segment_reader: &SegmentReader| {
                let mut ingredient_reader = segment_reader.facet_reader(ingredient).unwrap();
                let facet_dict = ingredient_reader.facet_dict();

                let query_ords: HashSet<u64> = facets
                    .iter()
                    .filter_map(|key| facet_dict.term_ord(key.encoded_str()))
                    .collect();

                let mut facet_ords_buffer: Vec<u64> = Vec::with_capacity(20);

                move |doc: DocId, original_score: Score| {
                    ingredient_reader.facet_ords(doc, &mut facet_ords_buffer);
                    let missing_ingredients = facet_ords_buffer
                        .iter()
                        .filter(|ord| !query_ords.contains(ord))
                        .count();
                    let tweak = 1.0 / 4_f32.powi(missing_ingredients as i32);

                    original_score * tweak
                }
            });
        let top_docs = searcher.search(&query, &top_docs_by_custom_score)?;

        let titles: Vec<String> = top_docs
            .iter()
            .map(|(_, doc_id)| {
                searcher
                    .doc(*doc_id)
                    .unwrap()
                    .get_first(title)
                    .unwrap()
                    .text()
                    .unwrap()
                    .to_owned()
            })
            .collect();
        assert_eq!(titles, vec!["Fried egg", "Egg rolls"]);
    }
    Ok(())
}
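The deleted example boils down to one scoring rule: every ingredient a recipe has that the query did not ask for divides its score by 4, so "Scrambled egg" (three unrequested ingredients) drops below "Egg rolls" (two). Restated as a standalone function:

```rust
/// Exponential down-weighting used by the example above: a factor of 4
/// per ingredient present in the recipe but absent from the query.
fn ingredient_tweak(original_score: f32, missing_ingredients: usize) -> f32 {
    original_score / 4_f32.powi(missing_ingredients as i32)
}
```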
@@ -28,7 +28,7 @@ where
/// It is the segment local version of the [`CustomScorer`](./trait.CustomScorer.html).
pub trait CustomSegmentScorer<TScore>: 'static {
    /// Computes the score of a specific `doc`.
-    fn score(&mut self, doc: DocId) -> TScore;
+    fn score(&self, doc: DocId) -> TScore;
}

/// `CustomScorer` makes it possible to define any kind of score.
@@ -117,9 +117,9 @@ where

impl<F, TScore> CustomSegmentScorer<TScore> for F
where
-    F: 'static + FnMut(DocId) -> TScore,
+    F: 'static + Sync + Send + Fn(DocId) -> TScore,
{
-    fn score(&mut self, doc: DocId) -> TScore {
+    fn score(&self, doc: DocId) -> TScore {
        (self)(doc)
    }
}
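These traits back `TopDocs::custom_score`. A hedged sketch of the closure form they accept (a toy score; the per-segment comment marks where a fast field reader would normally be opened):

```rust
use tantivy::collector::TopDocs;
use tantivy::{DocId, SegmentReader};

fn toy_custom_score_collector() -> impl tantivy::collector::Collector {
    // The outer closure runs once per segment; the inner one scores each doc.
    // Under the `FnMut` bound on the left-hand side of this hunk, the inner
    // closure may also carry mutable state (e.g. a reusable buffer).
    TopDocs::with_limit(10).custom_score(|_segment_reader: &SegmentReader| {
        |doc: DocId| u64::from(doc) // toy score: favor larger doc ids
    })
}
```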
@@ -84,7 +84,7 @@ See the `custom_collector` example.

*/

-use crate::{DocId, Searcher, Executor};
+use crate::DocId;
use crate::Score;
use crate::SegmentLocalId;
use crate::SegmentReader;
@@ -100,9 +100,6 @@ mod top_collector;

mod top_score_collector;
pub use self::top_score_collector::TopDocs;
#[cfg(test)]
pub(crate) use self::top_score_collector::TopScoreSegmentCollector;

mod custom_score_top_collector;
pub use self::custom_score_top_collector::{CustomScorer, CustomSegmentScorer};
@@ -112,9 +109,6 @@ pub use self::tweak_score_top_collector::{ScoreSegmentTweaker, ScoreTweaker};

mod facet_collector;
pub use self::facet_collector::FacetCollector;
-use crate::fastfield::DeleteBitSet;
-use crate::query::{Scorer, Weight};
-use std::borrow::BorrowMut;

/// `Fruit` is the type for the result of our collection.
/// e.g. `usize` for the `Count` collector.
@@ -122,8 +116,6 @@ pub trait Fruit: Send + downcast_rs::Downcast {}

impl<T> Fruit for T where T: Send + downcast_rs::Downcast {}

/// Collectors are in charge of collecting and retaining relevant
/// information from the documents found and scored by the query.
///
@@ -162,20 +154,6 @@ pub trait Collector: Sync {
    /// Combines the fruit associated with the collection of each segment
    /// into one fruit.
    fn merge_fruits(&self, segment_fruits: Vec<Self::Fruit>) -> crate::Result<Self::Fruit>;

-    fn collect_weight(&self, searcher: &Searcher, weight: &dyn Weight, executor: &Executor) -> crate::Result<Self::Fruit> {
-        let segment_readers = searcher.segment_readers();
-        let fruits = executor.map(
-            |(segment_ord, segment_reader)| {
-                let mut scorer = weight.scorer(segment_reader, 1.0f32)?;
-                let segment_collector =
-                    self.for_segment(segment_ord as u32, segment_reader)?;
-                Ok(segment_collector.collect_scorer(scorer.borrow_mut(), segment_reader.delete_bitset()))
-            },
-            segment_readers.iter().enumerate(),
-        )?;
-        self.merge_fruits(fruits)
-    }
}

/// The `SegmentCollector` is the trait in charge of defining the
@@ -183,7 +161,7 @@ pub trait Collector: Sync {
///
/// `.collect(doc, score)` will be called for every document
/// matching the query.
-pub trait SegmentCollector: 'static + Sized {
+pub trait SegmentCollector: 'static {
    /// `Fruit` is the type for the result of our collection.
    /// e.g. `usize` for the `Count` collector.
    type Fruit: Fruit;
@@ -193,19 +171,6 @@ pub trait SegmentCollector: 'static + Sized {

    /// Extract the fruit of the collection from the `SegmentCollector`.
    fn harvest(self) -> Self::Fruit;

-    fn collect_scorer(mut self, scorer: &mut dyn Scorer, delete_bitset: Option<&DeleteBitSet>) -> Self::Fruit {
-        if let Some(delete_bitset) = delete_bitset {
-            scorer.for_each(&mut |doc, score| {
-                if delete_bitset.is_alive(doc) {
-                    self.collect(doc, score);
-                }
-            });
-        } else {
-            scorer.for_each(&mut |doc, score| self.collect(doc, score));
-        }
-        self.harvest()
-    }
}

// -----------------------------------------------
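Since `Collector`/`SegmentCollector` is the central contract of this module, a minimal sketch may help (modeled directly on the trait definitions above; a scoring-free document counter, purely illustrative since tantivy already ships a `Count` collector):

```rust
use tantivy::collector::{Collector, SegmentCollector};
use tantivy::{DocId, Score, SegmentLocalId, SegmentReader};

struct SimpleCount;
struct SimpleSegmentCount(usize);

impl Collector for SimpleCount {
    type Fruit = usize;
    type Child = SimpleSegmentCount;

    fn for_segment(&self, _: SegmentLocalId, _: &SegmentReader) -> tantivy::Result<Self::Child> {
        Ok(SimpleSegmentCount(0)) // fresh per-segment state
    }

    fn requires_scoring(&self) -> bool {
        false // lets the engine skip score computation entirely
    }

    fn merge_fruits(&self, fruits: Vec<usize>) -> tantivy::Result<usize> {
        Ok(fruits.into_iter().sum()) // combine per-segment counts
    }
}

impl SegmentCollector for SimpleSegmentCount {
    type Fruit = usize;

    fn collect(&mut self, _doc: DocId, _score: Score) {
        self.0 += 1;
    }

    fn harvest(self) -> usize {
        self.0
    }
}
```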
@@ -56,7 +56,7 @@ impl<T: PartialOrd, D: PartialOrd> PartialEq for ComparableDoc<T, D> {
impl<T: PartialOrd, D: PartialOrd> Eq for ComparableDoc<T, D> {}

pub(crate) struct TopCollector<T> {
-    pub limit: usize,
+    limit: usize,
    _marker: PhantomData<T>,
}

@@ -69,7 +69,9 @@ where
    /// # Panics
    /// The method panics if limit is 0
    pub fn with_limit(limit: usize) -> TopCollector<T> {
-        assert!(limit > 0, "Limit must be strictly greater than 0.");
+        if limit < 1 {
+            panic!("Limit must be strictly greater than 0.");
+        }
        TopCollector {
            limit,
            _marker: PhantomData,
@@ -122,13 +124,13 @@ where
/// The theoretical complexity for collecting the top `K` out of `n` documents
/// is `O(n log K)`.
pub(crate) struct TopSegmentCollector<T> {
-    pub limit: usize,
+    limit: usize,
    heap: BinaryHeap<ComparableDoc<T, DocId>>,
    segment_id: u32,
}

impl<T: PartialOrd> TopSegmentCollector<T> {
-    pub fn new(segment_id: SegmentLocalId, limit: usize) -> TopSegmentCollector<T> {
+    fn new(segment_id: SegmentLocalId, limit: usize) -> TopSegmentCollector<T> {
        TopSegmentCollector {
            limit,
            heap: BinaryHeap::with_capacity(limit),
@@ -159,10 +161,6 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
        self.heap.len() >= self.limit
    }

-    pub fn pruning_score(&self) -> Option<T> {
-        self.heap.peek().map(|head| head.feature.clone())
-    }

    /// Collects a document scored by the given feature
    ///
    /// It collects documents until it has reached the max capacity. Once it reaches capacity, it
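The `O(n log K)` claim and the removed `pruning_score` both follow from the heap discipline: the root of a size-K min-heap is the worst retained score, i.e. the admission threshold. A generic sketch of that discipline:

```rust
use std::cmp::Reverse;
use std::collections::BinaryHeap;

/// Keep the K largest items in O(n log K). `heap.peek()` always exposes the
/// smallest retained item, which plays the role of `pruning_score` above.
fn top_k(items: impl Iterator<Item = u32>, k: usize) -> Vec<u32> {
    let mut heap: BinaryHeap<Reverse<u32>> = BinaryHeap::with_capacity(k);
    for item in items {
        if heap.len() < k {
            heap.push(Reverse(item));
        } else if heap.peek().map_or(false, |Reverse(min)| item > *min) {
            heap.pop(); // evict the current minimum
            heap.push(Reverse(item));
        }
    }
    // Ascending order over `Reverse` is descending order over the values.
    heap.into_sorted_vec().into_iter().map(|Reverse(v)| v).collect()
}
```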
@@ -8,13 +8,12 @@ use crate::collector::{
};
use crate::fastfield::FastFieldReader;
use crate::schema::Field;
-use crate::{DocAddress, Executor, Searcher};
+use crate::DocAddress;
use crate::DocId;
use crate::Score;
use crate::SegmentLocalId;
use crate::SegmentReader;
use std::fmt;
-use crate::query::{Weight, PruningScorerIfPossible};

/// The `TopDocs` collector keeps track of the top `K` documents
/// sorted by their score.
@@ -67,7 +66,7 @@ struct ScorerByFastFieldReader {
}

impl CustomSegmentScorer<u64> for ScorerByFastFieldReader {
-    fn score(&mut self, doc: DocId) -> u64 {
+    fn score(&self, doc: DocId) -> u64 {
        self.ff_reader.get_u64(u64::from(doc))
    }
}
@@ -418,42 +417,6 @@ impl Collector for TopDocs {
        true
    }

-    fn collect_weight(&self, searcher: &Searcher, weight: &dyn Weight, executor: &Executor) -> crate::Result<Self::Fruit> {
-        let segment_readers = searcher.segment_readers();
-        let fruits = executor.map(
-            |(segment_ord, segment_reader)| {
-                match weight.pruning_scorer(segment_reader, 1.0f32)? {
-                    PruningScorerIfPossible::NonPruning(mut scorer) => {
-                        let segment_collector =
-                            self.for_segment(segment_ord as u32, segment_reader)?;
-                        let fruit =
-                            segment_collector.collect_scorer(scorer.as_mut(), segment_reader.delete_bitset());
-                        Ok(fruit)
-                    }
-                    PruningScorerIfPossible::Pruning(mut pruning_scorer) => {
-                        let limit = self.0.limit;
-                        let mut segment_collector =
-                            self.for_segment(segment_ord as u32, segment_reader)?;
-                        for _ in 0..limit {
-                            if !pruning_scorer.advance() {
-                                return Ok(segment_collector.harvest());
-                            }
-                            segment_collector.collect(pruning_scorer.doc(), pruning_scorer.score());
-                        }
-                        let mut pruning_score = segment_collector.0.pruning_score().unwrap_or(0.0f32);
-                        while pruning_scorer.advance_with_pruning(pruning_score) {
-                            segment_collector.0.collect(pruning_scorer.doc(), pruning_scorer.score());
-                            pruning_score = segment_collector.0.pruning_score().unwrap_or(0.0f32);
-                        }
-                        Ok(segment_collector.harvest())
-                    }
-                }
-            },
-            segment_readers.iter().enumerate(),
-        )?;
-        self.merge_fruits(fruits)
-    }

    fn merge_fruits(
        &self,
        child_fruits: Vec<Vec<(Score, DocAddress)>>,
@@ -465,12 +428,6 @@ impl Collector for TopDocs {

/// Segment Collector associated with `TopDocs`.
pub struct TopScoreSegmentCollector(TopSegmentCollector<Score>);

-impl TopScoreSegmentCollector {
-    pub fn new(segment_id: SegmentLocalId, limit: usize) -> Self {
-        TopScoreSegmentCollector(TopSegmentCollector::new(segment_id, limit))
-    }
-}

impl SegmentCollector for TopScoreSegmentCollector {
    type Fruit = Vec<(Score, DocAddress)>;
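Note the two-phase structure of the removed `collect_weight` specialization above: the first `limit` hits are collected with plain `advance()` calls, because until the heap holds `limit` entries there is no meaningful threshold. Once the heap is full, `pruning_score()` (the heap minimum) becomes the bar, and the scorer is driven through `advance_with_pruning(threshold)`, which is the hook where a Block-Max WAND scorer can skip whole blocks whose score upper bound cannot beat the threshold.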
@@ -29,7 +29,7 @@ where
/// It is the segment local version of the [`ScoreTweaker`](./trait.ScoreTweaker.html).
pub trait ScoreSegmentTweaker<TScore>: 'static {
    /// Tweak the given `score` for the document `doc`.
-    fn score(&mut self, doc: DocId, score: Score) -> TScore;
+    fn score(&self, doc: DocId, score: Score) -> TScore;
}

/// `ScoreTweaker` makes it possible to tweak the score
@@ -121,9 +121,9 @@ where

impl<F, TScore> ScoreSegmentTweaker<TScore> for F
where
-    F: 'static + FnMut(DocId, Score) -> TScore,
+    F: 'static + Sync + Send + Fn(DocId, Score) -> TScore,
{
-    fn score(&mut self, doc: DocId, score: Score) -> TScore {
+    fn score(&self, doc: DocId, score: Score) -> TScore {
        (self)(doc, score)
    }
}
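`ScoreTweaker`/`ScoreSegmentTweaker` back `TopDocs::tweak_score`, which, unlike `custom_score`, hands the inner closure the original BM25 score. A hedged sketch (a flat boost; real uses would read a fast field in the per-segment closure, as in the deleted example earlier in this diff):

```rust
use tantivy::collector::TopDocs;
use tantivy::{DocId, Score, SegmentReader};

fn boosted_top_docs() -> impl tantivy::collector::Collector {
    TopDocs::with_limit(10).tweak_score(|_segment_reader: &SegmentReader| {
        // Per-segment setup (e.g. opening a fast field reader) would go here.
        |_doc: DocId, original_score: Score| original_score * 1.5
    })
}
```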
@@ -21,6 +21,7 @@ use crate::schema::FieldType;
use crate::schema::Schema;
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::IndexWriter;
use num_cpus;
use std::borrow::BorrowMut;
use std::collections::HashSet;
use std::fmt;

@@ -3,7 +3,9 @@ use crate::core::SegmentId;
use crate::schema::Schema;
use crate::Opstamp;
use census::{Inventory, TrackedObject};
use serde;
use serde::{Deserialize, Serialize};
use serde_json;
use std::collections::HashSet;
use std::fmt;
use std::path::PathBuf;
@@ -4,6 +4,7 @@ use crate::core::Executor;
use crate::core::InvertedIndexReader;
use crate::core::SegmentReader;
use crate::query::Query;
use crate::query::Scorer;
use crate::query::Weight;
use crate::schema::Document;
use crate::schema::Schema;
@@ -23,9 +24,17 @@ fn collect_segment<C: Collector>(
    segment_reader: &SegmentReader,
) -> crate::Result<C::Fruit> {
    let mut scorer = weight.scorer(segment_reader, 1.0f32)?;
-    let segment_collector =
-        collector.for_segment(segment_ord as u32, segment_reader)?;
-    Ok(segment_collector.collect_scorer(&mut scorer, segment_reader.delete_bitset()))
+    let mut segment_collector = collector.for_segment(segment_ord as u32, segment_reader)?;
+    if let Some(delete_bitset) = segment_reader.delete_bitset() {
+        scorer.for_each(&mut |doc, score| {
+            if delete_bitset.is_alive(doc) {
+                segment_collector.collect(doc, score);
+            }
+        });
+    } else {
+        scorer.for_each(&mut |doc, score| segment_collector.collect(doc, score));
+    }
+    Ok(segment_collector.harvest())
}

/// Holds a list of `SegmentReader`s ready for search.
@@ -11,6 +11,7 @@ use crate::error::DataCorruption;
use crate::Directory;

use crc32fast::Hasher;
use serde_json;
use std::collections::HashSet;
use std::io;
use std::io::Write;
@@ -1,3 +1,10 @@
+use fs2;
+use notify;
+
+use self::fs2::FileExt;
+use self::notify::RawEvent;
+use self::notify::RecursiveMode;
+use self::notify::Watcher;
use crate::core::META_FILEPATH;
use crate::directory::error::LockError;
use crate::directory::error::{
@@ -13,11 +20,8 @@ use crate::directory::WatchCallback;
use crate::directory::WatchCallbackList;
use crate::directory::WatchHandle;
use crate::directory::{TerminatingWrite, WritePtr};
-use fs2::FileExt;
+use atomicwrites;
use memmap::Mmap;
-use notify::RawEvent;
-use notify::RecursiveMode;
-use notify::Watcher;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::convert::From;
@@ -220,13 +224,17 @@ struct MmapDirectoryInner {
}

impl MmapDirectoryInner {
-    fn new(root_path: PathBuf, temp_directory: Option<TempDir>) -> MmapDirectoryInner {
-        MmapDirectoryInner {
+    fn new(
+        root_path: PathBuf,
+        temp_directory: Option<TempDir>,
+    ) -> Result<MmapDirectoryInner, OpenDirectoryError> {
+        let mmap_directory_inner = MmapDirectoryInner {
            root_path,
            mmap_cache: Default::default(),
            _temp_directory: temp_directory,
            watcher: RwLock::new(None),
-        }
+        };
+        Ok(mmap_directory_inner)
    }

    fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
@@ -260,11 +268,14 @@ impl fmt::Debug for MmapDirectory {
}

impl MmapDirectory {
-    fn new(root_path: PathBuf, temp_directory: Option<TempDir>) -> MmapDirectory {
-        let inner = MmapDirectoryInner::new(root_path, temp_directory);
-        MmapDirectory {
+    fn new(
+        root_path: PathBuf,
+        temp_directory: Option<TempDir>,
+    ) -> Result<MmapDirectory, OpenDirectoryError> {
+        let inner = MmapDirectoryInner::new(root_path, temp_directory)?;
+        Ok(MmapDirectory {
            inner: Arc::new(inner),
-        }
+        })
    }

    /// Creates a new MmapDirectory in a temporary directory.
@@ -274,7 +285,7 @@ impl MmapDirectory {
    pub fn create_from_tempdir() -> Result<MmapDirectory, OpenDirectoryError> {
        let tempdir = TempDir::new().map_err(OpenDirectoryError::IoError)?;
        let tempdir_path = PathBuf::from(tempdir.path());
-        Ok(MmapDirectory::new(tempdir_path, Some(tempdir)))
+        MmapDirectory::new(tempdir_path, Some(tempdir))
    }

    /// Opens a MmapDirectory in a directory.
@@ -292,7 +303,7 @@ impl MmapDirectory {
                directory_path,
            )))
        } else {
-            Ok(MmapDirectory::new(PathBuf::from(directory_path), None))
+            Ok(MmapDirectory::new(PathBuf::from(directory_path), None)?)
        }
    }
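With this change, `MmapDirectory` construction is fallible again, so callers propagate `OpenDirectoryError`. A hedged caller-side sketch (the path is illustrative):

```rust
use std::path::Path;
use tantivy::directory::error::OpenDirectoryError;
use tantivy::directory::MmapDirectory;

fn open_dir() -> Result<MmapDirectory, OpenDirectoryError> {
    // Fails if the path does not exist or is not a directory.
    MmapDirectory::open(Path::new("/tmp/my-index"))
}
```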
@@ -7,6 +7,7 @@ use crate::directory::error::{Incompatibility, LockError};
use crate::fastfield::FastFieldNotAvailableError;
use crate::query;
use crate::schema;
use serde_json;
use std::fmt;
use std::path::PathBuf;
use std::sync::PoisonError;

@@ -23,6 +23,7 @@ use futures::channel::oneshot;
use futures::executor::{ThreadPool, ThreadPoolBuilder};
use futures::future::Future;
use futures::future::TryFutureExt;
use serde_json;
use std::borrow::BorrowMut;
use std::collections::HashSet;
use std::io::Write;
src/lib.rs (138 changed lines)

@@ -269,6 +269,144 @@ impl DocAddress {
    }
}

mod sse2 {
    use crate::postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
    use std::arch::x86_64::__m128i as DataType;
    use std::arch::x86_64::_mm_add_epi32 as op_add;
    use std::arch::x86_64::_mm_cmplt_epi32 as op_lt;
    use std::arch::x86_64::_mm_load_si128 as op_load; // requires 128-bit alignment
    use std::arch::x86_64::_mm_set1_epi32 as set1;
    use std::arch::x86_64::_mm_setzero_si128 as set0;
    use std::arch::x86_64::_mm_sub_epi32 as op_sub;
    use std::arch::x86_64::{_mm_cvtsi128_si32, _mm_shuffle_epi32};

    const MASK1: i32 = 78;
    const MASK2: i32 = 177;

    /// Performs an exhaustive linear search over the block.
    ///
    /// There is no early exit here. We simply count the
    /// number of elements that are `< target`.
    pub unsafe fn linear_search_sse2_128(arr: &AlignedBuffer, target: u32) -> usize {
        let ptr = arr as *const AlignedBuffer as *const DataType;
        let vkey = set1(target as i32);
        let mut cnt = set0();
        // We work over 4 `__m128i` at a time.
        // A single `__m128i` actually contains 4 `u32`.
        for i in 0..(COMPRESSION_BLOCK_SIZE as isize) / (4 * 4) {
            let cmp1 = op_lt(op_load(ptr.offset(i * 4)), vkey);
            let cmp2 = op_lt(op_load(ptr.offset(i * 4 + 1)), vkey);
            let cmp3 = op_lt(op_load(ptr.offset(i * 4 + 2)), vkey);
            let cmp4 = op_lt(op_load(ptr.offset(i * 4 + 3)), vkey);
            let sum = op_add(op_add(cmp1, cmp2), op_add(cmp3, cmp4));
            cnt = op_sub(cnt, sum);
        }
        cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK1));
        cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK2));
        _mm_cvtsi128_si32(cnt) as usize
    }
}
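A note on the SIMD above: `_mm_cmplt_epi32` sets a lane to -1 wherever the element is `< target`, so `op_sub(cnt, sum)` accumulates per-lane counts, and the two `_mm_shuffle_epi32` folds (masks 78 and 177, which swap the 64-bit halves and then adjacent 32-bit lanes) reduce the four lane counts to one. A scalar restatement of what the function computes:

```rust
/// Scalar equivalent of `linear_search_sse2_128`: count the elements of the
/// 128-entry block that are strictly smaller than `target`. On a sorted
/// block this count is exactly the index of the first element >= target.
/// (The SSE2 version compares as signed i32, which stays correct while doc
/// ids remain below 2^31; cf. `MAX_DOC = 2_000_000_000` just below.)
fn linear_search_scalar_128(block: &[u32; 128], target: u32) -> usize {
    block.iter().filter(|&&doc| doc < target).count()
}
```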

const MAX_DOC: u32 = 2_000_000_000u32;

use sse2::linear_search_sse2_128;
use crate::postings::BlockSegmentPostings;
use crate::schema::IndexRecordOption;

struct DocCursor {
    block_segment_postings: BlockSegmentPostings,
    cursor: usize,
}

impl DocCursor {
    fn new(mut block_segment_postings: BlockSegmentPostings) -> DocCursor {
        block_segment_postings.advance();
        DocCursor {
            block_segment_postings,
            cursor: 0,
        }
    }

    fn doc(&self) -> DocId {
        self.block_segment_postings.doc(self.cursor)
    }

    fn len(&self) -> usize {
        self.block_segment_postings.doc_freq()
    }

    fn advance(&mut self) {
        if self.cursor == 127 {
            self.block_segment_postings.advance();
            self.cursor = 0;
        } else {
            self.cursor += 1;
        }
    }

    fn skip_to(&mut self, target: DocId) {
        if self.block_segment_postings.skip_reader.doc() >= target {
            // The target still lies within the currently decoded block:
            // a scalar scan forward from the cursor is enough.
            unsafe {
                let mut ptr = self
                    .block_segment_postings
                    .docs_aligned_b()
                    .0
                    .get_unchecked(self.cursor) as *const u32;
                while *ptr < target {
                    self.cursor += 1;
                    ptr = ptr.offset(1);
                }
            }
            return;
        }
        self.block_segment_postings.skip_to_b(target);
        let block = self.block_segment_postings.docs_aligned_b();
        self.cursor = unsafe { linear_search_sse2_128(block, target) };
    }
}
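Note how `DocCursor::skip_to` splits the work: if the skip reader's recorded last-doc already covers `target`, the match must sit in the block that is already decoded, so a forward scalar scan from `self.cursor` suffices; otherwise `skip_to_b` skips at block granularity and the SSE2 search positions the cursor inside the freshly decoded block.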

pub fn intersection(idx: &InvertedIndexReader, terms: &[Term]) -> crate::Result<u32> {
    let mut posting_lists: Vec<DocCursor> = Vec::with_capacity(terms.len());
    for term in terms {
        if let Some(block_postings) = idx.read_block_postings(term, IndexRecordOption::Basic) {
            posting_lists.push(DocCursor::new(block_postings));
        } else {
            return Ok(0);
        }
    }
    posting_lists.sort_by_key(|posting_list| posting_list.len());
    Ok(intersection_idx(&mut posting_lists[..]))
}

fn intersection_idx(posting_lists: &mut [DocCursor]) -> DocId {
    let mut candidate = posting_lists[0].doc();
    let mut i = 1;

    let mut count = 0u32;

    let num_posting_lists = posting_lists.len();
    loop {
        while i < num_posting_lists {
            posting_lists[i].skip_to(candidate);
            if posting_lists[i].doc() != candidate {
                candidate = posting_lists[i].doc();
                i = 0;
                continue;
            }
            i += 1;
        }
        count += 1;
        if candidate == MAX_DOC {
            break;
        }
        posting_lists[0].advance();
        candidate = posting_lists[0].doc();
        i = 1;
    }
    count - 1u32
}
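This is a textbook leapfrog intersection: lists are sorted by ascending length so the shortest list drives the candidate; every other list `skip_to`s the candidate and either confirms it or supplies a larger replacement (resetting `i = 0`). Because exhausted tails are padded with the `MAX_DOC` sentinel, termination falls out naturally, and `count - 1` discards the final sentinel "match".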

/// `DocAddress` contains all the necessary information
/// to identify a document given a `Searcher` object.
///
@@ -1,16 +0,0 @@
use crate::postings::Postings;
use crate::DocId;

/// Inverted list with additional information about the maximum term frequency
/// within a block, as well as globally within the list.
pub trait BlockMaxPostings: Postings {
    /// Returns the maximum frequency in the entire list.
    fn max_term_freq(&self) -> u32;
    /// Returns the maximum frequency in the current block.
    fn block_max_term_freq(&mut self) -> u32;
    /// Returns the document with the largest frequency.
    fn max_doc(&self) -> DocId;
    /// Returns the document with the largest frequency within the current
    /// block.
    fn block_max_doc(&self) -> DocId;
}
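What makes this trait useful: for any scoring function that grows monotonically with term frequency (BM25's term-frequency component does), the per-block maximum tf yields a per-block score upper bound without decoding the block. A hedged sketch, assuming the `BlockMaxPostings` trait above is in scope:

```rust
use tantivy::Score;

/// Upper-bound the score of any document in the current block, for any
/// scoring function that is monotonically increasing in term frequency.
fn block_upper_bound<P: BlockMaxPostings>(
    postings: &mut P,
    score_from_tf: impl Fn(u32) -> Score,
) -> Score {
    score_from_tf(postings.block_max_term_freq())
}
```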
@@ -1,76 +0,0 @@
use crate::postings::{BlockMaxPostings, Postings, SegmentPostings};
use crate::{DocId, DocSet, SkipResult};

/// A wrapper over [`SegmentPostings`](./struct.SegmentPostings.html)
/// with max block frequencies.
pub struct BlockMaxSegmentPostings {
    postings: SegmentPostings,
    max_blocks: SegmentPostings,
    doc_with_max_term_freq: DocId,
    max_term_freq: u32,
}

impl BlockMaxSegmentPostings {
    /// Constructs a new segment postings with block-max information.
    pub fn new(
        postings: SegmentPostings,
        max_blocks: SegmentPostings,
        doc_with_max_term_freq: DocId,
        max_term_freq: u32,
    ) -> Self {
        Self {
            postings,
            max_blocks,
            doc_with_max_term_freq,
            max_term_freq,
        }
    }
}

impl DocSet for BlockMaxSegmentPostings {
    fn advance(&mut self) -> bool {
        self.postings.advance()
    }

    fn doc(&self) -> DocId {
        self.postings.doc()
    }

    fn size_hint(&self) -> u32 {
        self.postings.size_hint()
    }

    fn skip_next(&mut self, target: DocId) -> SkipResult {
        self.postings.skip_next(target)
    }
}

impl Postings for BlockMaxSegmentPostings {
    fn term_freq(&self) -> u32 {
        self.postings.term_freq()
    }
    fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
        self.postings.positions_with_offset(offset, output);
    }
    fn positions(&mut self, output: &mut Vec<u32>) {
        self.postings.positions(output);
    }
}

impl BlockMaxPostings for BlockMaxSegmentPostings {
    fn max_term_freq(&self) -> u32 {
        self.max_term_freq
    }
    fn block_max_term_freq(&mut self) -> u32 {
        if let SkipResult::End = self.max_blocks.skip_next(self.doc()) {
            panic!("Max blocks corrupted: reached end of max block");
        }
        self.max_blocks.term_freq()
    }
    fn max_doc(&self) -> DocId {
        self.doc_with_max_term_freq
    }
    fn block_max_doc(&self) -> DocId {
        self.max_blocks.doc()
    }
}
@@ -25,7 +25,7 @@ mod sse2 {
    ///
    /// There is no early exit here. We simply count the
    /// number of elements that are `< target`.
-    pub(crate) fn linear_search_sse2_128(arr: &AlignedBuffer, target: u32) -> usize {
+    pub fn linear_search_sse2_128(arr: &AlignedBuffer, target: u32) -> usize {
        unsafe {
            let ptr = arr as *const AlignedBuffer as *const DataType;
            let vkey = set1(target as i32);
@@ -63,6 +63,8 @@ mod sse2 {
    }
}

+pub use self::sse2::linear_search_sse2_128;

/// This `linear search` browses exhaustively through the array,
/// but the early exit is very difficult to predict.
///
@@ -1,316 +0,0 @@
use crate::DocId;
use tantivy_fst::Streamer;
use crate::postings::{SkipReader, FreqReadingOption, USE_SKIP_INFO_LIMIT};
use owned_read::OwnedRead;
use crate::postings::compression::{BlockDecoder, COMPRESSION_BLOCK_SIZE, VIntDecoder, compressed_block_size, AlignedBuffer};
use crate::schema::IndexRecordOption;
use crate::common::{VInt, BinarySerializable};

fn split_into_skips_and_postings(
    doc_freq: u32,
    mut data: OwnedRead,
) -> (Option<OwnedRead>, OwnedRead) {
    if doc_freq >= USE_SKIP_INFO_LIMIT {
        let skip_len = VInt::deserialize(&mut data).expect("Data corrupted").0 as usize;
        let mut postings_data = data.clone();
        postings_data.advance(skip_len);
        data.clip(skip_len);
        (Some(data), postings_data)
    } else {
        (None, data)
    }
}
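The helper above implies the on-disk layout of a long posting list. A comment sketch of what the two `OwnedRead`s end up pointing at:

```rust
// Layout decoded by `split_into_skips_and_postings` when
// doc_freq >= USE_SKIP_INFO_LIMIT:
//
//   [VInt: skip_len] [skip info: skip_len bytes] [postings data ...]
//
// `data.clip(skip_len)` keeps only the skip section in `data`, while the
// clone advanced past it (`postings_data`) starts at the postings proper.
// Short lists omit the header and store postings only.
```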

/// `BlockSegmentPostings` is a cursor iterating over blocks
/// of documents.
///
/// # Warning
///
/// While it is useful for some very specific high-performance
/// use cases, you should prefer using `SegmentPostings` for most usage.
pub struct BlockSegmentPostings {
    doc_decoder: BlockDecoder,
    freq_decoder: BlockDecoder,
    freq_reading_option: FreqReadingOption,

    doc_freq: usize,
    doc_offset: DocId,

    num_vint_docs: usize,

    remaining_data: OwnedRead,
    skip_reader: SkipReader,
}

#[derive(Debug, Eq, PartialEq)]
pub enum BlockSegmentPostingsSkipResult {
    Terminated,
    Success(u32), //< number of term freqs to skip
}

impl BlockSegmentPostings {
    pub(crate) fn from_data(
        doc_freq: u32,
        data: OwnedRead,
        record_option: IndexRecordOption,
        requested_option: IndexRecordOption,
    ) -> BlockSegmentPostings {
        let freq_reading_option = match (record_option, requested_option) {
            (IndexRecordOption::Basic, _) => FreqReadingOption::NoFreq,
            (_, IndexRecordOption::Basic) => FreqReadingOption::SkipFreq,
            (_, _) => FreqReadingOption::ReadFreq,
        };

        let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, data);
        let skip_reader = match skip_data_opt {
            Some(skip_data) => SkipReader::new(skip_data, record_option),
            None => SkipReader::new(OwnedRead::new(&[][..]), record_option),
        };
        let doc_freq = doc_freq as usize;
        let num_vint_docs = doc_freq % COMPRESSION_BLOCK_SIZE;
        BlockSegmentPostings {
            num_vint_docs,
            doc_decoder: BlockDecoder::new(),
            freq_decoder: BlockDecoder::with_val(1),
            freq_reading_option,
            doc_offset: 0,
            doc_freq,
            remaining_data: postings_data,
            skip_reader,
        }
    }

    // Resets the block segment postings on another position
    // in the postings file.
    //
    // This is useful for enumerating through a list of terms,
    // and consuming the associated posting lists while avoiding
    // reallocating a `BlockSegmentPostings`.
    //
    // # Warning
    //
    // This does not reset the positions list.
    pub(crate) fn reset(&mut self, doc_freq: u32, postings_data: OwnedRead) {
        let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, postings_data);
        let num_vint_docs = (doc_freq as usize) & (COMPRESSION_BLOCK_SIZE - 1);
        self.num_vint_docs = num_vint_docs;
        self.remaining_data = postings_data;
        if let Some(skip_data) = skip_data_opt {
            self.skip_reader.reset(skip_data);
        } else {
            self.skip_reader.reset(OwnedRead::new(&[][..]))
        }
        self.doc_offset = 0;
        self.doc_freq = doc_freq as usize;
    }

    /// Returns the document frequency associated with this block postings.
    ///
    /// This `doc_freq` is simply the sum of the lengths of all of the blocks,
    /// and it does not take into account deleted documents.
    pub fn doc_freq(&self) -> usize {
        self.doc_freq
    }

    /// Returns the array of docs in the current block.
    ///
    /// Before the first call to `.advance()`, the block
    /// returned by `.docs()` is empty.
    #[inline]
    pub fn docs(&self) -> &[DocId] {
        self.doc_decoder.output_array()
    }

    pub(crate) fn docs_aligned(&self) -> (&AlignedBuffer, usize) {
        self.doc_decoder.output_aligned()
    }

    /// Return the document at index `idx` of the block.
    #[inline]
    pub fn doc(&self, idx: usize) -> u32 {
        self.doc_decoder.output(idx)
    }

    /// Return the array of `term freq` in the block.
    #[inline]
    pub fn freqs(&self) -> &[u32] {
        self.freq_decoder.output_array()
    }

    /// Return the frequency at index `idx` of the block.
    #[inline]
    pub fn freq(&self, idx: usize) -> u32 {
        self.freq_decoder.output(idx)
    }

    /// Returns the length of the current block.
    ///
    /// All blocks have a length of `NUM_DOCS_PER_BLOCK`,
    /// except the last block that may have a length
    /// of any number between 1 and `NUM_DOCS_PER_BLOCK - 1`
    #[inline]
    pub(crate) fn block_len(&self) -> usize {
        self.doc_decoder.output_len
    }
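For context, `BlockDecoder` wraps `BitPacker4x` from the `bitpacking` crate. A hedged round-trip sketch of the delta + bit-packing scheme that the decode calls below undo (API names per the `bitpacking` crate, not code from this diff; sorted doc ids are delta-encoded against the previous block's last doc, then stored with a fixed number of bits per value):

```rust
use bitpacking::{BitPacker, BitPacker4x};

fn roundtrip(docs: &[u32; BitPacker4x::BLOCK_LEN], offset: u32) {
    let bitpacker = BitPacker4x::new();
    // Width needed for the deltas relative to `offset` (the previous block's last doc).
    let num_bits = bitpacker.num_bits_sorted(offset, docs);
    let mut compressed = vec![0u8; BitPacker4x::BLOCK_LEN * 4];
    let written = bitpacker.compress_sorted(offset, docs, &mut compressed, num_bits);
    let mut decompressed = [0u32; BitPacker4x::BLOCK_LEN];
    bitpacker.decompress_sorted(offset, &compressed[..written], &mut decompressed, num_bits);
    assert_eq!(&decompressed[..], &docs[..]);
}
```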

    /// Position on a block that may contain `doc_id`.
    /// Always advances the current block.
    ///
    /// Returns `Success` if a block that has an element greater or equal to the target is found.
    /// `Success` does not guarantee that the smallest element of the block is smaller
    /// than the target. It only guarantees that the last element is greater or equal.
    ///
    /// Returns `Terminated` iff all of the remaining documents are smaller than
    /// `doc_id`. In that case, all of these documents are consumed.
    pub fn skip_to(&mut self, target_doc: DocId) -> BlockSegmentPostingsSkipResult {
        let mut skip_freqs = 0u32;
        while self.skip_reader.advance() {
            if self.skip_reader.doc() >= target_doc {
                // the last document of the current block is larger
                // than the target.
                //
                // We found our block!
                let num_bits = self.skip_reader.doc_num_bits();
                let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted(
                    self.remaining_data.as_ref(),
                    self.doc_offset,
                    num_bits,
                );
                self.remaining_data.advance(num_consumed_bytes);
                let tf_num_bits = self.skip_reader.tf_num_bits();
                match self.freq_reading_option {
                    FreqReadingOption::NoFreq => {}
                    FreqReadingOption::SkipFreq => {
                        let num_bytes_to_skip = compressed_block_size(tf_num_bits);
                        self.remaining_data.advance(num_bytes_to_skip);
                    }
                    FreqReadingOption::ReadFreq => {
                        let num_consumed_bytes = self
                            .freq_decoder
                            .uncompress_block_unsorted(self.remaining_data.as_ref(), tf_num_bits);
                        self.remaining_data.advance(num_consumed_bytes);
                    }
                }
                self.doc_offset = self.skip_reader.doc();
                return BlockSegmentPostingsSkipResult::Success(skip_freqs);
            } else {
                skip_freqs += self.skip_reader.tf_sum();
                let advance_len = self.skip_reader.total_block_len();
                self.doc_offset = self.skip_reader.doc();
                self.remaining_data.advance(advance_len);
            }
        }

        // we are now on the last, incomplete, variable encoded block.
        if self.num_vint_docs > 0 {
            let num_compressed_bytes = self.doc_decoder.uncompress_vint_sorted(
                self.remaining_data.as_ref(),
                self.doc_offset,
                self.num_vint_docs,
            );
            self.remaining_data.advance(num_compressed_bytes);
            match self.freq_reading_option {
                FreqReadingOption::NoFreq | FreqReadingOption::SkipFreq => {}
                FreqReadingOption::ReadFreq => {
                    self.freq_decoder
                        .uncompress_vint_unsorted(self.remaining_data.as_ref(), self.num_vint_docs);
                }
            }
            self.num_vint_docs = 0;
            return self
                .docs()
                .last()
                .map(|last_doc| {
                    if *last_doc >= target_doc {
                        BlockSegmentPostingsSkipResult::Success(skip_freqs)
                    } else {
                        BlockSegmentPostingsSkipResult::Terminated
                    }
                })
                .unwrap_or(BlockSegmentPostingsSkipResult::Terminated);
        }
        BlockSegmentPostingsSkipResult::Terminated
    }

    /// Advance to the next block.
    ///
    /// Returns false iff there are no remaining blocks.
    pub fn advance(&mut self) -> bool {
        if self.skip_reader.advance() {
            let num_bits = self.skip_reader.doc_num_bits();
            let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted(
                self.remaining_data.as_ref(),
                self.doc_offset,
                num_bits,
            );
            self.remaining_data.advance(num_consumed_bytes);
            let tf_num_bits = self.skip_reader.tf_num_bits();
            match self.freq_reading_option {
                FreqReadingOption::NoFreq => {}
                FreqReadingOption::SkipFreq => {
                    let num_bytes_to_skip = compressed_block_size(tf_num_bits);
                    self.remaining_data.advance(num_bytes_to_skip);
                }
                FreqReadingOption::ReadFreq => {
                    let num_consumed_bytes = self
                        .freq_decoder
                        .uncompress_block_unsorted(self.remaining_data.as_ref(), tf_num_bits);
                    self.remaining_data.advance(num_consumed_bytes);
                }
            }
            // it will be used as the next offset.
            self.doc_offset = self.doc_decoder.output(COMPRESSION_BLOCK_SIZE - 1);
            true
        } else if self.num_vint_docs > 0 {
            let num_compressed_bytes = self.doc_decoder.uncompress_vint_sorted(
                self.remaining_data.as_ref(),
                self.doc_offset,
                self.num_vint_docs,
            );
            self.remaining_data.advance(num_compressed_bytes);
            match self.freq_reading_option {
                FreqReadingOption::NoFreq | FreqReadingOption::SkipFreq => {}
                FreqReadingOption::ReadFreq => {
                    self.freq_decoder
                        .uncompress_vint_unsorted(self.remaining_data.as_ref(), self.num_vint_docs);
                }
            }
            self.num_vint_docs = 0;
            true
        } else {
            false
        }
    }

    /// Returns an empty segment postings object
    pub fn empty() -> BlockSegmentPostings {
        BlockSegmentPostings {
            num_vint_docs: 0,

            doc_decoder: BlockDecoder::new(),
            freq_decoder: BlockDecoder::with_val(1),
            freq_reading_option: FreqReadingOption::NoFreq,

            doc_offset: 0,
            doc_freq: 0,

            remaining_data: OwnedRead::new(vec![]),
            skip_reader: SkipReader::new(OwnedRead::new(vec![]), IndexRecordOption::Basic),
        }
    }
}

impl<'a> Streamer<'a> for BlockSegmentPostings {
    type Item = &'a [DocId];

    fn next(&'a mut self) -> Option<&'a [DocId]> {
        if self.advance() {
            Some(self.docs())
        } else {
            None
        }
    }
}
@@ -46,7 +46,7 @@ impl BlockEncoder {
/// We ensure that the OutputBuffer is aligned on 128 bits
/// in order to run SSE2 linear search on it.
#[repr(align(128))]
-pub(crate) struct AlignedBuffer(pub [u32; COMPRESSION_BLOCK_SIZE]);
+pub struct AlignedBuffer(pub [u32; COMPRESSION_BLOCK_SIZE]);

pub struct BlockDecoder {
    bitpacker: BitPacker4x,
@@ -67,6 +67,18 @@ impl BlockDecoder {
        }
    }

+    pub fn uncompress_vint_sorted_b<'a>(
+        &mut self,
+        compressed_data: &'a [u8],
+        offset: u32,
+        num_els: usize,
+    ) {
+        if num_els > 0 {
+            vint::uncompress_sorted(compressed_data, &mut self.output.0[..num_els], offset);
+        }
+        self.output.0[num_els..].iter_mut().for_each(|val| *val = 2_000_000_000u32);
+    }

@@ -94,6 +106,11 @@ impl BlockDecoder {
        (&self.output, self.output_len)
    }

+    #[inline]
+    pub(crate) fn output_aligned_2(&self) -> &AlignedBuffer {
+        &self.output
+    }

    #[inline]
    pub fn output(&self, idx: usize) -> u32 {
        self.output.0[idx]
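Note the sentinel trick in `uncompress_vint_sorted_b` above: the unused tail of the 128-slot buffer is padded with `2_000_000_000` (the `MAX_DOC` constant from the experiment in src/lib.rs). This keeps the branch-free SSE2 search and the intersection loop correct on the final, partial block: every pad slot compares greater-or-equal to any target, and an exhausted list eventually surfaces `MAX_DOC` as the candidate, which terminates the loop.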
@@ -2,10 +2,8 @@
Postings module (also called inverted index)
*/

-mod block_max_postings;
-mod block_max_segment_postings;
mod block_search;
-pub(crate) mod compression;
+pub mod compression;
/// Postings module
///
/// Postings, also called inverted lists, is the key datastructure
@@ -14,7 +12,6 @@ mod postings;
mod postings_writer;
mod recorder;
mod segment_postings;
-mod block_segment_postings;
mod serializer;
mod skip;
mod stacker;
@@ -30,11 +27,7 @@ pub use self::postings::Postings;
pub(crate) use self::skip::SkipReader;
pub use self::term_info::TermInfo;

-pub use self::segment_postings::SegmentPostings;
-
-pub use self::block_segment_postings::BlockSegmentPostings;
-pub use self::block_max_postings::BlockMaxPostings;
-pub use self::block_max_segment_postings::BlockMaxSegmentPostings;
+pub use self::segment_postings::{BlockSegmentPostings, SegmentPostings};

pub(crate) use self::stacker::compute_table_size;
@@ -1,16 +1,21 @@
use crate::common::BitSet;
use crate::common::HasLen;
use crate::common::{BinarySerializable, VInt};
use crate::docset::{DocSet, SkipResult};
use crate::positions::PositionReader;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::postings::compression::{compressed_block_size, AlignedBuffer};
use crate::postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
use crate::postings::serializer::PostingsSerializer;
-use crate::postings::{BlockSearcher, BlockSegmentPostings};
+use crate::postings::BlockSearcher;
use crate::postings::FreqReadingOption;
use crate::postings::Postings;
use crate::postings::SkipReader;
use crate::postings::USE_SKIP_INFO_LIMIT;
use crate::schema::IndexRecordOption;
use crate::DocId;
use owned_read::OwnedRead;
use std::cmp::Ordering;
use crate::postings::block_segment_postings::BlockSegmentPostingsSkipResult;
use tantivy_fst::Streamer;

struct PositionComputer {
    // store the amount of position int
@@ -294,6 +299,354 @@ impl Postings for SegmentPostings {
|
||||
}
|
||||
}
|
||||
|
||||
/// `BlockSegmentPostings` is a cursor iterating over blocks
|
||||
/// of documents.
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// While it is useful for some very specific high-performance
|
||||
/// use cases, you should prefer using `SegmentPostings` for most usage.
|
||||
pub struct BlockSegmentPostings {
|
||||
doc_decoder: BlockDecoder,
|
||||
freq_decoder: BlockDecoder,
|
||||
freq_reading_option: FreqReadingOption,
|
||||
|
||||
doc_freq: usize,
|
||||
doc_offset: DocId,
|
||||
|
||||
num_vint_docs: usize,
|
||||
|
||||
remaining_data: OwnedRead,
|
||||
pub skip_reader: SkipReader,
|
||||
}
|
||||
|
||||
fn split_into_skips_and_postings(
|
||||
doc_freq: u32,
|
||||
mut data: OwnedRead,
|
||||
) -> (Option<OwnedRead>, OwnedRead) {
|
||||
if doc_freq >= USE_SKIP_INFO_LIMIT {
|
||||
let skip_len = VInt::deserialize(&mut data).expect("Data corrupted").0 as usize;
|
||||
let mut postings_data = data.clone();
|
||||
postings_data.advance(skip_len);
|
||||
data.clip(skip_len);
|
||||
(Some(data), postings_data)
|
||||
} else {
|
||||
(None, data)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Eq, PartialEq)]
|
||||
pub enum BlockSegmentPostingsSkipResult {
|
||||
Terminated,
|
||||
Success(u32), //< number of term freqs to skip
|
||||
}
|
||||
|
||||
impl BlockSegmentPostings {
|
||||
pub(crate) fn from_data(
|
||||
doc_freq: u32,
|
||||
data: OwnedRead,
|
||||
record_option: IndexRecordOption,
|
||||
requested_option: IndexRecordOption,
|
||||
) -> BlockSegmentPostings {
|
||||
let freq_reading_option = match (record_option, requested_option) {
|
||||
(IndexRecordOption::Basic, _) => FreqReadingOption::NoFreq,
|
||||
(_, IndexRecordOption::Basic) => FreqReadingOption::SkipFreq,
|
||||
(_, _) => FreqReadingOption::ReadFreq,
|
||||
};
|
||||
|
||||
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, data);
|
||||
let skip_reader = match skip_data_opt {
|
||||
Some(skip_data) => SkipReader::new(skip_data, record_option),
|
||||
None => SkipReader::new(OwnedRead::new(&[][..]), record_option),
|
||||
};
|
||||
let doc_freq = doc_freq as usize;
|
||||
let num_vint_docs = doc_freq % COMPRESSION_BLOCK_SIZE;
|
||||
BlockSegmentPostings {
|
||||
num_vint_docs,
|
||||
doc_decoder: BlockDecoder::new(),
|
||||
freq_decoder: BlockDecoder::with_val(1),
|
||||
freq_reading_option,
|
||||
doc_offset: 0,
|
||||
doc_freq,
|
||||
remaining_data: postings_data,
|
||||
skip_reader,
|
||||
}
|
||||
}
|
||||
|
||||
// Resets the block segment postings on another position
|
||||
// in the postings file.
|
||||
//
|
||||
// This is useful for enumerating through a list of terms,
|
||||
// and consuming the associated posting lists while avoiding
|
||||
// reallocating a `BlockSegmentPostings`.
|
||||
//
|
||||
// # Warning
|
||||
//
|
||||
// This does not reset the positions list.
|
||||
pub(crate) fn reset(&mut self, doc_freq: u32, postings_data: OwnedRead) {
|
||||
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, postings_data);
|
||||
let num_vint_docs = (doc_freq as usize) & (COMPRESSION_BLOCK_SIZE - 1);
|
||||
self.num_vint_docs = num_vint_docs;
|
||||
self.remaining_data = postings_data;
|
||||
if let Some(skip_data) = skip_data_opt {
|
||||
self.skip_reader.reset(skip_data);
|
||||
} else {
|
||||
self.skip_reader.reset(OwnedRead::new(&[][..]))
|
||||
}
|
||||
self.doc_offset = 0;
|
||||
self.doc_freq = doc_freq as usize;
|
||||
}
|
||||
|
||||
/// Returns the document frequency associated to this block postings.
|
||||
///
|
||||
/// This `doc_freq` is simply the sum of the length of all of the blocks
|
||||
/// length, and it does not take in account deleted documents.
|
||||
pub fn doc_freq(&self) -> usize {
|
||||
self.doc_freq
|
||||
}
|
||||
|
||||
/// Returns the array of docs in the current block.
|
||||
///
|
||||
/// Before the first call to `.advance()`, the block
|
||||
/// returned by `.docs()` is empty.
|
||||
#[inline]
|
||||
pub fn docs(&self) -> &[DocId] {
|
||||
self.doc_decoder.output_array()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn docs_aligned_b(&self) -> &AlignedBuffer {
|
||||
self.doc_decoder.output_aligned_2()
|
||||
}
|
||||
|
||||
pub fn docs_aligned(&self) -> (&AlignedBuffer, usize) {
|
||||
self.doc_decoder.output_aligned()
|
||||
}
|
||||
|
||||
/// Return the document at index `idx` of the block.
|
||||
#[inline]
|
||||
pub fn doc(&self, idx: usize) -> u32 {
|
||||
self.doc_decoder.output(idx)
|
||||
}
|
||||
|
||||
/// Return the array of `term freq` in the block.
|
||||
#[inline]
|
||||
pub fn freqs(&self) -> &[u32] {
|
||||
self.freq_decoder.output_array()
|
||||
}
|
||||
|
||||
/// Return the frequency at index `idx` of the block.
|
||||
#[inline]
|
||||
pub fn freq(&self, idx: usize) -> u32 {
|
||||
self.freq_decoder.output(idx)
|
||||
}
|
||||
|
||||
/// Returns the length of the current block.
|
||||
///
|
||||
/// All blocks have a length of `NUM_DOCS_PER_BLOCK`,
|
||||
/// except the last block that may have a length
|
||||
/// of any number between 1 and `NUM_DOCS_PER_BLOCK - 1`
|
||||
#[inline]
|
||||
fn block_len(&self) -> usize {
|
||||
self.doc_decoder.output_len
|
||||
}
|
||||
|
||||
/// position on a block that may contains `doc_id`.
|
||||
/// Always advance the current block.
|
||||
///
|
||||
/// Returns true if a block that has an element greater or equal to the target is found.
|
||||
/// Returning true does not guarantee that the smallest element of the block is smaller
|
||||
/// than the target. It only guarantees that the last element is greater or equal.
|
||||
///
|
||||
/// Returns false iff all of the document remaining are smaller than
|
||||
/// `doc_id`. In that case, all of these document are consumed.
|
||||
///
|
||||
pub fn skip_to(&mut self, target_doc: DocId) -> BlockSegmentPostingsSkipResult {
|
||||
let mut skip_freqs = 0u32;
|
||||
while self.skip_reader.advance() {
|
||||
if self.skip_reader.doc() >= target_doc {
|
||||
// the last document of the current block is larger
|
||||
// than the target.
|
||||
//
|
||||
// We found our block!
|
||||
let num_bits = self.skip_reader.doc_num_bits();
|
||||
let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted(
|
||||
self.remaining_data.as_ref(),
|
||||
self.doc_offset,
|
||||
num_bits,
|
||||
);
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
let tf_num_bits = self.skip_reader.tf_num_bits();
|
||||
match self.freq_reading_option {
|
||||
FreqReadingOption::NoFreq => {}
|
||||
FreqReadingOption::SkipFreq => {
|
||||
let num_bytes_to_skip = compressed_block_size(tf_num_bits);
|
||||
self.remaining_data.advance(num_bytes_to_skip);
|
||||
}
|
||||
FreqReadingOption::ReadFreq => {
|
||||
let num_consumed_bytes = self
|
||||
.freq_decoder
|
||||
.uncompress_block_unsorted(self.remaining_data.as_ref(), tf_num_bits);
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
}
|
||||
}
|
||||
self.doc_offset = self.skip_reader.doc();
|
||||
unsafe { core::arch::x86_64::_mm_prefetch(self.remaining_data.as_ref() as *mut i8, _MM_HINT_T1) };
|
||||
return BlockSegmentPostingsSkipResult::Success(skip_freqs);
|
||||
} else {
|
||||
skip_freqs += self.skip_reader.tf_sum();
|
||||
let advance_len = self.skip_reader.total_block_len();
|
||||
self.doc_offset = self.skip_reader.doc();
|
||||
self.remaining_data.advance(advance_len);
|
||||
}
|
||||
}
|
||||
|
||||
// we are now on the last, incomplete, variable encoded block.
|
||||
if self.num_vint_docs > 0 {
|
||||
let num_compressed_bytes = self.doc_decoder.uncompress_vint_sorted(
|
||||
self.remaining_data.as_ref(),
|
||||
self.doc_offset,
|
||||
self.num_vint_docs,
|
||||
);
|
||||
self.remaining_data.advance(num_compressed_bytes);
|
||||
match self.freq_reading_option {
|
||||
FreqReadingOption::NoFreq | FreqReadingOption::SkipFreq => {}
|
||||
FreqReadingOption::ReadFreq => {
|
||||
self.freq_decoder
|
||||
.uncompress_vint_unsorted(self.remaining_data.as_ref(), self.num_vint_docs);
|
||||
}
|
||||
}
|
||||
self.num_vint_docs = 0;
|
||||
return self
|
||||
.docs()
|
||||
.last()
|
||||
.map(|last_doc| {
|
||||
if *last_doc >= target_doc {
|
||||
BlockSegmentPostingsSkipResult::Success(skip_freqs)
|
||||
} else {
|
||||
BlockSegmentPostingsSkipResult::Terminated
|
||||
}
|
||||
})
|
||||
.unwrap_or(BlockSegmentPostingsSkipResult::Terminated);
|
||||
}
|
||||
BlockSegmentPostingsSkipResult::Terminated
|
||||
}
|
||||
|
||||
/// Ensure we are located on the right block.
|
||||
#[inline(never)]
|
||||
pub fn skip_to_b(&mut self, target_doc: DocId) {
|
||||
while self.skip_reader.advance() {
|
||||
if self.skip_reader.doc() >= target_doc {
|
||||
// the last document of the current block is larger
|
||||
// than the target.
|
||||
//
|
||||
// We found our block!
|
||||
let num_bits = self.skip_reader.doc_num_bits();
|
||||
let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted(
|
||||
self.remaining_data.as_ref(),
|
||||
self.doc_offset,
|
||||
num_bits,
|
||||
);
|
||||
let tf_num_bits = self.skip_reader.tf_num_bits();
|
||||
let num_bytes_to_skip = compressed_block_size(tf_num_bits);
|
||||
self.remaining_data.advance(num_consumed_bytes + num_bytes_to_skip);
|
||||
self.doc_offset = self.skip_reader.doc();
|
||||
return;
|
||||
} else {
|
||||
let advance_len = self.skip_reader.total_block_len();
|
||||
self.doc_offset = self.skip_reader.doc();
|
||||
self.remaining_data.advance(advance_len);
|
||||
}
|
||||
}
|
||||
|
||||
// we are now on the last, incomplete, variable encoded block.
|
||||
self.doc_decoder.uncompress_vint_sorted_b(
|
||||
self.remaining_data.as_ref(),
|
||||
self.doc_offset,
|
||||
self.num_vint_docs
|
||||
);
|
||||
}

/// Advance to the next block.
///
/// Returns `false` iff there are no remaining blocks.
pub fn advance(&mut self) -> bool {
    if self.skip_reader.advance() {
        let num_bits = self.skip_reader.doc_num_bits();
        let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted(
            self.remaining_data.as_ref(),
            self.doc_offset,
            num_bits,
        );
        self.remaining_data.advance(num_consumed_bytes);
        let tf_num_bits = self.skip_reader.tf_num_bits();
        match self.freq_reading_option {
            FreqReadingOption::NoFreq => {}
            FreqReadingOption::SkipFreq => {
                let num_bytes_to_skip = compressed_block_size(tf_num_bits);
                self.remaining_data.advance(num_bytes_to_skip);
            }
            FreqReadingOption::ReadFreq => {
                let num_consumed_bytes = self
                    .freq_decoder
                    .uncompress_block_unsorted(self.remaining_data.as_ref(), tf_num_bits);
                self.remaining_data.advance(num_consumed_bytes);
            }
        }
        // The last doc of this block will be used as the next offset.
        self.doc_offset = self.doc_decoder.output(COMPRESSION_BLOCK_SIZE - 1);
        true
    } else if self.num_vint_docs > 0 {
        let num_compressed_bytes = self.doc_decoder.uncompress_vint_sorted(
            self.remaining_data.as_ref(),
            self.doc_offset,
            self.num_vint_docs,
        );
        self.remaining_data.advance(num_compressed_bytes);
        match self.freq_reading_option {
            FreqReadingOption::NoFreq | FreqReadingOption::SkipFreq => {}
            FreqReadingOption::ReadFreq => {
                self.freq_decoder
                    .uncompress_vint_unsorted(self.remaining_data.as_ref(), self.num_vint_docs);
            }
        }
        self.num_vint_docs = 0;
        true
    } else {
        false
    }
}

/// Returns an empty segment postings object.
pub fn empty() -> BlockSegmentPostings {
    BlockSegmentPostings {
        num_vint_docs: 0,

        doc_decoder: BlockDecoder::new(),
        freq_decoder: BlockDecoder::with_val(1),
        freq_reading_option: FreqReadingOption::NoFreq,

        doc_offset: 0,
        doc_freq: 0,

        remaining_data: OwnedRead::new(vec![]),
        skip_reader: SkipReader::new(OwnedRead::new(vec![]), IndexRecordOption::Basic),
    }
}
}

impl<'b> Streamer<'b> for BlockSegmentPostings {
    type Item = &'b [DocId];

    fn next(&'b mut self) -> Option<&'b [DocId]> {
        if self.advance() {
            Some(self.docs())
        } else {
            None
        }
    }
}
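
// A minimal usage sketch of the `Streamer` impl above (illustrative, not part
// of the original file); it assumes only the `next()` method defined right before.
fn count_docs_in_all_blocks(postings: &mut BlockSegmentPostings) -> usize {
    let mut num_docs = 0;
    // `next()` decodes the following block and yields its doc ids, returning
    // `None` once the final variable-int encoded block has been consumed.
    while let Some(block) = postings.next() {
        num_docs += block.len();
    }
    num_docs
}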

#[cfg(test)]
mod tests {
    use super::BlockSegmentPostings;

@@ -49,7 +49,7 @@ impl SkipSerializer {
    }
}

pub(crate) struct SkipReader {
pub struct SkipReader {
    doc: DocId,
    owned_read: OwnedRead,
    doc_num_bits: u8,

@@ -1,4 +1,6 @@
use murmurhash32::murmurhash2;
use murmurhash32;

use self::murmurhash32::murmurhash2;

use super::{Addr, MemoryArena};
use crate::postings::stacker::memory_arena::store;

@@ -1,31 +0,0 @@
use crate::docset::DocSet;
use crate::DocId;
use crate::Score;
use downcast_rs::impl_downcast;
use crate::query::Scorer;

/// A set of documents matching a query within a specific segment
/// and having a maximum score within certain blocks.
///
/// See [`Query`](./trait.Query.html) and [`Scorer`](./trait.Scorer.html).
pub trait BlockMaxScorer: downcast_rs::Downcast + DocSet + Scorer + 'static {
    /// Returns the maximum score within the current block.
    ///
    /// The blocks are defined when indexing. For example, blocks can
    /// have a specific number of postings each, or can be optimized for
    /// retrieval speed. Read more in
    /// [Faster BlockMax WAND with Variable-sized Blocks][vbmw].
    ///
    /// This method will perform a bit of computation and is not cached.
    ///
    /// [vbmw]: https://dl.acm.org/doi/abs/10.1145/3077136.3080780
    fn block_max_score(&mut self) -> Score;

    /// Returns the last document in the current block.
    fn block_max_doc(&mut self) -> DocId;

    /// Returns the maximum possible score within the entire document set.
    fn max_score(&self) -> Score;
}

impl_downcast!(BlockMaxScorer);
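
// Illustrative sketch of the pruning decision this trait enables: given a
// score threshold (e.g. the lowest entry of a top-k heap), a caller may skip
// a whole block whenever its precomputed upper bound cannot beat the
// threshold. The helper below is hypothetical, not part of the trait.
fn can_skip_current_block<S: BlockMaxScorer>(scorer: &mut S, threshold: Score) -> bool {
    // `block_max_score()` bounds every score up to `block_max_doc()`.
    scorer.block_max_score() <= threshold
}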

@@ -1,613 +0,0 @@
use crate::docset::{DocSet, SkipResult};
use crate::query::score_combiner::ScoreCombiner;
use crate::query::{BlockMaxScorer, Scorer};
use crate::DocId;
use crate::Score;
use crate::query::weight::PruningScorer;

#[derive(Debug, Copy, Clone, PartialEq, Eq)]
struct Pivot {
    position: usize,
    first_occurrence: usize,
    doc: DocId,
}

/// Find the position in the sorted list of posting lists of the **pivot**.
///
/// The docsets need to be advanced, and are required to be sorted by the doc they point to.
///
/// The pivot is then defined as the lowest DocId that has a chance of matching our condition.
fn find_pivot_position<'a, TScorer>(
    mut docsets: impl Iterator<Item = &'a TScorer>,
    lower_bound_score: Score,
) -> Option<Pivot>
where
    TScorer: BlockMaxScorer,
{
    let mut position = 0;
    let mut upper_bound = Score::default();
    while let Some(docset) = docsets.next() {
        upper_bound += docset.max_score();
        if lower_bound_score < upper_bound {
            let pivot_doc = docset.doc();
            let first_occurrence = position;
            while let Some(docset) = docsets.next() {
                if docset.doc() != pivot_doc {
                    break;
                } else {
                    position += 1;
                }
            }
            return Some(Pivot {
                position,
                doc: pivot_doc,
                first_occurrence,
            });
        }
        position += 1;
    }
    None
}
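
// Worked example (mirroring `test_find_pivot_position` below): with per-list
// max scores 2, 3, 4, 5, 6 on current docs 0, 1, 2, 3, 3 and a lower bound of
// 5.0, the running upper bound is 2, 5, 9, ...; it first exceeds 5.0 on the
// third list, so the pivot is doc 2 at position 2. No later list is positioned
// on doc 2, hence `first_occurrence == position`.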

/// Given an iterator over all ordered lists up to the pivot (inclusive) and the following list
/// (if it exists), it returns the next document ID that can possibly be relevant, based on the
/// block max scores.
fn find_next_relevant_doc<T, TScorer>(
    docsets_up_to_pivot: &mut [T],
    pivot_docset: &mut T,
    docset_after_pivot: Option<&mut T>,
) -> DocId
where
    T: AsMut<TScorer>,
    TScorer: BlockMaxScorer + Scorer,
{
    let pivot_docset = pivot_docset.as_mut();
    let mut next_doc = 1 + docsets_up_to_pivot
        .iter_mut()
        .map(|docset| docset.as_mut().block_max_doc())
        .chain(std::iter::once(pivot_docset.block_max_doc()))
        .min()
        .unwrap();
    if let Some(docset) = docset_after_pivot {
        let doc = docset.as_mut().doc();
        if doc < next_doc {
            next_doc = doc;
        }
    }
    if next_doc <= pivot_docset.doc() {
        pivot_docset.doc() + 1
    } else {
        next_doc
    }
}
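
// Worked example (mirroring `test_find_next_relevant_doc_before_pivot` below):
// the lists up to the pivot sit on docs 0 and 1 with block-max docs 3 and 4,
// the pivot list sits on doc 2 with block-max doc 6, and the list after the
// pivot sits on doc 6. The smallest block-max doc is 3, so the candidate is
// 3 + 1 = 4; the doc after the pivot (6) does not lower it, and 4 > 2, so the
// next potentially relevant doc is 4.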

/// Sifts down the first element of the slice.
///
/// `docsets[1..]` are assumed sorted.
/// This function swaps `docsets[0]` with its right
/// neighbor successively (bubble-sort style) until it reaches the first
/// position such that `docsets` is sorted.
fn sift_down<T, TScorer>(docsets: &mut [T])
where
    T: AsRef<TScorer>,
    TScorer: BlockMaxScorer + Scorer,
{
    for idx in 1..docsets.len() {
        if docsets[idx].as_ref().doc() >= docsets[idx - 1].as_ref().doc() {
            return;
        }
        docsets.swap(idx, idx - 1);
    }
}
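
// Worked example (mirroring `test_sift_down_sift_one_down` below): applied to
// the suffix [6, 5, 7] of current docs [0, 1, 6, 5, 7], the leading 6 is
// swapped once with the smaller 5, then the pass stops because 7 >= 6,
// leaving [0, 1, 5, 6, 7].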

/// Creates a `DocSet` that iterates through the union of two or more `DocSet`s,
/// applying [BlockMaxWand] dynamic pruning.
///
/// [BlockMaxWand]: https://dl.acm.org/doi/10.1145/2009916.2010048
pub struct BlockMaxWand<TScorer, TScoreCombiner> {
    docsets: Vec<Box<TScorer>>,
    doc: DocId,
    score: Score,
    combiner: TScoreCombiner,
}

impl<TScorer, TScoreCombiner> BlockMaxWand<TScorer, TScoreCombiner>
where
    TScoreCombiner: ScoreCombiner,
    TScorer: BlockMaxScorer + Scorer,
{
    fn new(
        docsets: Vec<TScorer>,
        combiner: TScoreCombiner,
    ) -> BlockMaxWand<TScorer, TScoreCombiner> {
        let mut non_empty_docsets: Vec<_> = docsets
            .into_iter()
            .flat_map(|mut docset| {
                if docset.advance() {
                    Some(Box::new(docset))
                } else {
                    None
                }
            })
            .collect();
        non_empty_docsets.sort_by_key(Box::<TScorer>::doc);
        BlockMaxWand {
            docsets: non_empty_docsets,
            combiner,
            doc: 0u32,
            score: 0f32,
        }
    }

    /// Find the position in the sorted list of posting lists of the **pivot**.
    fn find_pivot_position(&self, lower_bound_score: Score) -> Option<Pivot> {
        find_pivot_position(
            self.docsets.iter().map(|docset| docset.as_ref()),
            lower_bound_score,
        )
    }

    fn advance_with_pivot(&mut self, pivot: Pivot, lower_bound_score: Score) -> SkipResult {
        let block_upper_bound: Score = self.docsets[..=pivot.position]
            .iter_mut()
            .map(|docset| docset.block_max_score())
            .sum();
        if block_upper_bound > lower_bound_score {
            if pivot.doc == self.docsets[0].doc() {
                // Since self.docsets is sorted by their current doc, in this branch, all
                // docsets in [0..=pivot] are positioned on pivot.doc.
                //
                // Let's compute the actual score for this doc.
                //
                // NOTE(elshize): One additional check needs to be done to improve performance:
                // update block-wise bound while accumulating score with the actual score,
                // and check each time if still above threshold.
                self.combiner.clear();
                for idx in (0..=pivot.position).rev() {
                    self.combiner.update(self.docsets[idx].as_mut());
                    if !self.docsets[idx].advance() {
                        self.docsets.swap_remove(idx);
                    }
                }
                self.score = self.combiner.score();
                self.doc = pivot.doc;
                self.docsets.sort_by_key(Box::<TScorer>::doc);
                SkipResult::Reached
            } else {
                // The subtraction does not underflow because otherwise we would go to the other
                // branch.
                //
                // `advanced_idx` is the last idx that is not positioned on the pivot yet.
                let advanced_idx = pivot.first_occurrence - 1;
                if !self.docsets[advanced_idx].advance() {
                    self.docsets.swap_remove(advanced_idx);
                }
                if self.docsets.is_empty() {
                    return SkipResult::End;
                }
                sift_down(&mut self.docsets[advanced_idx..]);
                SkipResult::OverStep
            }
        } else {
            let (up_to_pivot, pivot_and_rest) = self.docsets.split_at_mut(pivot.position);
            let (pivot, after_pivot) = pivot_and_rest.split_first_mut().unwrap();
            let next_doc = find_next_relevant_doc(up_to_pivot, pivot, after_pivot.first_mut());
            // NOTE(elshize): It might be more efficient to advance the list with the higher
            // max score, but let's advance the first one for now for simplicity.
            if self.docsets[0].skip_next(next_doc) == SkipResult::End {
                self.docsets.swap_remove(0);
            }
            if self.docsets.is_empty() {
                return SkipResult::End;
            }
            sift_down(&mut self.docsets[..]);
            SkipResult::OverStep
        }
    }
}

impl<TScorer, TScoreCombiner> PruningScorer for BlockMaxWand<TScorer, TScoreCombiner>
where
    TScoreCombiner: ScoreCombiner,
    TScorer: Scorer + BlockMaxScorer,
{
    fn doc(&self) -> DocId {
        self.doc
    }

    fn score(&self) -> Score {
        self.score
    }

    fn advance_with_pruning(&mut self, lower_bound_score: f32) -> bool {
        while let Some(pivot) = self.find_pivot_position(lower_bound_score) {
            match self.advance_with_pivot(pivot, lower_bound_score) {
                SkipResult::End => return false,
                SkipResult::Reached => return true,
                SkipResult::OverStep => {}
            }
        }
        false
    }
}
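
// Illustrative driver for the impl above (a naive sketch, not a tantivy API):
// a top-k collector raises the lower bound as its buffer fills, which is what
// lets `advance_with_pruning` skip blocks and documents.
fn collect_top_k<S: PruningScorer>(scorer: &mut S, k: usize) -> Vec<(Score, DocId)> {
    // Kept sorted ascending by score; index 0 holds the current k-th best.
    let mut best: Vec<(Score, DocId)> = Vec::new();
    let mut lower_bound = std::f32::NEG_INFINITY;
    while scorer.advance_with_pruning(lower_bound) {
        best.push((scorer.score(), scorer.doc()));
        best.sort_by(|left, right| left.0.partial_cmp(&right.0).unwrap());
        if best.len() > k {
            best.remove(0);
        }
        if best.len() == k {
            // Only docs scoring strictly above this bound can still enter.
            lower_bound = best[0].0;
        }
    }
    best
}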

#[cfg(test)]
mod tests {

    use super::*;
    use crate::common::HasLen;
    use crate::docset::DocSet;
    use crate::query::score_combiner::SumCombiner;
    use crate::query::Union;
    use crate::query::{BlockMaxScorer, Scorer};
    use crate::{DocId, Score};
    use float_cmp::approx_eq;
    use proptest::strategy::Strategy;
    use std::cmp::Ordering;
    use std::num::Wrapping;
    use crate::collector::{SegmentCollector, TopScoreSegmentCollector};

    /*
    #[derive(Debug, Clone)]
    pub struct VecDocSet {
        postings: Vec<(DocId, Score)>,
        cursor: Wrapping<usize>,
        block_max_scores: Vec<(DocId, Score)>,
        max_score: Score,
        block_size: usize,
    }

    impl VecDocSet {
        fn new(postings: Vec<(DocId, Score)>, block_size: usize) -> VecDocSet {
            let block_max_scores: Vec<(DocId, f32)> = postings
                .chunks(block_size)
                .map(|block| {
                    (
                        block.iter().last().unwrap().0,
                        block
                            .iter()
                            .map(|(_, s)| *s)
                            .fold(-f32::INFINITY, |left, right| left.max(right)),
                    )
                })
                .collect();
            let max_score = block_max_scores
                .iter()
                .copied()
                .map(|(_, s)| s)
                .fold(-f32::INFINITY, |left, right| left.max(right));
            VecDocSet {
                postings,
                cursor: Wrapping(0_usize) - Wrapping(1_usize),
                block_max_scores,
                max_score,
                block_size,
            }
        }

        /// Constructs a new set and advances it.
        fn started(postings: Vec<(DocId, Score)>, block_size: usize) -> VecDocSet {
            let mut docset = VecDocSet::new(postings, block_size);
            docset.advance();
            docset
        }
    }

    impl DocSet for VecDocSet {
        fn advance(&mut self) -> bool {
            self.cursor += Wrapping(1);
            self.postings.len() > self.cursor.0
        }

        fn doc(&self) -> DocId {
            self.postings[self.cursor.0].0
        }

        fn size_hint(&self) -> u32 {
            self.len() as u32
        }
    }

    impl HasLen for VecDocSet {
        fn len(&self) -> usize {
            self.postings.len()
        }
    }

    impl BlockMaxScorer for VecDocSet {
        fn max_score(&self) -> Score {
            self.max_score
        }
        fn block_max_score(&mut self) -> Score {
            self.block_max_scores[self.cursor.0 / self.block_size].1
        }
        fn block_max_doc(&mut self) -> DocId {
            self.block_max_scores[self.cursor.0 / self.block_size].0
        }
    }

    impl Scorer for VecDocSet {
        fn score(&mut self) -> Score {
            self.postings[self.cursor.0].1
        }
    }

    #[derive(Debug)]
    struct ComparableDoc<T, D> {
        feature: T,
        doc: D,
    }

    impl<T: PartialOrd, D: PartialOrd> PartialOrd for ComparableDoc<T, D> {
        fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
            Some(self.cmp(other))
        }
    }

    impl<T: PartialOrd, D: PartialOrd> Ord for ComparableDoc<T, D> {
        #[inline]
        fn cmp(&self, other: &Self) -> Ordering {
            // Reversed to make BinaryHeap work as a min-heap.
            let by_feature = other
                .feature
                .partial_cmp(&self.feature)
                .unwrap_or(Ordering::Equal);

            let lazy_by_doc_address =
                || self.doc.partial_cmp(&other.doc).unwrap_or(Ordering::Equal);

            // In case of a tie on the feature, we sort by ascending
            // `DocAddress` in order to ensure a stable sorting of the
            // documents.
            by_feature.then_with(lazy_by_doc_address)
        }
    }

    impl<T: PartialOrd, D: PartialOrd> PartialEq for ComparableDoc<T, D> {
        fn eq(&self, other: &Self) -> bool {
            self.cmp(other) == Ordering::Equal
        }
    }

    impl<T: PartialOrd, D: PartialOrd> Eq for ComparableDoc<T, D> {}

    fn union_vs_bmw(posting_lists: Vec<VecDocSet>) {
        let mut union = Union::<VecDocSet, SumCombiner>::from(posting_lists.clone());
        let mut top_union = TopScoreSegmentCollector::new(0, 10);
        while union.advance() {
            top_union.collect(union.doc(), union.score());
        }
        let top_bmw = TopScoreSegmentCollector::new(0, 10);
        let mut bmw = BlockMaxWand::new(posting_lists, SumCombiner::default());
        let top_docs_bmw = top_bmw.collect_scorer(&mut bmw, None);
        for ((expected_score, expected_doc), (actual_score, actual_doc)) in
            top_union.harvest().into_iter().zip(top_docs_bmw)
        {
            assert!(approx_eq!(
                f32,
                expected_score,
                actual_score,
                epsilon = 0.0001
            ));
            assert_eq!(expected_doc, actual_doc);
        }
    }

    #[test]
    fn test_bmw_0() {
        union_vs_bmw(vec![
            VecDocSet {
                postings: vec![
                    (0, 1.0),
                    (23, 1.0),
                    (28, 1.0),
                    (56, 1.0),
                    (59, 1.0),
                    (66, 1.0),
                    (93, 1.0),
                ],
                cursor: Wrapping(0_usize) - Wrapping(1_usize),
                block_max_scores: vec![(93, 1.0)],
                max_score: 1.0,
                block_size: 16,
            },
            VecDocSet {
                postings: vec![
                    (2, 1.6549665),
                    (43, 2.6958032),
                    (53, 3.5309567),
                    (71, 2.7688136),
                    (87, 3.4279852),
                    (96, 3.9028034),
                ],
                cursor: Wrapping(0_usize) - Wrapping(1_usize),
                block_max_scores: vec![(96, 3.9028034)],
                max_score: 3.9028034,
                block_size: 16,
            },
        ])
    }

    #[test]
    fn test_bmw_1() {
        union_vs_bmw(vec![
            VecDocSet {
                postings: vec![(73, 1.0), (82, 1.0)],
                cursor: Wrapping(0_usize) - Wrapping(1_usize),
                block_max_scores: vec![(82, 1.0)],
                max_score: 1.0,
                block_size: 16,
            },
            VecDocSet {
                postings: vec![
                    (21, 3.582513),
                    (23, 1.6928024),
                    (27, 3.887647),
                    (42, 1.5469292),
                    (61, 1.7317574),
                    (62, 1.2968783),
                    (82, 2.4040694),
                    (85, 3.1487892),
                ],
                cursor: Wrapping(0_usize) - Wrapping(1_usize),
                block_max_scores: vec![(85, 3.887647)],
                max_score: 3.887647,
                block_size: 16,
            },
        ])
    }

    proptest::proptest! {
        #[test]
        fn test_union_vs_bmw(postings in proptest::collection::vec(
            proptest::collection::vec(0_u32..100, 1..10)
                .prop_flat_map(|v| {
                    let scores = proptest::collection::vec(1_f32..4_f32, v.len()..=v.len());
                    scores.prop_map(move |s| {
                        let mut postings: Vec<_> = v.iter().copied().zip(s.iter().copied()).collect();
                        postings.sort_by_key(|p| p.0);
                        postings.dedup_by_key(|p| p.0);
                        VecDocSet::new(postings, 16)
                    })
                }),
            2..5)
        ) {
            union_vs_bmw(postings);
        }
    }

    #[test]
    fn test_find_pivot_position() {
        let postings = vec![
            VecDocSet::started(vec![(0, 2.0)], 1),
            VecDocSet::started(vec![(1, 3.0)], 1),
            VecDocSet::started(vec![(2, 4.0)], 1),
            VecDocSet::started(vec![(3, 5.0)], 1),
            VecDocSet::started(vec![(3, 6.0)], 1),
        ];
        assert_eq!(
            find_pivot_position(postings.iter(), 2.0f32),
            Some(Pivot {
                position: 1,
                doc: 1,
                first_occurrence: 1,
            })
        );
        assert_eq!(
            find_pivot_position(postings.iter(), 5.0f32),
            Some(Pivot {
                position: 2,
                doc: 2,
                first_occurrence: 2,
            })
        );
        assert_eq!(
            find_pivot_position(postings.iter(), 9.0f32),
            Some(Pivot {
                position: 4,
                doc: 3,
                first_occurrence: 3,
            })
        );
        assert_eq!(
            find_pivot_position(postings.iter(), 20.0f32),
            None
        );
    }

    #[test]
    fn test_find_next_relevant_doc_before_pivot() {
        let mut postings = vec![
            Box::new(VecDocSet::started(vec![(0, 0.0), (3, 0.0)], 2)),
            Box::new(VecDocSet::started(vec![(1, 0.0), (4, 0.0)], 2)),
            Box::new(VecDocSet::started(vec![(2, 0.0), (6, 0.0)], 2)), // pivot
            Box::new(VecDocSet::started(vec![(6, 0.0), (8, 0.0)], 2)),
            Box::new(VecDocSet::started(vec![(6, 0.0), (8, 0.0)], 2)),
        ];
        let (up_to_pivot, rest) = postings.split_at_mut(2);
        let (pivot, after_pivot) = rest.split_first_mut().unwrap();
        let next_doc = find_next_relevant_doc(up_to_pivot, pivot, Some(&mut after_pivot[0]));
        assert_eq!(next_doc, 4);
    }

    #[test]
    fn test_find_next_relevant_doc_prefix_smaller_than_pivot() {
        let mut postings = vec![
            Box::new(VecDocSet::started(vec![(0, 0.0), (3, 0.0)], 2)),
            Box::new(VecDocSet::started(vec![(1, 0.0), (4, 0.0)], 2)),
            Box::new(VecDocSet::started(vec![(5, 0.0), (8, 0.0)], 2)), // pivot
            Box::new(VecDocSet::started(vec![(6, 0.0), (8, 0.0)], 2)),
            Box::new(VecDocSet::started(vec![(6, 0.0), (8, 0.0)], 2)),
        ];
        let (up_to_pivot, rest) = postings.split_at_mut(2);
        let (pivot, after_pivot) = rest.split_first_mut().unwrap();
        let next_doc = find_next_relevant_doc(up_to_pivot, pivot, Some(&mut after_pivot[0]));
        assert_eq!(next_doc, 6);
    }

    #[test]
    fn test_find_next_relevant_doc_after_pivot() {
        let mut postings = vec![
            Box::new(VecDocSet::started(vec![(0, 0.0), (8, 0.0)], 2)),
            Box::new(VecDocSet::started(vec![(1, 0.0), (8, 0.0)], 2)),
            Box::new(VecDocSet::started(vec![(2, 0.0), (8, 0.0)], 2)), // pivot
            Box::new(VecDocSet::started(vec![(5, 0.0), (7, 0.0)], 2)),
            Box::new(VecDocSet::started(vec![(6, 0.0), (7, 0.0)], 2)),
        ];
        let (up_to_pivot, rest) = postings.split_at_mut(2);
        let (pivot, after_pivot) = rest.split_first_mut().unwrap();
        let next_doc = find_next_relevant_doc(up_to_pivot, pivot, Some(&mut after_pivot[0]));
        assert_eq!(next_doc, 5);
    }

    #[test]
    fn test_sift_down_already_sifted() {
        let mut postings = vec![
            Box::new(VecDocSet::started(vec![(0, 0.0), (8, 0.0)], 2)),
            Box::new(VecDocSet::started(vec![(1, 0.0), (8, 0.0)], 2)),
            Box::new(VecDocSet::started(vec![(2, 0.0), (8, 0.0)], 2)), // pivot
            Box::new(VecDocSet::started(vec![(5, 0.0), (7, 0.0)], 2)),
            Box::new(VecDocSet::started(vec![(6, 0.0), (7, 0.0)], 2)),
        ];
        sift_down(&mut postings[2..]);
        assert_eq!(
            postings.into_iter().map(|p| p.doc()).collect::<Vec<_>>(),
            vec![0, 1, 2, 5, 6]
        );
    }

    #[test]
    fn test_sift_down_sift_one_down() {
        let mut postings = vec![
            Box::new(VecDocSet::started(vec![(0, 0.0), (8, 0.0)], 2)),
            Box::new(VecDocSet::started(vec![(1, 0.0), (8, 0.0)], 2)),
            Box::new(VecDocSet::started(vec![(6, 0.0), (8, 0.0)], 2)), // pivot
            Box::new(VecDocSet::started(vec![(5, 0.0), (7, 0.0)], 2)),
            Box::new(VecDocSet::started(vec![(7, 0.0), (7, 0.0)], 2)),
        ];
        sift_down(&mut postings[2..]);
        assert_eq!(
            postings.into_iter().map(|p| p.doc()).collect::<Vec<_>>(),
            vec![0, 1, 5, 6, 7]
        );
    }

    #[test]
    fn test_sift_down_to_bottom() {
        let mut postings = vec![
            Box::new(VecDocSet::started(vec![(0, 0.0), (8, 0.0)], 2)),
            Box::new(VecDocSet::started(vec![(1, 0.0), (8, 0.0)], 2)),
            Box::new(VecDocSet::started(vec![(7, 0.0), (8, 0.0)], 2)), // pivot
            Box::new(VecDocSet::started(vec![(5, 0.0), (7, 0.0)], 2)),
            Box::new(VecDocSet::started(vec![(6, 0.0), (7, 0.0)], 2)),
        ];
        sift_down(&mut postings[2..]);
        assert_eq!(
            postings.into_iter().map(|p| p.doc()).collect::<Vec<_>>(),
            vec![0, 1, 5, 6, 7]
        );
    }
    */
}

@@ -1,10 +1,10 @@
/*! Query Module */
/*!
Query
*/

mod all_query;
mod automaton_weight;
mod bitset;
mod block_max_scorer;
mod block_max_wand;
mod bm25;
mod boolean_query;
mod boost_query;
@@ -24,6 +24,7 @@ mod term_query;
mod union;
mod weight;

#[cfg(test)]
mod vec_docset;

@@ -37,15 +38,13 @@ pub use self::vec_docset::VecDocSet;
pub use self::all_query::{AllQuery, AllScorer, AllWeight};
pub use self::automaton_weight::AutomatonWeight;
pub use self::bitset::BitSetDocSet;
pub use self::block_max_scorer::BlockMaxScorer;
pub use self::boolean_query::BooleanQuery;
pub use self::boost_query::BoostQuery;
pub use self::empty_query::{EmptyQuery, EmptyScorer, EmptyWeight};
pub use self::exclude::Exclude;
pub use self::explanation::Explanation;
#[cfg(test)]
pub(crate) use self::fuzzy_query::DFAWrapper;
pub use self::fuzzy_query::FuzzyTermQuery;
pub(crate) use self::fuzzy_query::DFAWrapper;
pub use self::intersection::intersect_scorers;
pub use self::phrase_query::PhraseQuery;
pub use self::query::Query;
@@ -59,8 +58,6 @@ pub use self::scorer::Scorer;
pub use self::term_query::TermQuery;
pub use self::weight::Weight;
pub use tantivy_query_grammar::Occur;
pub use self::weight::PruningScorerIfPossible;

#[cfg(test)]
mod tests {

@@ -1,11 +1,17 @@
use super::Weight;
use crate::core::searcher::Searcher;
use crate::query::Explanation;
use crate::DocAddress;
use crate::{DocAddress, Score};
use crate::Term;
use downcast_rs::impl_downcast;
use std::collections::BTreeSet;
use std::fmt;
use crate::collector::{TopDocs, Collector, SegmentCollector, Count};

pub struct TopKResult {
    pub count: u64,
    docs: Vec<(Score, DocAddress)>,
}

/// The `Query` trait defines a set of documents and a scoring method
/// for those documents.
@@ -56,6 +62,27 @@ pub trait Query: QueryClone + downcast_rs::Downcast + fmt::Debug {
        weight.explain(reader, doc_address.doc())
    }

    fn top_k(&self, searcher: &Searcher, num_hits: usize) -> crate::Result<TopKResult> {
        let top_docs = TopDocs::with_limit(num_hits);
        let collector = (Count, top_docs);
        let weight = self.weight(searcher, false)?;
        let mut child_fruits = Vec::with_capacity(searcher.segment_readers().len());
        for (segment_ord, reader) in searcher.segment_readers().iter().enumerate() {
            let mut child_top_k = collector.for_segment(segment_ord as u32, reader)?;
            let mut scorer = weight.scorer(reader, 1.0f32)?;
            // TODO: handle deletes.
            scorer.top_k(&mut child_top_k);
            child_fruits.push(child_top_k.harvest());
        }
        let (count, docs) = collector.merge_fruits(child_fruits)?;
        Ok(TopKResult {
            count: count as u64,
            docs,
        })
    }
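
    // Illustrative usage of `top_k` above (a sketch; `query` and `searcher`
    // are assumed to come from a populated index):
    //
    //     let result = query.top_k(&searcher, 10)?;
    //     println!("{} documents matched", result.count);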

    /// Returns the number of documents matching the query.
    fn count(&self, searcher: &Searcher) -> crate::Result<usize> {
        let weight = self.weight(searcher, false)?;

@@ -4,6 +4,8 @@ use crate::DocId;
use crate::Score;
use downcast_rs::impl_downcast;
use std::ops::DerefMut;
use crate::collector::{SegmentCollector, Collector, TopDocs, Count};

/// Scored set of documents matching a query within a specific segment.
///
@@ -21,6 +23,12 @@ pub trait Scorer: downcast_rs::Downcast + DocSet + 'static {
            callback(self.doc(), self.score());
        }
    }

    fn top_k(&mut self, collector: &mut <(Count, TopDocs) as Collector>::Child) {
        while self.advance() {
            collector.collect(self.doc(), self.score());
        }
    }
}

impl_downcast!(Scorer);

@@ -1,98 +0,0 @@
use crate::docset::{DocSet, SkipResult};
use crate::query::{Explanation, Scorer};
use crate::DocId;
use crate::Score;

use crate::fieldnorm::FieldNormReader;
use crate::postings::Postings;
use crate::postings::{BlockMaxPostings, BlockMaxSegmentPostings};
use crate::query::bm25::BM25Weight;
use crate::query::BlockMaxScorer;

pub struct BlockMaxTermScorer {
    postings: BlockMaxSegmentPostings,
    fieldnorm_reader: FieldNormReader,
    similarity_weight: BM25Weight,
}

impl BlockMaxTermScorer {
    pub fn new(
        postings: BlockMaxSegmentPostings,
        fieldnorm_reader: FieldNormReader,
        similarity_weight: BM25Weight,
    ) -> Self {
        Self {
            postings,
            fieldnorm_reader,
            similarity_weight,
        }
    }
}

impl BlockMaxTermScorer {
    fn _score(&self, fieldnorm_id: u8, term_freq: u32) -> Score {
        self.similarity_weight.score(fieldnorm_id, term_freq)
    }

    pub fn term_freq(&self) -> u32 {
        self.postings.term_freq()
    }

    pub fn fieldnorm_id(&self) -> u8 {
        self.fieldnorm_reader.fieldnorm_id(self.doc())
    }

    pub fn explain(&self) -> Explanation {
        let fieldnorm_id = self.fieldnorm_id();
        let term_freq = self.term_freq();
        self.similarity_weight.explain(fieldnorm_id, term_freq)
    }
}

impl DocSet for BlockMaxTermScorer {
    fn advance(&mut self) -> bool {
        self.postings.advance()
    }

    fn skip_next(&mut self, target: DocId) -> SkipResult {
        self.postings.skip_next(target)
    }

    fn doc(&self) -> DocId {
        self.postings.doc()
    }

    fn size_hint(&self) -> u32 {
        self.postings.size_hint()
    }
}

impl Scorer for BlockMaxTermScorer {
    fn score(&mut self) -> Score {
        self._score(
            self.fieldnorm_reader.fieldnorm_id(self.doc()),
            self.postings.term_freq(),
        )
    }
}

impl BlockMaxScorer for BlockMaxTermScorer {
    fn block_max_score(&mut self) -> Score {
        self._score(
            self.fieldnorm_reader
                .fieldnorm_id(self.postings.block_max_doc()),
            self.postings.term_freq(),
        )
    }

    fn block_max_doc(&mut self) -> DocId {
        self.postings.block_max_doc()
    }

    fn max_score(&self) -> Score {
        self._score(
            self.fieldnorm_reader.fieldnorm_id(self.postings.max_doc()),
            self.postings.max_term_freq(),
        )
    }
}
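
// Editorial note on the impl above: BlockMaxWand pruning is sound only if the
// three bounds nest, i.e. for any hit, score() <= block_max_score() within its
// block, and block_max_score() <= max_score() over the whole posting list;
// these block-level values are exactly what `advance_with_pivot` sums before
// deciding whether to score or to skip.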

@@ -1,9 +1,7 @@
mod block_max_term_scorer;
mod term_query;
mod term_scorer;
mod term_weight;

pub use self::block_max_term_scorer::BlockMaxTermScorer;
pub use self::term_query::TermQuery;
pub use self::term_scorer::TermScorer;
pub use self::term_weight::TermWeight;

@@ -1,7 +1,7 @@
use crate::common::TinySet;
use crate::docset::{DocSet, SkipResult};
use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
use crate::query::{Scorer, BlockMaxScorer};
use crate::query::Scorer;
use crate::DocId;
use crate::Score;
use std::cmp::Ordering;
@@ -9,99 +9,6 @@ use std::cmp::Ordering;
const HORIZON_NUM_TINYBITSETS: usize = 64;
const HORIZON: u32 = 64u32 * HORIZON_NUM_TINYBITSETS as u32;

#[derive(Debug, Copy, Clone, PartialEq, Eq)]
struct Pivot {
    position: usize,
    first_occurrence: usize,
    doc: DocId,
}

/// Find the position in the sorted list of posting lists of the **pivot**.
///
/// The docsets need to be advanced, and are required to be sorted by the doc they point to.
///
/// The pivot is then defined as the lowest DocId that has a chance of matching our condition.
fn find_pivot_position<'a, TScorer>(
    mut docsets: impl Iterator<Item = &'a TScorer>,
    lower_bound_score: Score,
) -> Option<Pivot>
where
    TScorer: BlockMaxScorer,
{
    let mut position = 0;
    let mut upper_bound = Score::default();
    while let Some(docset) = docsets.next() {
        upper_bound += docset.max_score();
        if lower_bound_score < upper_bound {
            let pivot_doc = docset.doc();
            let first_occurrence = position;
            while let Some(docset) = docsets.next() {
                if docset.doc() != pivot_doc {
                    break;
                } else {
                    position += 1;
                }
            }
            return Some(Pivot {
                position,
                doc: pivot_doc,
                first_occurrence,
            });
        }
        position += 1;
    }
    None
}

/// Sifts down the first element of the slice.
///
/// `docsets[1..]` are assumed sorted.
/// This function swaps `docsets[0]` with its right
/// neighbor successively (bubble-sort style) until it reaches the first
/// position such that `docsets` is sorted.
fn sift_down<TScorer>(docsets: &mut [TScorer])
where
    TScorer: BlockMaxScorer + Scorer,
{
    for idx in 1..docsets.len() {
        if docsets[idx].doc() >= docsets[idx - 1].doc() {
            return;
        }
        docsets.swap(idx, idx - 1);
    }
}

/// Given an iterator over all ordered lists up to the pivot (inclusive) and the following list
/// (if it exists), it returns the next document ID that can possibly be relevant, based on the
/// block max scores.
fn find_next_relevant_doc<TScorer>(
    docsets_up_to_pivot: &mut [TScorer],
    pivot_docset: &mut TScorer,
    docset_after_pivot: Option<&mut TScorer>,
) -> DocId
where
    TScorer: BlockMaxScorer + Scorer,
{
    let mut next_doc = 1 + docsets_up_to_pivot
        .iter_mut()
        .map(|docset| docset.block_max_doc())
        .chain(std::iter::once(pivot_docset.block_max_doc()))
        .min()
        .unwrap();
    if let Some(docset) = docset_after_pivot {
        let doc = docset.doc();
        if doc < next_doc {
            next_doc = doc;
        }
    }
    if next_doc <= pivot_docset.doc() {
        pivot_docset.doc() + 1
    } else {
        next_doc
    }
}

// `drain_filter` is not stable yet.
// This function is similar, except that it is stable and
// does not keep the original vector ordering.
@@ -132,7 +39,6 @@ pub struct Union<TScorer, TScoreCombiner = DoNothingCombiner> {
    score: Score,
}

impl<TScorer, TScoreCombiner> From<Vec<TScorer>> for Union<TScorer, TScoreCombiner>
where
    TScoreCombiner: ScoreCombiner,
@@ -220,76 +126,6 @@ impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Union<TScorer, TScoreCombin
        }
        false
    }
}

impl<TScorer: BlockMaxScorer, TScoreCombiner: ScoreCombiner> Union<TScorer, TScoreCombiner> {
    fn advance_with_pivot(&mut self, pivot: Pivot, lower_bound_score: Score) -> SkipResult {
        let block_upper_bound: Score = self.docsets[..=pivot.position]
            .iter_mut()
            .map(|docset| docset.block_max_score())
            .sum();
        if block_upper_bound > lower_bound_score {
            if pivot.doc == self.docsets[0].doc() {
                // Since self.docsets is sorted by their current doc, in this branch, all
                // docsets in [0..=pivot] are positioned on pivot.doc.
                //
                // Let's compute the actual score for this doc.
                //
                // NOTE(elshize): One additional check needs to be done to improve performance:
                // update block-wise bound while accumulating score with the actual score,
                // and check each time if still above threshold.
                let mut combiner = TScoreCombiner::default();
                for idx in (0..=pivot.position).rev() {
                    combiner.update(&mut self.docsets[idx]);
                    if !self.docsets[idx].advance() {
                        self.docsets.swap_remove(idx);
                    }
                }
                self.score = combiner.score();
                self.doc = pivot.doc;
                self.docsets.sort_by_key(TScorer::doc);
                SkipResult::Reached
            } else {
                // The subtraction does not underflow because otherwise we would go to the other
                // branch.
                //
                // `advanced_idx` is the last idx that is not positioned on the pivot yet.
                let advanced_idx = pivot.first_occurrence - 1;
                if !self.docsets[advanced_idx].advance() {
                    self.docsets.swap_remove(advanced_idx);
                }
                if self.docsets.is_empty() {
                    return SkipResult::End;
                }
                sift_down(&mut self.docsets[advanced_idx..]);
                SkipResult::OverStep
            }
        } else {
            let (up_to_pivot, pivot_and_rest) = self.docsets.split_at_mut(pivot.position);
            let (pivot, after_pivot) = pivot_and_rest.split_first_mut().unwrap();
            let next_doc = find_next_relevant_doc(up_to_pivot, pivot, after_pivot.first_mut());
            // NOTE(elshize): It might be more efficient to advance the list with the higher
            // max score, but let's advance the first one for now for simplicity.
            if self.docsets[0].skip_next(next_doc) == SkipResult::End {
                self.docsets.swap_remove(0);
            }
            if self.docsets.is_empty() {
                return SkipResult::End;
            }
            sift_down(&mut self.docsets[..]);
            SkipResult::OverStep
        }
    }

    /// Find the position in the sorted list of posting lists of the **pivot**.
    fn find_pivot_position(&self, lower_bound_score: Score) -> Option<Pivot> {
        find_pivot_position(self.docsets.iter(), lower_bound_score)
    }
}

impl<TScorer, TScoreCombiner> DocSet for Union<TScorer, TScoreCombiner>

@@ -1,26 +1,7 @@
use super::Scorer;
use crate::core::SegmentReader;
use crate::query::Explanation;
use crate::{DocId, Score};

pub trait PruningScorer {
    fn doc(&self) -> DocId;

    fn score(&self) -> Score;

    /// Advance to the next document that has a score strictly greater than
    /// `lower_bound_score`.
    fn advance_with_pruning(&mut self, lower_bound_score: f32) -> bool;

    fn advance(&mut self) -> bool {
        self.advance_with_pruning(std::f32::NEG_INFINITY)
    }
}

pub enum PruningScorerIfPossible {
    Pruning(Box<dyn PruningScorer>),
    NonPruning(Box<dyn Scorer>),
}
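
// Illustrative dispatch over the enum above (a sketch, not part of the file):
// fall back to a plain exhaustive loop when no pruning-capable scorer exists.
// The `on_hit` callback is a hypothetical stand-in.
fn drive_scorer(
    scorer: PruningScorerIfPossible,
    lower_bound: Score,
    mut on_hit: impl FnMut(DocId, Score),
) {
    match scorer {
        PruningScorerIfPossible::Pruning(mut pruning) => {
            while pruning.advance_with_pruning(lower_bound) {
                on_hit(pruning.doc(), pruning.score());
            }
        }
        PruningScorerIfPossible::NonPruning(mut plain) => {
            while plain.advance() {
                let score = plain.score();
                on_hit(plain.doc(), score);
            }
        }
    }
}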
use crate::DocId;

/// A Weight is the specialization of a Query
/// for a given set of segments.
@@ -34,11 +15,6 @@ pub trait Weight: Send + Sync + 'static {
    /// See [`Query`](./trait.Query.html).
    fn scorer(&self, reader: &SegmentReader, boost: f32) -> crate::Result<Box<dyn Scorer>>;

    fn pruning_scorer(&self, reader: &SegmentReader, boost: f32) -> crate::Result<PruningScorerIfPossible> {
        let scorer = self.scorer(reader, boost)?;
        Ok(PruningScorerIfPossible::NonPruning(scorer))
    }

    /// Returns an `Explanation` for the given document.
    fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation>;

@@ -9,7 +9,6 @@ use crate::directory::META_LOCK;
use crate::Index;
use crate::Searcher;
use crate::SegmentReader;
use std::convert::TryInto;
use std::sync::Arc;

/// Defines when a new version of the index should be reloaded.
@@ -61,6 +60,7 @@ impl IndexReaderBuilder {
    /// Building the reader is a non-trivial operation that requires
    /// opening the different segment readers. It may take hundreds of
    /// milliseconds and may return an error.
    /// TODO(pmasurel) Use the `TryInto` trait once it is available in stable.
    pub fn try_into(self) -> crate::Result<IndexReader> {
        let inner_reader = InnerIndexReader {
            index: self.index,
@@ -113,14 +113,6 @@ impl IndexReaderBuilder {
    }
}

impl TryInto<IndexReader> for IndexReaderBuilder {
    type Error = crate::TantivyError;

    fn try_into(self) -> crate::Result<IndexReader> {
        IndexReaderBuilder::try_into(self)
    }
}
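
// Illustrative usage of the conversion above (a sketch; assumes the usual
// `Index::reader_builder()` entry point):
fn open_reader(index: &Index) -> crate::Result<IndexReader> {
    // The inherent `try_into` above is selected here; the `TryInto` impl
    // delegates to the same code path when a type annotation is used instead.
    index.reader_builder().try_into()
}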

struct InnerIndexReader {
    num_searchers: usize,
    searcher_pool: Pool<Searcher>,

@@ -3,6 +3,7 @@ use crate::common::BinarySerializable;
use crate::common::VInt;
use crate::tokenizer::PreTokenizedString;
use crate::DateTime;
use serde;
use std::io::{self, Read, Write};
use std::mem;

@@ -1,4 +1,5 @@
use crate::common::BinarySerializable;
use serde;
use std::io;
use std::io::Read;
use std::io::Write;

@@ -1,7 +1,10 @@
use crate::common::BinarySerializable;
use crate::schema::Field;
use crate::schema::Value;
use std::io::{self, Read, Write};
use serde;
use std::io;
use std::io::Read;
use std::io::Write;

/// `FieldValue` holds together a `Field` and its `Value`.
#[derive(Debug, Clone, Ord, PartialEq, Eq, PartialOrd, serde::Serialize, serde::Deserialize)]

@@ -1,3 +1,5 @@
use snap;

use std::io::{self, Read, Write};

/// Name of the compression scheme used in the doc store.

@@ -434,8 +434,8 @@ mod tests {

    #[test]
    fn test_automaton_search() {
        use crate::query::DFAWrapper;
        use levenshtein_automata::LevenshteinAutomatonBuilder;
        use crate::query::DFAWrapper;

        const COUNTRIES: [&'static str; 7] = [
            "San Marino",

@@ -7,6 +7,7 @@ use crate::postings::TermInfo;
use crate::termdict::TermOrdinal;
use once_cell::sync::Lazy;
use std::io::{self, Write};
use tantivy_fst;
use tantivy_fst::raw::Fst;
use tantivy_fst::Automaton;