mirror of https://github.com/quickwit-oss/tantivy.git
synced 2026-03-19 11:40:42 +00:00

Compare commits (21 commits): postings-w... → composite-...

Commit SHAs (author, date, and message columns not captured):
e6fdba1d21, ad5eb250fe, d02559a4d1, 1922abaf33, d0c5ffb0aa, 18fedd9384, 2098fca47f,
1251b40c93, 09a49b872c, b9ace002ce, 2dc4e9ef78, aeea65f61d, 4211d5a1ed, d50c7a1daf,
cf760fd5b6, df04c7d8f1, 68626bf3a1, 51f340f83d, 7eca33143e, 698f073f88, cdd24b7ee5
@@ -47,7 +47,7 @@ rustc-hash = "2.0.0"
thiserror = "2.0.1"
htmlescape = "0.3.1"
fail = { version = "0.5.0", optional = true }
time = { version = "0.3.35", features = ["serde-well-known"] }
time = { version = "0.3.47", features = ["serde-well-known"] }
smallvec = "1.8.0"
rayon = "1.5.2"
lru = "0.16.3"

@@ -64,8 +64,8 @@ query-grammar = { version = "0.25.0", path = "./query-grammar", package = "tanti
tantivy-bitpacker = { version = "0.9", path = "./bitpacker" }
common = { version = "0.10", path = "./common/", package = "tantivy-common" }
tokenizer-api = { version = "0.6", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
sketches-ddsketch = { version = "0.3.0", features = ["use_serde"] }
hyperloglogplus = { version = "0.4.1", features = ["const-loop"] }
sketches-ddsketch = { git = "https://github.com/quickwit-oss/rust-sketches-ddsketch.git", rev = "555caf1", features = ["use_serde"] }
datasketches = "0.2.0"
futures-util = { version = "0.3.28", optional = true }
futures-channel = { version = "0.3.28", optional = true }
fnv = "1.0.7"

@@ -86,7 +86,7 @@ futures = "0.3.21"
paste = "1.0.11"
more-asserts = "0.3.1"
rand_distr = "0.5"
time = { version = "0.3.10", features = ["serde-well-known", "macros"] }
time = { version = "0.3.47", features = ["serde-well-known", "macros"] }
postcard = { version = "1.0.4", features = [
    "use-std",
], default-features = false }

@@ -201,4 +201,3 @@ harness = false
[[bench]]
name = "regex_all_terms"
harness = false

@@ -10,7 +10,7 @@ use tantivy::aggregation::agg_req::Aggregations;
use tantivy::aggregation::AggregationCollector;
use tantivy::query::{AllQuery, TermQuery};
use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, FAST, STRING};
use tantivy::{doc, Index, Term};
use tantivy::{doc, DateTime, Index, Term};

#[global_allocator]
pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;

@@ -70,6 +70,12 @@ fn bench_agg(mut group: InputGroup<Index>) {

    register!(group, terms_many_json_mixed_type_with_avg_sub_agg);

    register!(group, composite_term_many_page_1000);
    register!(group, composite_term_many_page_1000_with_avg_sub_agg);
    register!(group, composite_term_few);
    register!(group, composite_histogram);
    register!(group, composite_histogram_calendar);

    register!(group, cardinality_agg);
    register!(group, terms_status_with_cardinality_agg);
@@ -314,6 +320,75 @@ fn terms_many_json_mixed_type_with_avg_sub_agg(index: &Index) {
    execute_agg(index, agg_req);
}

fn composite_term_few(index: &Index) {
    let agg_req = json!({
        "my_ctf": {
            "composite": {
                "sources": [
                    { "text_few_terms": { "terms": { "field": "text_few_terms" } } }
                ],
                "size": 1000
            }
        },
    });
    execute_agg(index, agg_req);
}
fn composite_term_many_page_1000(index: &Index) {
    let agg_req = json!({
        "my_ctmp1000": {
            "composite": {
                "sources": [
                    { "text_many_terms": { "terms": { "field": "text_many_terms" } } }
                ],
                "size": 1000
            }
        },
    });
    execute_agg(index, agg_req);
}
fn composite_term_many_page_1000_with_avg_sub_agg(index: &Index) {
    let agg_req = json!({
        "my_ctmp1000wasa": {
            "composite": {
                "sources": [
                    { "text_many_terms": { "terms": { "field": "text_many_terms" } } }
                ],
                "size": 1000,
            },
            "aggs": {
                "average_f64": { "avg": { "field": "score_f64" } }
            }
        },
    });
    execute_agg(index, agg_req);
}
fn composite_histogram(index: &Index) {
    let agg_req = json!({
        "my_ch": {
            "composite": {
                "sources": [
                    { "f64_histogram": { "histogram": { "field": "score_f64", "interval": 1 } } }
                ],
                "size": 1000
            }
        },
    });
    execute_agg(index, agg_req);
}
fn composite_histogram_calendar(index: &Index) {
    let agg_req = json!({
        "my_chc": {
            "composite": {
                "sources": [
                    { "time_histogram": { "date_histogram": { "field": "timestamp", "calendar_interval": "month" } } }
                ],
                "size": 1000
            }
        },
    });
    execute_agg(index, agg_req);
}

fn execute_agg(index: &Index, agg_req: serde_json::Value) {
    let agg_req: Aggregations = serde_json::from_value(agg_req).unwrap();
    let collector = get_collector(agg_req);
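The benchmark's `execute_agg`/`get_collector` helpers are only partially shown above. As context, a minimal self-contained way to run one of these JSON requests against an index looks roughly like the sketch below; it assumes tantivy's `AggregationCollector::from_aggs` API with default limits, and the `run_agg` helper name is hypothetical:

```rust
use tantivy::aggregation::agg_req::Aggregations;
use tantivy::aggregation::AggregationCollector;
use tantivy::query::AllQuery;
use tantivy::Index;

// Hypothetical helper mirroring what `execute_agg` does in the benchmark:
// parse the JSON request, wrap it in a collector, and run it over all documents.
fn run_agg(index: &Index, agg_req: serde_json::Value) -> tantivy::Result<String> {
    let agg_req: Aggregations = serde_json::from_value(agg_req).expect("invalid agg request");
    // Default::default() uses the default aggregation memory limits.
    let collector = AggregationCollector::from_aggs(agg_req, Default::default());
    let searcher = index.reader()?.searcher();
    let agg_res = searcher.search(&AllQuery, &collector)?;
    // The final result serializes to an Elasticsearch-style JSON response.
    Ok(serde_json::to_string_pretty(&agg_res).expect("serializable"))
}
```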
@@ -496,6 +571,7 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
    let text_field_all_unique_terms =
        schema_builder.add_text_field("text_all_unique_terms", STRING | FAST);
    let text_field_many_terms = schema_builder.add_text_field("text_many_terms", STRING | FAST);
    let text_field_few_terms = schema_builder.add_text_field("text_few_terms", STRING | FAST);
    let text_field_few_terms_status =
        schema_builder.add_text_field("text_few_terms_status", STRING | FAST);
    let text_field_1000_terms_zipf =

@@ -504,6 +580,7 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
    let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
    let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
    let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
    let date_field = schema_builder.add_date_field("timestamp", FAST);
    // use tmp dir
    let index = if reuse_index {
        Index::create_in_dir("agg_bench", schema_builder.build())?

@@ -523,6 +600,7 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
    let log_level_distribution =
        WeightedIndex::new(status_field_data.iter().map(|item| item.1)).unwrap();

    let few_terms_data = ["INFO", "ERROR", "WARN", "DEBUG"];
    let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap();

    let many_terms_data = (0..150_000)

@@ -558,6 +636,8 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
        text_field_all_unique_terms => "coolo",
        text_field_many_terms => "cool",
        text_field_many_terms => "cool",
        text_field_few_terms => "cool",
        text_field_few_terms => "cool",
        text_field_few_terms_status => log_level_sample_a,
        text_field_few_terms_status => log_level_sample_b,
        text_field_1000_terms_zipf => term_1000_a.as_str(),

@@ -588,11 +668,13 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
        json_field => json,
        text_field_all_unique_terms => format!("unique_term_{}", rng.random::<u64>()),
        text_field_many_terms => many_terms_data.choose(&mut rng).unwrap().to_string(),
        text_field_few_terms => few_terms_data.choose(&mut rng).unwrap().to_string(),
        text_field_few_terms_status => status_field_data[log_level_distribution.sample(&mut rng)].0,
        text_field_1000_terms_zipf => terms_1000[zipf_1000.sample(&mut rng) as usize - 1].as_str(),
        score_field => val as u64,
        score_field_f64 => lg_norm.sample(&mut rng),
        score_field_i64 => val as i64,
        date_field => DateTime::from_timestamp_millis((val * 1_000_000.) as i64),
    ))?;
    if cardinality == Cardinality::OptionalSparse {
        for _ in 0..20 {

@@ -17,6 +17,7 @@ use rand::rngs::StdRng;
use rand::SeedableRng;
use tantivy::collector::{Count, DocSetCollector};
use tantivy::query::RangeQuery;
use tantivy::schema::document::TantivyDocument;
use tantivy::schema::{Schema, Value, FAST, STORED, STRING};
use tantivy::{doc, Index, ReloadPolicy, Searcher, Term};

@@ -44,7 +45,7 @@ fn build_shared_indices(num_docs: usize, distribution: &str) -> BenchIndex {
    match distribution {
        "dense_random" => {
            for _doc_id in 0..num_docs {
                let suffix = rng.random_range(0u64..1000u64);
                let suffix = rng.gen_range(0u64..1000u64);
                let str_val = format!("str_{:03}", suffix);

                writer

@@ -70,7 +71,7 @@ fn build_shared_indices(num_docs: usize, distribution: &str) -> BenchIndex {
        }
        "sparse_random" => {
            for _doc_id in 0..num_docs {
                let suffix = rng.random_range(0u64..1000000u64);
                let suffix = rng.gen_range(0u64..1000000u64);
                let str_val = format!("str_{:07}", suffix);

                writer

@@ -405,7 +406,7 @@ impl FetchAllStringsFromDocTask {

        for doc_address in docs {
            // Get the document from the doc store (row store access)
            if let Ok(doc) = self.searcher.doc(doc_address) {
            if let Ok(doc) = self.searcher.doc::<TantivyDocument>(doc_address) {
                // Extract string values from the stored field
                if let Some(field_value) = doc.get_first(str_stored_field) {
                    if let Some(text) = field_value.as_value().as_str() {
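Several call sites in this changeset turn `searcher.doc(addr)` into `searcher.doc::<TantivyDocument>(addr)`: document retrieval is generic over the document type, so the concrete type must be named wherever it cannot be inferred. A minimal sketch of the pattern, using the same `TantivyDocument` and `Value` imports as the benchmark (the `stored_text` helper is illustrative only):

```rust
use tantivy::schema::document::TantivyDocument;
use tantivy::schema::{Field, Value};
use tantivy::{DocAddress, Searcher};

// Fetch a stored string field from the doc store for one hit.
// Returns None if the document has no value for the field or it is not a string.
fn stored_text(searcher: &Searcher, addr: DocAddress, field: Field) -> tantivy::Result<Option<String>> {
    // Equivalent to `searcher.doc::<TantivyDocument>(addr)?` — the annotation drives inference.
    let doc: TantivyDocument = searcher.doc(addr)?;
    Ok(doc
        .get_first(field)
        .and_then(|value| value.as_str())
        .map(|s| s.to_string()))
}
```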
@@ -59,7 +59,7 @@ pub struct RowAddr {
    pub row_id: RowId,
}

pub use sstable::Dictionary;
pub use sstable::{Dictionary, TermOrdHit};
pub type Streamer<'a> = sstable::Streamer<'a, VoidSSTable>;

pub use common::DateTime;

@@ -15,11 +15,10 @@ repository = "https://github.com/quickwit-oss/tantivy"
byteorder = "1.4.3"
ownedbytes = { version= "0.9", path="../ownedbytes" }
async-trait = "0.1"
time = { version = "0.3.10", features = ["serde-well-known"] }
time = { version = "0.3.47", features = ["serde-well-known"] }
serde = { version = "1.0.136", features = ["derive"] }

[dev-dependencies]
binggan = "0.14.0"
proptest = "1.0.0"
rand = "0.9"
@@ -178,11 +178,13 @@ impl TinySet {
#[derive(Clone)]
pub struct BitSet {
    tinysets: Box<[TinySet]>,
    len: u64,
    max_value: u32,
}
impl std::fmt::Debug for BitSet {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("BitSet")
            .field("len", &self.len)
            .field("max_value", &self.max_value)
            .finish()
    }

@@ -210,6 +212,7 @@ impl BitSet {
        let tinybitsets = vec![TinySet::empty(); num_buckets as usize].into_boxed_slice();
        BitSet {
            tinysets: tinybitsets,
            len: 0,
            max_value,
        }
    }

@@ -227,6 +230,7 @@ impl BitSet {
        }
        BitSet {
            tinysets: tinybitsets,
            len: max_value as u64,
            max_value,
        }
    }

@@ -245,19 +249,17 @@ impl BitSet {

    /// Intersect with tinysets
    fn intersect_update_with_iter(&mut self, other: impl Iterator<Item = TinySet>) {
        self.len = 0;
        for (left, right) in self.tinysets.iter_mut().zip(other) {
            *left = left.intersect(right);
            self.len += left.len() as u64;
        }
    }

    /// Returns the number of elements in the `BitSet`.
    #[inline]
    pub fn len(&self) -> usize {
        self.tinysets
            .iter()
            .copied()
            .map(|tinyset| tinyset.len())
            .sum::<u32>() as usize
        self.len as usize
    }

    /// Inserts an element in the `BitSet`

@@ -266,7 +268,7 @@ impl BitSet {
        // we do not check saturated els.
        let higher = el / 64u32;
        let lower = el % 64u32;
        self.tinysets[higher as usize].insert_mut(lower);
        self.len += u64::from(self.tinysets[higher as usize].insert_mut(lower));
    }

    /// Inserts an element in the `BitSet`

@@ -275,7 +277,7 @@ impl BitSet {
        // we do not check saturated els.
        let higher = el / 64u32;
        let lower = el % 64u32;
        self.tinysets[higher as usize].remove_mut(lower);
        self.len -= u64::from(self.tinysets[higher as usize].remove_mut(lower));
    }

    /// Returns true iff the elements is in the `BitSet`.

@@ -297,9 +299,6 @@ impl BitSet {
            .map(|delta_bucket| bucket + delta_bucket as u32)
    }

    /// Returns the maximum number of elements in the bitset.
    ///
    /// Warning: The largest element the bitset can contain is `max_value - 1`.
    #[inline]
    pub fn max_value(&self) -> u32 {
        self.max_value
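The `len` change above swaps an O(num_buckets) recount for a counter maintained incrementally: `insert_mut`/`remove_mut` report whether the bit actually changed, and that boolean is added to or subtracted from `len`. A self-contained sketch of the same pattern (plain `Vec<u64>` words instead of tantivy's `TinySet`; the names here are illustrative only):

```rust
/// Minimal bitset keeping an exact cached length.
struct CountedBitSet {
    words: Vec<u64>,
    len: u64,
}

impl CountedBitSet {
    fn with_max_value(max_value: u32) -> Self {
        let num_words = (max_value as usize).div_ceil(64);
        CountedBitSet { words: vec![0; num_words], len: 0 }
    }

    fn insert(&mut self, el: u32) {
        let (word, bit) = ((el / 64) as usize, el % 64);
        // `newly_set` plays the role of the bool returned by `TinySet::insert_mut`.
        let newly_set = self.words[word] & (1 << bit) == 0;
        self.words[word] |= 1 << bit;
        // u64::from(bool) is 0 or 1, so the counter stays exact without a rescan.
        self.len += u64::from(newly_set);
    }

    fn remove(&mut self, el: u32) {
        let (word, bit) = ((el / 64) as usize, el % 64);
        let was_set = self.words[word] & (1 << bit) != 0;
        self.words[word] &= !(1 << bit);
        self.len -= u64::from(was_set);
    }

    /// O(1) instead of summing the popcount of every word.
    fn len(&self) -> usize {
        self.len as usize
    }
}
```

The visible win is that `BitSet::len()` becomes O(1); the intersection path keeps the invariant by resetting `len` to 0 and re-accumulating it once per intersection, as shown in `intersect_update_with_iter` above.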
@@ -70,7 +70,7 @@ impl Collector for StatsCollector {
    fn for_segment(
        &self,
        _segment_local_id: u32,
        segment_reader: &dyn SegmentReader,
        segment_reader: &SegmentReader,
    ) -> tantivy::Result<StatsSegmentCollector> {
        let fast_field_reader = segment_reader.fast_fields().u64(&self.field)?;
        Ok(StatsSegmentCollector {

@@ -60,7 +60,7 @@ fn main() -> tantivy::Result<()> {
    let count_docs = searcher.search(&*query, &TopDocs::with_limit(4).order_by_score())?;
    assert_eq!(count_docs.len(), 1);
    for (_score, doc_address) in count_docs {
        let retrieved_doc = searcher.doc(doc_address)?;
        let retrieved_doc = searcher.doc::<TantivyDocument>(doc_address)?;
        assert!(retrieved_doc
            .get_first(occurred_at)
            .unwrap()

@@ -65,7 +65,7 @@ fn main() -> tantivy::Result<()> {
    );
    let top_docs_by_custom_score =
        // Call TopDocs with a custom tweak score
        TopDocs::with_limit(2).tweak_score(move |segment_reader: &dyn SegmentReader| {
        TopDocs::with_limit(2).tweak_score(move |segment_reader: &SegmentReader| {
            let ingredient_reader = segment_reader.facet_reader("ingredient").unwrap();
            let facet_dict = ingredient_reader.facet_dict();

@@ -91,7 +91,7 @@ fn main() -> tantivy::Result<()> {
        .iter()
        .map(|(_, doc_id)| {
            searcher
                .doc(*doc_id)
                .doc::<TantivyDocument>(*doc_id)
                .unwrap()
                .get_first(title)
                .and_then(|v| v.as_str().map(|el| el.to_string()))

@@ -91,10 +91,46 @@ fn main() -> tantivy::Result<()> {
        }
    }

    // Some other powerful operations (especially `.seek`) may be useful to consume these
    // A `Term` is a text token associated with a field.
    // Let's go through all docs containing the term `title:the` and access their position
    let term_the = Term::from_field_text(title, "the");

    // Some other powerful operations (especially `.skip_to`) may be useful to consume these
    // posting lists rapidly.
    // You can check for them in the [`DocSet`](https://docs.rs/tantivy/~0/tantivy/trait.DocSet.html) trait
    // and the [`Postings`](https://docs.rs/tantivy/~0/tantivy/trait.Postings.html) trait

    // Also, for some VERY specific high performance use case like an OLAP analysis of logs,
    // you can get better performance by accessing directly the blocks of doc ids.
    for segment_reader in searcher.segment_readers() {
        // A segment contains different data structure.
        // Inverted index stands for the combination of
        // - the term dictionary
        // - the inverted lists associated with each terms and their positions
        let inverted_index = segment_reader.inverted_index(title)?;

        // This segment posting object is like a cursor over the documents matching the term.
        // The `IndexRecordOption` arguments tells tantivy we will be interested in both term
        // frequencies and positions.
        //
        // If you don't need all this information, you may get better performance by decompressing
        // less information.
        if let Some(mut block_segment_postings) =
            inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic)?
        {
            loop {
                let docs = block_segment_postings.docs();
                if docs.is_empty() {
                    break;
                }
                // Once again these docs MAY contains deleted documents as well.
                let docs = block_segment_postings.docs();
                // Prints `Docs [0, 2].`
                println!("Docs {docs:?}");
                block_segment_postings.advance();
            }
        }
    }

    Ok(())
}

@@ -67,7 +67,7 @@ fn main() -> Result<()> {
    let mut titles = top_docs
        .into_iter()
        .map(|(_score, doc_address)| {
            let doc = searcher.doc(doc_address)?;
            let doc = searcher.doc::<TantivyDocument>(doc_address)?;
            let title = doc
                .get_first(title)
                .and_then(|v| v.as_str())

@@ -55,7 +55,7 @@ fn main() -> tantivy::Result<()> {
    let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;

    for (score, doc_address) in top_docs {
        let doc = searcher.doc(doc_address)?;
        let doc = searcher.doc::<TantivyDocument>(doc_address)?;
        let snippet = snippet_generator.snippet_from_doc(&doc);
        println!("Document score {score}:");
        println!("title: {}", doc.get_first(title).unwrap().as_str().unwrap());

@@ -43,7 +43,7 @@ impl DynamicPriceColumn {
    }
}

    pub fn price_for_segment(&self, segment_reader: &dyn SegmentReader) -> Option<Arc<Vec<Price>>> {
    pub fn price_for_segment(&self, segment_reader: &SegmentReader) -> Option<Arc<Vec<Price>>> {
        let segment_key = (segment_reader.segment_id(), segment_reader.delete_opstamp());
        self.price_cache.read().unwrap().get(&segment_key).cloned()
    }

@@ -157,7 +157,7 @@ fn main() -> tantivy::Result<()> {
    let query = query_parser.parse_query("cooking")?;

    let searcher = reader.searcher();
    let score_by_price = move |segment_reader: &dyn SegmentReader| {
    let score_by_price = move |segment_reader: &SegmentReader| {
        let price = price_dynamic_column
            .price_for_segment(segment_reader)
            .unwrap();

@@ -57,7 +57,7 @@ pub(crate) fn get_numeric_or_date_column_types() -> &'static [ColumnType] {

/// Get fast field reader or empty as default.
pub(crate) fn get_ff_reader(
    reader: &dyn SegmentReader,
    reader: &SegmentReader,
    field_name: &str,
    allowed_column_types: Option<&[ColumnType]>,
) -> crate::Result<(columnar::Column<u64>, ColumnType)> {

@@ -74,7 +74,7 @@ pub(crate) fn get_ff_reader(
}

pub(crate) fn get_dynamic_columns(
    reader: &dyn SegmentReader,
    reader: &SegmentReader,
    field_name: &str,
) -> crate::Result<Vec<columnar::DynamicColumn>> {
    let ff_fields = reader.fast_fields().dynamic_column_handles(field_name)?;

@@ -90,7 +90,7 @@ pub(crate) fn get_dynamic_columns(
///
/// Is guaranteed to return at least one column.
pub(crate) fn get_all_ff_reader_or_empty(
    reader: &dyn SegmentReader,
    reader: &SegmentReader,
    field_name: &str,
    allowed_column_types: Option<&[ColumnType]>,
    fallback_type: ColumnType,
@@ -10,9 +10,10 @@ use crate::aggregation::accessor_helpers::{
};
use crate::aggregation::agg_req::{Aggregation, AggregationVariants, Aggregations};
use crate::aggregation::bucket::{
    build_segment_filter_collector, build_segment_range_collector, FilterAggReqData,
    HistogramAggReqData, HistogramBounds, IncludeExcludeParam, MissingTermAggReqData,
    RangeAggReqData, SegmentHistogramCollector, TermMissingAgg, TermsAggReqData, TermsAggregation,
    build_segment_filter_collector, build_segment_range_collector, CompositeAggReqData,
    CompositeAggregation, CompositeSourceAccessors, FilterAggReqData, HistogramAggReqData,
    HistogramBounds, IncludeExcludeParam, MissingTermAggReqData, RangeAggReqData,
    SegmentHistogramCollector, TermMissingAgg, TermsAggReqData, TermsAggregation,
    TermsAggregationInternal,
};
use crate::aggregation::metric::{

@@ -73,6 +74,12 @@ impl AggregationsSegmentCtx {
        self.per_request.filter_req_data.push(Some(Box::new(data)));
        self.per_request.filter_req_data.len() - 1
    }
    pub(crate) fn push_composite_req_data(&mut self, data: CompositeAggReqData) -> usize {
        self.per_request
            .composite_req_data
            .push(Some(Box::new(data)));
        self.per_request.composite_req_data.len() - 1
    }

    #[inline]
    pub(crate) fn get_term_req_data(&self, idx: usize) -> &TermsAggReqData {

@@ -108,6 +115,12 @@ impl AggregationsSegmentCtx {
            .as_deref()
            .expect("range_req_data slot is empty (taken)")
    }
    #[inline]
    pub(crate) fn get_composite_req_data(&self, idx: usize) -> &CompositeAggReqData {
        self.per_request.composite_req_data[idx]
            .as_deref()
            .expect("composite_req_data slot is empty (taken)")
    }

    // ---------- mutable getters ----------

@@ -181,6 +194,25 @@ impl AggregationsSegmentCtx {
        debug_assert!(self.per_request.filter_req_data[idx].is_none());
        self.per_request.filter_req_data[idx] = Some(value);
    }

    /// Move out the Composite request at `idx`.
    #[inline]
    pub(crate) fn take_composite_req_data(&mut self, idx: usize) -> Box<CompositeAggReqData> {
        self.per_request.composite_req_data[idx]
            .take()
            .expect("composite_req_data slot is empty (taken)")
    }

    /// Put back a Composite request into an empty slot at `idx`.
    #[inline]
    pub(crate) fn put_back_composite_req_data(
        &mut self,
        idx: usize,
        value: Box<CompositeAggReqData>,
    ) {
        debug_assert!(self.per_request.composite_req_data[idx].is_none());
        self.per_request.composite_req_data[idx] = Some(value);
    }
}
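The `take_*`/`put_back_*` pair above is a slot pattern: per-aggregation request data lives in `Vec<Option<Box<T>>>`, and a collector temporarily moves its entry out while it needs simultaneous mutable access to both the data and the rest of the context. A stripped-down, self-contained sketch of the idea (these names are illustrative, not the tantivy types):

```rust
struct ReqData {
    name: String,
}

struct Ctx {
    // One slot per request; `None` means the data is currently checked out.
    slots: Vec<Option<Box<ReqData>>>,
    scratch: Vec<u64>,
}

impl Ctx {
    fn take(&mut self, idx: usize) -> Box<ReqData> {
        self.slots[idx].take().expect("slot is empty (taken)")
    }

    fn put_back(&mut self, idx: usize, value: Box<ReqData>) {
        debug_assert!(self.slots[idx].is_none());
        self.slots[idx] = Some(value);
    }

    fn collect(&mut self, idx: usize) {
        // Moving the boxed data out releases the borrow on `self.slots`,
        // so other parts of the context can be mutated while it is in use.
        let data = self.take(idx);
        self.scratch.push(data.name.len() as u64);
        self.put_back(idx, data);
    }
}
```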
/// Each type of aggregation has its own request data struct. This struct holds

@@ -208,6 +240,8 @@ pub struct PerRequestAggSegCtx {
    pub top_hits_req_data: Vec<TopHitsAggReqData>,
    /// MissingTermAggReqData contains the request data for a missing term aggregation.
    pub missing_term_req_data: Vec<MissingTermAggReqData>,
    /// CompositeAggReqData contains the request data for a composite aggregation.
    pub composite_req_data: Vec<Option<Box<CompositeAggReqData>>>,

    /// Request tree used to build collectors.
    pub agg_tree: Vec<AggRefNode>,

@@ -255,6 +289,11 @@ impl PerRequestAggSegCtx {
            .iter()
            .map(|t| t.get_memory_consumption())
            .sum::<usize>()
            + self
                .composite_req_data
                .iter()
                .map(|b| b.as_ref().map(|d| d.get_memory_consumption()).unwrap_or(0))
                .sum::<usize>()
            + self.agg_tree.len() * std::mem::size_of::<AggRefNode>()
    }

@@ -291,6 +330,11 @@ impl PerRequestAggSegCtx {
                .expect("filter_req_data slot is empty (taken)")
                .name
                .as_str(),
            AggKind::Composite => self.composite_req_data[idx]
                .as_deref()
                .expect("composite_req_data slot is empty (taken)")
                .name
                .as_str(),
        }
    }

@@ -417,6 +461,11 @@ pub(crate) fn build_segment_agg_collector(
        )?)),
        AggKind::Range => Ok(build_segment_range_collector(req, node)?),
        AggKind::Filter => build_segment_filter_collector(req, node),
        AggKind::Composite => Ok(Box::new(
            crate::aggregation::bucket::SegmentCompositeCollector::from_req_and_validate(
                req, node,
            )?,
        )),
    }
}

@@ -447,6 +496,7 @@ pub enum AggKind {
    DateHistogram,
    Range,
    Filter,
    Composite,
}

impl AggKind {

@@ -462,6 +512,7 @@ impl AggKind {
            AggKind::DateHistogram => "DateHistogram",
            AggKind::Range => "Range",
            AggKind::Filter => "Filter",
            AggKind::Composite => "Composite",
        }
    }
}

@@ -469,7 +520,7 @@ impl AggKind {
/// Build AggregationsData by walking the request tree.
pub(crate) fn build_aggregations_data_from_req(
    aggs: &Aggregations,
    reader: &dyn SegmentReader,
    reader: &SegmentReader,
    segment_ordinal: SegmentOrdinal,
    context: AggContextParams,
) -> crate::Result<AggregationsSegmentCtx> {

@@ -489,7 +540,7 @@ pub(crate) fn build_aggregations_data_from_req(
fn build_nodes(
    agg_name: &str,
    req: &Aggregation,
    reader: &dyn SegmentReader,
    reader: &SegmentReader,
    segment_ordinal: SegmentOrdinal,
    data: &mut AggregationsSegmentCtx,
    is_top_level: bool,

@@ -709,6 +760,14 @@ fn build_nodes(
            children,
        }])
        }
        AggregationVariants::Composite(composite_req) => Ok(vec![build_composite_node(
            agg_name,
            reader,
            segment_ordinal,
            data,
            &req.sub_aggregation,
            composite_req,
        )?]),
        AggregationVariants::Filter(filter_req) => {
            // Build the query and evaluator upfront
            let schema = reader.schema();

@@ -728,7 +787,7 @@ fn build_nodes(
            let idx_in_req_data = data.push_filter_req_data(FilterAggReqData {
                name: agg_name.to_string(),
                req: filter_req.clone(),
                segment_reader: reader.clone_arc(),
                segment_reader: reader.clone(),
                evaluator,
                matching_docs_buffer,
                is_top_level,

@@ -743,9 +802,38 @@
    }
}

fn build_composite_node(
    agg_name: &str,
    reader: &SegmentReader,
    _segment_ordinal: SegmentOrdinal,
    data: &mut AggregationsSegmentCtx,
    sub_aggs: &Aggregations,
    req: &CompositeAggregation,
) -> crate::Result<AggRefNode> {
    let mut composite_accessors = Vec::with_capacity(req.sources.len());
    for source in &req.sources {
        let source_after_key_opt = req.after.get(source.name()).map(|k| &k.0);
        let source_accessor =
            CompositeSourceAccessors::build_for_source(reader, source, source_after_key_opt)?;
        composite_accessors.push(source_accessor);
    }
    let agg = CompositeAggReqData {
        name: agg_name.to_string(),
        req: req.clone(),
        composite_accessors,
    };
    let idx = data.push_composite_req_data(agg);
    let children = build_children(sub_aggs, reader, _segment_ordinal, data)?;
    Ok(AggRefNode {
        kind: AggKind::Composite,
        idx_in_req_data: idx,
        children,
    })
}

fn build_children(
    aggs: &Aggregations,
    reader: &dyn SegmentReader,
    reader: &SegmentReader,
    segment_ordinal: SegmentOrdinal,
    data: &mut AggregationsSegmentCtx,
) -> crate::Result<Vec<AggRefNode>> {

@@ -764,7 +852,7 @@ fn build_children(
}

fn get_term_agg_accessors(
    reader: &dyn SegmentReader,
    reader: &SegmentReader,
    field_name: &str,
    missing: &Option<Key>,
) -> crate::Result<Vec<(Column<u64>, ColumnType)>> {

@@ -817,7 +905,7 @@ fn build_terms_or_cardinality_nodes(
    agg_name: &str,
    field_name: &str,
    missing: &Option<Key>,
    reader: &dyn SegmentReader,
    reader: &SegmentReader,
    segment_ordinal: SegmentOrdinal,
    data: &mut AggregationsSegmentCtx,
    sub_aggs: &Aggregations,
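`build_composite_node` walks `req.sources` in order, builds one `CompositeSourceAccessors` per source, and then recurses into the sub-aggregations. Concretely, the kind of request it handles groups several sources under one `composite` block; a sketch in the same `json!` style as the benchmarks (field names are the benchmark's, the particular combination is illustrative only):

```rust
use serde_json::json;

// Illustrative request: one terms source and one date_histogram source,
// paginated 500 buckets at a time, with an avg sub-aggregation per bucket.
let agg_req = json!({
    "by_status_and_month": {
        "composite": {
            "sources": [
                { "status": { "terms": { "field": "text_few_terms_status" } } },
                { "month": { "date_histogram": { "field": "timestamp", "calendar_interval": "month" } } }
            ],
            "size": 500
        },
        "aggs": {
            "average_f64": { "avg": { "field": "score_f64" } }
        }
    }
});
```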
@@ -32,8 +32,8 @@ use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};

use super::bucket::{
    DateHistogramAggregationReq, FilterAggregation, HistogramAggregation, RangeAggregation,
    TermsAggregation,
    CompositeAggregation, DateHistogramAggregationReq, FilterAggregation, HistogramAggregation,
    RangeAggregation, TermsAggregation,
};
use super::metric::{
    AverageAggregation, CardinalityAggregationReq, CountAggregation, ExtendedStatsAggregation,

@@ -134,6 +134,9 @@ pub enum AggregationVariants {
    /// Filter documents into a single bucket.
    #[serde(rename = "filter")]
    Filter(FilterAggregation),
    /// Multi-dimensional, paginable bucket aggregation.
    #[serde(rename = "composite")]
    Composite(CompositeAggregation),

    // Metric aggregation types
    /// Computes the average of the extracted values.

@@ -180,6 +183,11 @@ impl AggregationVariants {
            AggregationVariants::Histogram(histogram) => vec![histogram.field.as_str()],
            AggregationVariants::DateHistogram(histogram) => vec![histogram.field.as_str()],
            AggregationVariants::Filter(filter) => filter.get_fast_field_names(),
            AggregationVariants::Composite(composite) => composite
                .sources
                .iter()
                .map(|source| source.field())
                .collect(),
            AggregationVariants::Average(avg) => vec![avg.field_name()],
            AggregationVariants::Count(count) => vec![count.field_name()],
            AggregationVariants::Max(max) => vec![max.field_name()],

@@ -214,6 +222,12 @@ impl AggregationVariants {
            _ => None,
        }
    }
    pub(crate) fn as_composite(&self) -> Option<&CompositeAggregation> {
        match &self {
            AggregationVariants::Composite(composite) => Some(composite),
            _ => None,
        }
    }
    pub(crate) fn as_percentile(&self) -> Option<&PercentilesAggregationReq> {
        match &self {
            AggregationVariants::Percentiles(percentile_req) => Some(percentile_req),

@@ -9,10 +9,12 @@ use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};

use super::bucket::GetDocCount;
use super::intermediate_agg_result::CompositeIntermediateKey;
use super::metric::{
    ExtendedStats, PercentilesMetricResult, SingleMetricResult, Stats, TopHitsMetricResult,
};
use super::{AggregationError, Key};
use crate::aggregation::bucket::AfterKey;
use crate::TantivyError;

#[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)]

@@ -158,6 +160,14 @@ pub enum BucketResult {
    },
    /// This is the filter result - a single bucket with sub-aggregations
    Filter(FilterBucketResult),
    /// This is the composite result
    Composite {
        /// The buckets
        buckets: Vec<CompositeBucketEntry>,
        /// The key to start after when paginating
        #[serde(skip_serializing_if = "FxHashMap::is_empty")]
        after_key: FxHashMap<String, AfterKey>,
    },
}

impl BucketResult {

@@ -179,6 +189,9 @@ impl BucketResult {
            // Only count sub-aggregation buckets
            filter_result.sub_aggregations.get_bucket_count()
        }
        BucketResult::Composite { buckets, .. } => {
            buckets.iter().map(|bucket| bucket.get_bucket_count()).sum()
        }
        }
    }
}

@@ -337,3 +350,87 @@ pub struct FilterBucketResult {
    #[serde(flatten)]
    pub sub_aggregations: AggregationResults,
}
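The `after_key` in `BucketResult::Composite` is what makes the aggregation paginable: a page of at most `size` buckets is returned together with the composite key of the last bucket, and the client feeds that key back as `after` in the next request. A sketch of the round trip in the same `json!` style as the benchmarks (the response excerpt shows the expected shape, not captured output, and the concrete key value is illustrative):

```rust
use serde_json::json;

// Page 1: first 1000 buckets for the `text_many_terms` source.
let page_1 = json!({
    "my_pages": {
        "composite": {
            "sources": [
                { "text_many_terms": { "terms": { "field": "text_many_terms" } } }
            ],
            "size": 1000
        }
    }
});

// Expected result shape: buckets plus the key to resume from, e.g.
// { "my_pages": { "buckets": [ ... ], "after_key": { "text_many_terms": "term_0999" } } }

// Page 2: same request, resuming after the last returned key.
let page_2 = json!({
    "my_pages": {
        "composite": {
            "sources": [
                { "text_many_terms": { "terms": { "field": "text_many_terms" } } }
            ],
            "size": 1000,
            "after": { "text_many_terms": "term_0999" }
        }
    }
});
```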
/// Note the type information loss compared to `CompositeIntermediateKey`.
/// Pagination is performed using `AfterKey`, which encodes type information.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(untagged)]
pub enum CompositeKey {
    /// Boolean key
    Bool(bool),
    /// String key
    Str(String),
    /// `i64` key
    I64(i64),
    /// `u64` key
    U64(u64),
    /// `f64` key
    F64(f64),
    /// Null key
    Null,
}
impl Eq for CompositeKey {}
impl std::hash::Hash for CompositeKey {
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        core::mem::discriminant(self).hash(state);
        match self {
            Self::Bool(val) => val.hash(state),
            Self::Str(text) => text.hash(state),
            Self::F64(val) => val.to_bits().hash(state),
            Self::U64(val) => val.hash(state),
            Self::I64(val) => val.hash(state),
            Self::Null => {}
        }
    }
}
impl PartialEq for CompositeKey {
    fn eq(&self, other: &Self) -> bool {
        match (self, other) {
            (Self::Bool(l), Self::Bool(r)) => l == r,
            (Self::Str(l), Self::Str(r)) => l == r,
            (Self::F64(l), Self::F64(r)) => l.to_bits() == r.to_bits(),
            (Self::I64(l), Self::I64(r)) => l == r,
            (Self::U64(l), Self::U64(r)) => l == r,
            (Self::Null, Self::Null) => true,
            _ => false,
        }
    }
}
impl From<CompositeIntermediateKey> for CompositeKey {
    fn from(value: CompositeIntermediateKey) -> Self {
        match value {
            CompositeIntermediateKey::Str(s) => Self::Str(s),
            CompositeIntermediateKey::IpAddr(s) => {
                if let Some(ip) = s.to_ipv4_mapped() {
                    Self::Str(ip.to_string())
                } else {
                    Self::Str(s.to_string())
                }
            }
            CompositeIntermediateKey::F64(f) => Self::F64(f),
            CompositeIntermediateKey::Bool(f) => Self::Bool(f),
            CompositeIntermediateKey::U64(f) => Self::U64(f),
            CompositeIntermediateKey::I64(f) => Self::I64(f),
            CompositeIntermediateKey::DateTime(f) => Self::I64(f / 1_000_000), // ns to ms
            CompositeIntermediateKey::Null => Self::Null,
        }
    }
}

/// Composite bucket entry with a multi-dimensional key.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct CompositeBucketEntry {
    /// The identifier of the bucket.
    pub key: FxHashMap<String, CompositeKey>,
    /// Number of documents in the bucket.
    pub doc_count: u64,
    #[serde(flatten)]
    /// Sub-aggregations in this bucket.
    pub sub_aggregation: AggregationResults,
}

impl CompositeBucketEntry {
    pub(crate) fn get_bucket_count(&self) -> u64 {
        1 + self.sub_aggregation.get_bucket_count()
    }
}
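`CompositeKey` cannot derive `Eq`/`Hash` because of the `f64` variant, so both are implemented over `f64::to_bits`, which preserves the `Hash`/`Eq` contract (bit-equal floats compare and hash identically) at the cost of treating `0.0` and `-0.0` as different keys. A self-contained illustration of that trade-off:

```rust
use std::collections::HashMap;

// Same idea as CompositeKey::F64: compare and hash the raw bit pattern.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
struct BitF64(u64);

impl From<f64> for BitF64 {
    fn from(v: f64) -> Self {
        BitF64(v.to_bits())
    }
}

fn main() {
    let mut counts: HashMap<BitF64, u64> = HashMap::new();
    *counts.entry(BitF64::from(1.5)).or_insert(0) += 1;
    *counts.entry(BitF64::from(1.5)).or_insert(0) += 1;
    // Bit-equal values land in the same bucket...
    assert_eq!(counts[&BitF64::from(1.5)], 2);
    // ...but 0.0 and -0.0 (different bit patterns) are distinct keys.
    *counts.entry(BitF64::from(0.0)).or_insert(0) += 1;
    *counts.entry(BitF64::from(-0.0)).or_insert(0) += 1;
    assert_eq!(counts.len(), 3);
}
```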
src/aggregation/bucket/composite/accessors.rs (new file, 548 lines)
@@ -0,0 +1,548 @@
use std::net::Ipv6Addr;

use columnar::column_values::CompactSpaceU64Accessor;
use columnar::{Column, ColumnType, MonotonicallyMappableToU64, StrColumn, TermOrdHit};

use crate::aggregation::accessor_helpers::get_numeric_or_date_column_types;
use crate::aggregation::bucket::composite::numeric_types::num_proj;
use crate::aggregation::bucket::composite::numeric_types::num_proj::ProjectedNumber;
use crate::aggregation::bucket::composite::ToTypePaginationOrder;
use crate::aggregation::bucket::{
    parse_into_milliseconds, CalendarInterval, CompositeAggregation, CompositeAggregationSource,
    MissingOrder, Order,
};
use crate::aggregation::intermediate_agg_result::CompositeIntermediateKey;
use crate::{SegmentReader, TantivyError};

/// Contains all information required by the SegmentCompositeCollector to perform the
/// composite aggregation on a segment.
pub struct CompositeAggReqData {
    /// The name of the aggregation.
    pub name: String,
    /// The normalized term aggregation request.
    pub req: CompositeAggregation,
    /// Accessors for each source, each source can have multiple accessors (columns).
    pub composite_accessors: Vec<CompositeSourceAccessors>,
}

impl CompositeAggReqData {
    /// Estimate the memory consumption of this struct in bytes.
    pub fn get_memory_consumption(&self) -> usize {
        std::mem::size_of::<Self>()
            + self.composite_accessors.len() * std::mem::size_of::<CompositeSourceAccessors>()
    }
}

/// Accessors for a single column in a composite source.
pub struct CompositeAccessor {
    /// The fast field column
    pub column: Column<u64>,
    /// The column type
    pub column_type: ColumnType,
    /// Term dictionary if the column type is Str
    ///
    /// Only used by term sources
    pub str_dict_column: Option<StrColumn>,
    /// Parsed date interval for date histogram sources
    pub date_histogram_interval: PrecomputedDateInterval,
}

/// Accessors to all the columns that belong to the field of a composite source.
pub struct CompositeSourceAccessors {
    /// The accessors for this source
    pub accessors: Vec<CompositeAccessor>,
    /// The key after which to start collecting results. Applies to the first
    /// column of the source.
    pub after_key: PrecomputedAfterKey,

    /// The column index the after_key applies to. The after_key only applies to
    /// one column. Columns before should be skipped. Columns after should be
    /// kept without comparison to the after_key.
    pub after_key_accessor_idx: usize,

    /// Whether to skip missing values because of the after_key. Skipping only
    /// applies if the value for previous columns were exactly equal to the
    /// corresponding after keys (is_on_after_key).
    pub skip_missing: bool,

    /// The after key was set to null to indicate that the last collected key
    /// was a missing value.
    pub is_after_key_explicit_missing: bool,
}

impl CompositeSourceAccessors {
    /// Creates a new set of accessors for the composite source.
    ///
    /// Precomputes some values to make collection faster.
    pub fn build_for_source(
        reader: &SegmentReader,
        source: &CompositeAggregationSource,
        // First option is None when no after key was set in the query, the
        // second option is None when the after key was set but its value for
        // this source was set to `null`
        source_after_key_opt: Option<&CompositeIntermediateKey>,
    ) -> crate::Result<Self> {
        let is_after_key_explicit_missing = source_after_key_opt
            .map(|after_key| matches!(after_key, CompositeIntermediateKey::Null))
            .unwrap_or(false);
        let mut skip_missing = false;
        if let Some(CompositeIntermediateKey::Null) = source_after_key_opt {
            if !source.missing_bucket() {
                return Err(TantivyError::InvalidArgument(
                    "the 'after' key for a source cannot be null when 'missing_bucket' is false"
                        .to_string(),
                ));
            }
        } else if source_after_key_opt.is_some() {
            // if missing buckets come first and we have a non null after key, we skip missing
            if MissingOrder::First == source.missing_order() {
                skip_missing = true;
            }
            if MissingOrder::Default == source.missing_order() && Order::Asc == source.order() {
                skip_missing = true;
            }
        };

        match source {
            CompositeAggregationSource::Terms(source) => {
                let allowed_column_types = [
                    ColumnType::I64,
                    ColumnType::U64,
                    ColumnType::F64,
                    ColumnType::Str,
                    ColumnType::DateTime,
                    ColumnType::Bool,
                    ColumnType::IpAddr,
                    // ColumnType::Bytes Unsupported
                ];
                let mut columns_and_types = reader
                    .fast_fields()
                    .u64_lenient_for_type_all(Some(&allowed_column_types), &source.field)?;

                // Sort columns by their pagination order and determine which to skip
                columns_and_types.sort_by_key(|(_, col_type): &(Column, ColumnType)| {
                    col_type.column_pagination_order()
                });
                if source.order == Order::Desc {
                    columns_and_types.reverse();
                }
                let after_key_accessor_idx = find_first_column_to_collect(
                    &columns_and_types,
                    source_after_key_opt,
                    source.missing_order,
                    source.order,
                )?;

                let source_collectors: Vec<CompositeAccessor> = columns_and_types
                    .into_iter()
                    .map(|(column, column_type)| {
                        Ok(CompositeAccessor {
                            column,
                            column_type,
                            str_dict_column: reader.fast_fields().str(&source.field)?,
                            date_histogram_interval: PrecomputedDateInterval::NotApplicable,
                        })
                    })
                    .collect::<crate::Result<_>>()?;

                let after_key = if let Some(first_col) =
                    source_collectors.get(after_key_accessor_idx)
                {
                    match source_after_key_opt {
                        Some(after_key) => PrecomputedAfterKey::precompute(
                            &first_col,
                            after_key,
                            &source.field,
                            source.missing_order,
                            source.order,
                        )?,
                        None => {
                            precompute_missing_after_key(false, source.missing_order, source.order)
                        }
                    }
                } else {
                    // if no columns, we don't care about the after_key
                    PrecomputedAfterKey::Next(0)
                };

                Ok(CompositeSourceAccessors {
                    accessors: source_collectors,
                    is_after_key_explicit_missing,
                    skip_missing,
                    after_key,
                    after_key_accessor_idx,
                })
            }
            CompositeAggregationSource::Histogram(source) => {
                let column_and_types: Vec<(Column, ColumnType)> =
                    reader.fast_fields().u64_lenient_for_type_all(
                        Some(get_numeric_or_date_column_types()),
                        &source.field,
                    )?;
                let source_collectors: Vec<CompositeAccessor> = column_and_types
                    .into_iter()
                    .map(|(column, column_type)| {
                        Ok(CompositeAccessor {
                            column,
                            column_type,
                            str_dict_column: None,
                            date_histogram_interval: PrecomputedDateInterval::NotApplicable,
                        })
                    })
                    .collect::<crate::Result<_>>()?;
                let after_key = match source_after_key_opt {
                    Some(CompositeIntermediateKey::F64(key)) => {
                        let normalized_key = *key / source.interval;
                        num_proj::f64_to_i64(normalized_key).into()
                    }
                    Some(CompositeIntermediateKey::Null) => {
                        precompute_missing_after_key(true, source.missing_order, source.order)
                    }
                    None => precompute_missing_after_key(true, source.missing_order, source.order),
                    _ => {
                        return Err(crate::TantivyError::InvalidArgument(
                            "After key type invalid for interval composite source".to_string(),
                        ));
                    }
                };
                Ok(CompositeSourceAccessors {
                    accessors: source_collectors,
                    is_after_key_explicit_missing,
                    skip_missing,
                    after_key,
                    after_key_accessor_idx: 0,
                })
            }
            CompositeAggregationSource::DateHistogram(source) => {
                let column_and_types = reader
                    .fast_fields()
                    .u64_lenient_for_type_all(Some(&[ColumnType::DateTime]), &source.field)?;
                let date_histogram_interval =
                    PrecomputedDateInterval::from_date_histogram_source_intervals(
                        &source.fixed_interval,
                        source.calendar_interval,
                    )?;
                let source_collectors: Vec<CompositeAccessor> = column_and_types
                    .into_iter()
                    .map(|(column, column_type)| {
                        Ok(CompositeAccessor {
                            column,
                            column_type,
                            str_dict_column: None,
                            date_histogram_interval,
                        })
                    })
                    .collect::<crate::Result<_>>()?;
                let after_key = match source_after_key_opt {
                    Some(CompositeIntermediateKey::DateTime(key)) => {
                        PrecomputedAfterKey::Exact(key.to_u64())
                    }
                    Some(CompositeIntermediateKey::Null) => {
                        precompute_missing_after_key(true, source.missing_order, source.order)
                    }
                    None => precompute_missing_after_key(true, source.missing_order, source.order),
                    _ => {
                        return Err(crate::TantivyError::InvalidArgument(
                            "After key type invalid for interval composite source".to_string(),
                        ));
                    }
                };
                Ok(CompositeSourceAccessors {
                    accessors: source_collectors,
                    is_after_key_explicit_missing,
                    skip_missing,
                    after_key,
                    after_key_accessor_idx: 0,
                })
            }
        }
    }
}
/// Finds the index of the first column we should start collecting from to
/// resume the pagination from the after_key.
fn find_first_column_to_collect<T>(
    sorted_columns: &[(T, ColumnType)],
    after_key_opt: Option<&CompositeIntermediateKey>,
    missing_order: MissingOrder,
    order: Order,
) -> crate::Result<usize> {
    let after_key = match after_key_opt {
        None => return Ok(0), // No pagination, start from beginning
        Some(key) => key,
    };
    // Handle null after_key (we were on a missing value last time)
    if matches!(after_key, CompositeIntermediateKey::Null) {
        return match (missing_order, order) {
            // Missing values come first, so all columns remain
            (MissingOrder::First, _) | (MissingOrder::Default, Order::Asc) => Ok(0),
            // Missing values come last, so all columns are done
            (MissingOrder::Last, _) | (MissingOrder::Default, Order::Desc) => {
                Ok(sorted_columns.len())
            }
        };
    }
    // Find the first column whose type order matches or follows the after_key's
    // type in the pagination sequence
    let after_key_column_order = after_key.column_pagination_order();
    for (idx, (_, col_type)) in sorted_columns.iter().enumerate() {
        let col_order = col_type.column_pagination_order();
        let is_first_to_collect = match order {
            Order::Asc => col_order >= after_key_column_order,
            Order::Desc => col_order <= after_key_column_order,
        };
        if is_first_to_collect {
            return Ok(idx);
        }
    }
    // All columns are before the after_key, nothing left to collect
    Ok(sorted_columns.len())
}

fn precompute_missing_after_key(
    is_after_key_explicit_missing: bool,
    missing_order: MissingOrder,
    order: Order,
) -> PrecomputedAfterKey {
    let after_last = PrecomputedAfterKey::AfterLast;
    let before_first = PrecomputedAfterKey::Next(0);
    match (is_after_key_explicit_missing, missing_order, order) {
        (true, MissingOrder::First, Order::Asc) => before_first,
        (true, MissingOrder::First, Order::Desc) => after_last,
        (true, MissingOrder::Last, Order::Asc) => after_last,
        (true, MissingOrder::Last, Order::Desc) => before_first,
        (true, MissingOrder::Default, Order::Asc) => before_first,
        (true, MissingOrder::Default, Order::Desc) => after_last,
        (false, _, Order::Asc) => before_first,
        (false, _, Order::Desc) => after_last,
    }
}
/// A parsed representation of the date interval for date histogram sources
#[derive(Clone, Copy, Debug)]
pub enum PrecomputedDateInterval {
    /// This is not a date histogram source
    NotApplicable,
    /// Source was configured with a fixed interval
    FixedNanoseconds(i64),
    /// Source was configured with a calendar interval
    Calendar(CalendarInterval),
}

impl PrecomputedDateInterval {
    /// Validates the date histogram source interval fields and parses a date interval from them.
    pub fn from_date_histogram_source_intervals(
        fixed_interval: &Option<String>,
        calendar_interval: Option<CalendarInterval>,
    ) -> crate::Result<Self> {
        match (fixed_interval, calendar_interval) {
            (Some(_), Some(_)) | (None, None) => Err(TantivyError::InvalidArgument(
                "date histogram source must have one and only one of fixed_interval or \
                 calendar_interval set"
                    .to_string(),
            )),
            (Some(fixed_interval), None) => {
                let fixed_interval_ms = parse_into_milliseconds(&fixed_interval)?;
                Ok(PrecomputedDateInterval::FixedNanoseconds(
                    fixed_interval_ms * 1_000_000,
                ))
            }
            (None, Some(calendar_interval)) => {
                Ok(PrecomputedDateInterval::Calendar(calendar_interval))
            }
        }
    }
}

/// The after key projected to the u64 column space
///
/// Some column types (term, IP) might not have an exact representation of the
/// specified after key
#[derive(Debug)]
pub enum PrecomputedAfterKey {
    /// The after key could be exactly represented in the column space.
    Exact(u64),
    /// The after key could not be exactly represented, so this is the next
    /// closest one.
    Next(u64),
    /// The after key could not be represented in the column space, it is
    /// greater than all values
    AfterLast,
}

impl From<TermOrdHit> for PrecomputedAfterKey {
    fn from(hit: TermOrdHit) -> Self {
        match hit {
            TermOrdHit::Exact(ord) => PrecomputedAfterKey::Exact(ord),
            // TermOrdHit represents AfterLast as Next(u64::MAX), we keep it as is
            TermOrdHit::Next(ord) => PrecomputedAfterKey::Next(ord),
        }
    }
}

impl<T: MonotonicallyMappableToU64> From<ProjectedNumber<T>> for PrecomputedAfterKey {
    fn from(num: ProjectedNumber<T>) -> Self {
        match num {
            ProjectedNumber::Exact(number) => PrecomputedAfterKey::Exact(number.to_u64()),
            ProjectedNumber::Next(number) => PrecomputedAfterKey::Next(number.to_u64()),
            ProjectedNumber::AfterLast => PrecomputedAfterKey::AfterLast,
        }
    }
}

// /!\ These operators only make sense if both values are in the same column space
impl PrecomputedAfterKey {
    pub fn equals(&self, column_value: u64) -> bool {
        match self {
            PrecomputedAfterKey::Exact(v) => *v == column_value,
            PrecomputedAfterKey::Next(_) => false,
            PrecomputedAfterKey::AfterLast => false,
        }
    }

    pub fn gt(&self, column_value: u64) -> bool {
        match self {
            PrecomputedAfterKey::Exact(v) => *v > column_value,
            PrecomputedAfterKey::Next(v) => *v > column_value,
            PrecomputedAfterKey::AfterLast => true,
        }
    }

    pub fn lt(&self, column_value: u64) -> bool {
        match self {
            PrecomputedAfterKey::Exact(v) => *v < column_value,
            // a value equal to the next is greater than the after key
            PrecomputedAfterKey::Next(v) => *v <= column_value,
            PrecomputedAfterKey::AfterLast => false,
        }
    }
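The comparison helpers above encode how a not-exactly-representable after key behaves: `Next(v)` means "the after key falls just before `v`", so `v` itself already counts as being past the key. A self-contained restatement of those semantics (a local enum mirroring the `match` arms above, not the private tantivy type):

```rust
#[derive(Debug)]
enum AfterKey {
    Exact(u64),
    Next(u64),
    AfterLast,
}

impl AfterKey {
    // True when `column_value` is strictly after the key in pagination order.
    fn lt(&self, column_value: u64) -> bool {
        match self {
            AfterKey::Exact(v) => *v < column_value,
            // The key sits just before `v`, so `v` itself is already past it.
            AfterKey::Next(v) => *v <= column_value,
            AfterKey::AfterLast => false,
        }
    }
}

fn main() {
    // Exact(5): only values after 5 are past the key.
    assert!(!AfterKey::Exact(5).lt(5));
    assert!(AfterKey::Exact(5).lt(6));
    // Next(5): the key was projected to "just before 5", so 5 is already past it.
    assert!(AfterKey::Next(5).lt(5));
    // AfterLast: nothing in the column comes after the key.
    assert!(!AfterKey::AfterLast.lt(u64::MAX));
}
```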
    fn precompute_ip_addr(column: &Column<u64>, key: &Ipv6Addr) -> crate::Result<Self> {
        // For IP addresses we need to find the compact space value.
        // We try to convert via the column's min/max range scan.
        // Since CompactSpaceU64Accessor::u128_to_compact is not public,
        // we search linearly for the exact u64 value by scanning column values.
        let ip_u128 = key.to_bits();

        // Scan for matching value - IP columns are typically small
        let num_vals = column.values.num_vals();
        let mut found_exact = false;
        let mut exact_compact = 0u64;
        let mut best_next: Option<u64> = None;

        for doc_id in 0..num_vals {
            let val = column.values.get_val(doc_id);
            // We need the CompactSpaceU64Accessor to convert compact to u128
            let compact_accessor = column
                .values
                .clone()
                .downcast_arc::<CompactSpaceU64Accessor>()
                .map_err(|_| {
                    TantivyError::AggregationError(
                        crate::aggregation::AggregationError::InternalError(
                            "type mismatch: could not downcast to CompactSpaceU64Accessor"
                                .to_string(),
                        ),
                    )
                })?;
            let val_u128 = compact_accessor.compact_to_u128(val as u32);
            if val_u128 == ip_u128 {
                found_exact = true;
                exact_compact = val;
                break;
            } else if val_u128 > ip_u128 {
                match best_next {
                    None => best_next = Some(val),
                    Some(current_best) => {
                        let current_u128 = compact_accessor.compact_to_u128(current_best as u32);
                        if val_u128 < current_u128 {
                            best_next = Some(val);
                        }
                    }
                }
            }
        }

        if found_exact {
            Ok(PrecomputedAfterKey::Exact(exact_compact))
        } else if let Some(next) = best_next {
            Ok(PrecomputedAfterKey::Next(next))
        } else {
            Ok(PrecomputedAfterKey::AfterLast)
        }
    }

    fn precompute_term_ord(
        str_dict_column: &Option<StrColumn>,
        key: &str,
        field: &str,
    ) -> crate::Result<Self> {
        let dict = str_dict_column
            .as_ref()
            .expect("dictionary missing for str accessor")
            .dictionary();
        let next_ord = dict.term_ord_or_next(key).map_err(|_| {
            TantivyError::InvalidArgument(format!(
                "failed to lookup after_key '{}' for field '{}'",
                key, field
            ))
        })?;
        Ok(next_ord.into())
    }

    /// Projects the after key into the column space of the given accessor.
    ///
    /// The computed after key will not take care of skipping entire columns
    /// when the after key type is ordered after the accessor's type, that
    /// should be performed earlier.
    pub fn precompute(
        composite_accessor: &CompositeAccessor,
        source_after_key: &CompositeIntermediateKey,
        field: &str,
        missing_order: MissingOrder,
        order: Order,
    ) -> crate::Result<Self> {
        use CompositeIntermediateKey as CIKey;
        let precomputed_key = match (composite_accessor.column_type, source_after_key) {
            (ColumnType::Bytes, _) => panic!("unsupported"),
            // null after key
            (_, CIKey::Null) => precompute_missing_after_key(false, missing_order, order),
            // numerical
            (ColumnType::I64, CIKey::I64(k)) => PrecomputedAfterKey::Exact(k.to_u64()),
            (ColumnType::I64, CIKey::U64(k)) => num_proj::u64_to_i64(*k).into(),
            (ColumnType::I64, CIKey::F64(k)) => num_proj::f64_to_i64(*k).into(),
            (ColumnType::U64, CIKey::I64(k)) => num_proj::i64_to_u64(*k).into(),
            (ColumnType::U64, CIKey::U64(k)) => PrecomputedAfterKey::Exact(*k),
            (ColumnType::U64, CIKey::F64(k)) => num_proj::f64_to_u64(*k).into(),
            (ColumnType::F64, CIKey::I64(k)) => num_proj::i64_to_f64(*k).into(),
            (ColumnType::F64, CIKey::U64(k)) => num_proj::u64_to_f64(*k).into(),
            (ColumnType::F64, CIKey::F64(k)) => PrecomputedAfterKey::Exact(k.to_u64()),
            // boolean
            (ColumnType::Bool, CIKey::Bool(key)) => PrecomputedAfterKey::Exact(key.to_u64()),
            // string
            (ColumnType::Str, CIKey::Str(key)) => PrecomputedAfterKey::precompute_term_ord(
                &composite_accessor.str_dict_column,
                key,
                field,
            )?,
            // date time
            (ColumnType::DateTime, CIKey::DateTime(key)) => {
                PrecomputedAfterKey::Exact(key.to_u64())
            }
            // ip address
            (ColumnType::IpAddr, CIKey::IpAddr(key)) => {
                PrecomputedAfterKey::precompute_ip_addr(&composite_accessor.column, key)?
            }
            // assume the column's type is ordered after the after_key's type
            _ => PrecomputedAfterKey::keep_all(order),
        };
        Ok(precomputed_key)
    }

    fn keep_all(order: Order) -> Self {
        match order {
            Order::Asc => PrecomputedAfterKey::Next(0),
            Order::Desc => PrecomputedAfterKey::Next(u64::MAX),
        }
    }
}
src/aggregation/bucket/composite/calendar_interval.rs (new file, 140 lines)
@@ -0,0 +1,140 @@
use time::convert::{Day, Nanosecond};
|
||||
use time::{Time, UtcDateTime};
|
||||
|
||||
const NS_IN_DAY: i64 = Nanosecond::per_t::<i128>(Day) as i64;
|
||||
|
||||
/// Computes the timestamp in nanoseconds corresponding to the beginning of the
|
||||
/// year (January 1st at midnight UTC).
|
||||
pub(super) fn try_year_bucket(timestamp_ns: i64) -> crate::Result<i64> {
|
||||
year_bucket_using_time_crate(timestamp_ns).map_err(|e| {
|
||||
crate::TantivyError::InvalidArgument(format!(
|
||||
"Failed to compute year bucket for timestamp {}: {}",
|
||||
timestamp_ns,
|
||||
e.to_string()
|
||||
))
|
||||
})
|
||||
}
|
||||
|
||||
/// Computes the timestamp in nanoseconds corresponding to the beginning of the
|
||||
/// month (1st at midnight UTC).
|
||||
pub(super) fn try_month_bucket(timestamp_ns: i64) -> crate::Result<i64> {
|
||||
month_bucket_using_time_crate(timestamp_ns).map_err(|e| {
|
||||
crate::TantivyError::InvalidArgument(format!(
|
||||
"Failed to compute month bucket for timestamp {}: {}",
|
||||
timestamp_ns,
|
||||
e.to_string()
|
||||
))
|
||||
})
|
||||
}
|
||||
|
||||
/// Computes the timestamp in nanoseconds corresponding to the beginning of the
|
||||
/// week (Monday at midnight UTC).
|
||||
pub(super) fn week_bucket(timestamp_ns: i64) -> i64 {
|
||||
// 1970-01-01 was a Thursday (weekday = 4)
|
||||
let days_since_epoch = timestamp_ns.div_euclid(NS_IN_DAY);
|
||||
// Find the weekday: 0=Monday, ..., 6=Sunday
|
||||
let weekday = (days_since_epoch + 3).rem_euclid(7);
|
||||
let monday_days_since_epoch = days_since_epoch - weekday;
|
||||
monday_days_since_epoch * NS_IN_DAY
|
||||
}
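// Worked example of the weekday arithmetic above (illustrative only):
// 1970-01-01 is a Thursday, so days_since_epoch = 0 and
// weekday = (0 + 3).rem_euclid(7) = 3 (0 = Monday, ..., 3 = Thursday).
// The containing week therefore starts 3 days earlier, on Monday 1969-12-29,
// which is why `week_bucket` can return a negative timestamp.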
|
||||
|
||||
fn year_bucket_using_time_crate(timestamp_ns: i64) -> Result<i64, time::Error> {
|
||||
let timestamp_ns = UtcDateTime::from_unix_timestamp_nanos(timestamp_ns as i128)?
|
||||
.replace_ordinal(1)?
|
||||
.replace_time(Time::MIDNIGHT)
|
||||
.unix_timestamp_nanos();
|
||||
Ok(timestamp_ns as i64)
|
||||
}
|
||||
|
||||
fn month_bucket_using_time_crate(timestamp_ns: i64) -> Result<i64, time::Error> {
|
||||
let timestamp_ns = UtcDateTime::from_unix_timestamp_nanos(timestamp_ns as i128)?
|
||||
.replace_day(1)?
|
||||
.replace_time(Time::MIDNIGHT)
|
||||
.unix_timestamp_nanos();
|
||||
Ok(timestamp_ns as i64)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::i64;
|
||||
|
||||
use time::format_description::well_known::Iso8601;
|
||||
use time::UtcDateTime;
|
||||
|
||||
use super::*;
|
||||
|
||||
fn ts_ns(iso: &str) -> i64 {
|
||||
UtcDateTime::parse(iso, &Iso8601::DEFAULT)
|
||||
.unwrap()
|
||||
.unix_timestamp_nanos() as i64
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_year_bucket() {
|
||||
let ts = ts_ns("1970-01-01T00:00:00Z");
|
||||
let res = try_year_bucket(ts).unwrap();
|
||||
assert_eq!(res, ts_ns("1970-01-01T00:00:00Z"));
|
||||
|
||||
let ts = ts_ns("1970-06-01T10:00:01.010Z");
|
||||
let res = try_year_bucket(ts).unwrap();
|
||||
assert_eq!(res, ts_ns("1970-01-01T00:00:00Z"));
|
||||
|
||||
let ts = ts_ns("2008-12-31T23:59:59.999999999Z"); // leap year
|
||||
let res = try_year_bucket(ts).unwrap();
|
||||
assert_eq!(res, ts_ns("2008-01-01T00:00:00Z"));
|
||||
|
||||
let ts = ts_ns("2008-01-01T00:00:00Z"); // leap year
|
||||
let res = try_year_bucket(ts).unwrap();
|
||||
assert_eq!(res, ts_ns("2008-01-01T00:00:00Z"));
|
||||
|
||||
let ts = ts_ns("2010-12-31T23:59:59.999999999Z");
|
||||
let res = try_year_bucket(ts).unwrap();
|
||||
assert_eq!(res, ts_ns("2010-01-01T00:00:00Z"));
|
||||
|
||||
let ts = ts_ns("1972-06-01T00:10:00Z");
|
||||
let res = try_year_bucket(ts).unwrap();
|
||||
assert_eq!(res, ts_ns("1972-01-01T00:00:00Z"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_month_bucket() {
|
||||
let ts = ts_ns("1970-01-15T00:00:00Z");
|
||||
let res = try_month_bucket(ts).unwrap();
|
||||
assert_eq!(res, ts_ns("1970-01-01T00:00:00Z"));
|
||||
|
||||
let ts = ts_ns("1970-02-01T00:00:00Z");
|
||||
let res = try_month_bucket(ts).unwrap();
|
||||
assert_eq!(res, ts_ns("1970-02-01T00:00:00Z"));
|
||||
|
||||
let ts = ts_ns("2000-01-31T23:59:59.999999999Z");
|
||||
let res = try_month_bucket(ts).unwrap();
|
||||
assert_eq!(res, ts_ns("2000-01-01T00:00:00Z"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_week_bucket() {
|
||||
let ts = ts_ns("1970-01-05T00:00:00Z"); // Monday
|
||||
let res = week_bucket(ts);
|
||||
assert_eq!(res, ts_ns("1970-01-05T00:00:00Z"));
|
||||
|
||||
let ts = ts_ns("1970-01-05T23:59:59Z"); // Monday
|
||||
let res = week_bucket(ts);
|
||||
assert_eq!(res, ts_ns("1970-01-05T00:00:00Z"));
|
||||
|
||||
let ts = ts_ns("1970-01-07T01:13:00Z"); // Wednesday
|
||||
let res = week_bucket(ts);
|
||||
assert_eq!(res, ts_ns("1970-01-05T00:00:00Z"));
|
||||
|
||||
let ts = ts_ns("1970-01-11T23:59:59.999999999Z"); // Sunday
|
||||
let res = week_bucket(ts);
|
||||
assert_eq!(res, ts_ns("1970-01-05T00:00:00Z"));
|
||||
|
||||
let ts = ts_ns("2025-10-16T10:41:59.010Z"); // Thursday
|
||||
let res = week_bucket(ts);
|
||||
assert_eq!(res, ts_ns("2025-10-13T00:00:00Z"));
|
||||
|
||||
let ts = ts_ns("1970-01-01T00:00:00Z"); // Thursday
|
||||
let res = week_bucket(ts);
|
||||
assert_eq!(res, ts_ns("1969-12-29T00:00:00Z")); // Negative
|
||||
}
|
||||
}
|
||||
src/aggregation/bucket/composite/collector.rs (new file, 674 lines)
@@ -0,0 +1,674 @@
|
||||
use std::fmt::Debug;
|
||||
use std::mem;
|
||||
use std::net::Ipv6Addr;
|
||||
|
||||
use columnar::column_values::CompactSpaceU64Accessor;
|
||||
use columnar::{
|
||||
Column, ColumnType, Dictionary, MonotonicallyMappableToU128, MonotonicallyMappableToU64,
|
||||
NumericalValue, StrColumn,
|
||||
};
|
||||
use rustc_hash::FxHashMap;
|
||||
use smallvec::SmallVec;
|
||||
|
||||
use crate::aggregation::agg_data::{
|
||||
build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx,
|
||||
};
|
||||
use crate::aggregation::bucket::composite::accessors::{
|
||||
CompositeAccessor, CompositeAggReqData, PrecomputedDateInterval,
|
||||
};
|
||||
use crate::aggregation::bucket::composite::calendar_interval;
|
||||
use crate::aggregation::bucket::composite::map::{DynArrayHeapMap, MAX_DYN_ARRAY_SIZE};
|
||||
use crate::aggregation::bucket::{
|
||||
CalendarInterval, CompositeAggregationSource, MissingOrder, Order,
|
||||
};
|
||||
use crate::aggregation::cached_sub_aggs::{CachedSubAggs, HighCardSubAggCache};
|
||||
use crate::aggregation::intermediate_agg_result::{
|
||||
CompositeIntermediateKey, IntermediateAggregationResult, IntermediateAggregationResults,
|
||||
IntermediateBucketResult, IntermediateCompositeBucketEntry, IntermediateCompositeBucketResult,
|
||||
};
|
||||
use crate::aggregation::segment_agg_result::{BucketIdProvider, SegmentAggregationCollector};
|
||||
use crate::aggregation::BucketId;
|
||||
use crate::TantivyError;
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
struct CompositeBucketCollector {
|
||||
count: u32,
|
||||
bucket_id: BucketId,
|
||||
}
|
||||
|
||||
/// Compact sortable representation of a single source value within a composite key.
|
||||
///
|
||||
/// The struct encodes both the column identity and the fast field value in a way
|
||||
/// that preserves the desired sort order via the derived `Ord` implementation
|
||||
/// (fields are compared top-to-bottom: `sort_key` first, then `encoded_value`).
|
||||
///
|
||||
/// ## `sort_key` encoding
|
||||
/// - `0` — missing value, sorted first
|
||||
/// - `1..=254` — present value; the original accessor index is `sort_key - 1`
|
||||
/// - `u8::MAX` (255) — missing value, sorted last
|
||||
///
|
||||
/// ## `encoded_value` encoding
|
||||
/// - `0` when the field is missing
|
||||
/// - The raw u64 fast-field representation when order is ascending
|
||||
/// - Bitwise NOT of the raw u64 when order is descending
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Default, Hash)]
|
||||
struct InternalValueRepr {
|
||||
/// Column index biased by +1 (so 0 and u8::MAX are reserved for missing sentinels).
|
||||
sort_key: u8,
|
||||
/// Fast field value, possibly bit-flipped for descending order.
|
||||
encoded_value: u64,
|
||||
}
|
||||
|
||||
impl InternalValueRepr {
|
||||
#[inline]
|
||||
fn new_term(raw: u64, accessor_idx: u8, order: Order) -> Self {
|
||||
let encoded_value = match order {
|
||||
Order::Asc => raw,
|
||||
Order::Desc => !raw,
|
||||
};
|
||||
InternalValueRepr {
|
||||
sort_key: accessor_idx + 1,
|
||||
encoded_value,
|
||||
}
|
||||
}
|
||||
|
||||
/// For histogram sources the column index is irrelevant, so `sort_key` is always 1.
|
||||
#[inline]
|
||||
fn new_histogram(raw: u64, order: Order) -> Self {
|
||||
let encoded_value = match order {
|
||||
Order::Asc => raw,
|
||||
Order::Desc => !raw,
|
||||
};
|
||||
InternalValueRepr {
|
||||
sort_key: 1,
|
||||
encoded_value,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn new_missing(order: Order, missing_order: MissingOrder) -> Self {
|
||||
let sort_key = match (missing_order, order) {
|
||||
(MissingOrder::First, _) | (MissingOrder::Default, Order::Asc) => 0,
|
||||
(MissingOrder::Last, _) | (MissingOrder::Default, Order::Desc) => u8::MAX,
|
||||
};
|
||||
InternalValueRepr {
|
||||
sort_key,
|
||||
encoded_value: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Decode back to `(accessor_idx, raw_value)`.
|
||||
/// Returns `None` when the value represents a missing field.
|
||||
#[inline]
|
||||
fn decode(self, order: Order) -> Option<(u8, u64)> {
|
||||
if self.sort_key == 0 || self.sort_key == u8::MAX {
|
||||
return None;
|
||||
}
|
||||
let raw = match order {
|
||||
Order::Asc => self.encoded_value,
|
||||
Order::Desc => !self.encoded_value,
|
||||
};
|
||||
Some((self.sort_key - 1, raw))
|
||||
}
|
||||
}
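// A minimal sketch (not part of the original change) showing how the derived
// `Ord` on `InternalValueRepr` yields the ordering documented above: the
// missing-first sentinel sorts before any present value, and descending order
// is obtained by bit-flipping the raw fast field value.
#[cfg(test)]
mod internal_value_repr_order_sketch {
    use super::*;

    #[test]
    fn sort_order_follows_the_documented_encoding() {
        // Missing-first sentinel (sort_key = 0) sorts before a present value.
        let missing = InternalValueRepr::new_missing(Order::Asc, MissingOrder::First);
        let present = InternalValueRepr::new_term(0, 0, Order::Asc);
        assert!(missing < present);

        // In descending order the raw value is bit-flipped, so the larger raw
        // value compares as the smaller key.
        let low = InternalValueRepr::new_term(1, 0, Order::Desc);
        let high = InternalValueRepr::new_term(2, 0, Order::Desc);
        assert!(high < low);
    }
}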
|
||||
|
||||
/// The collector puts values from the fast field into the correct buckets and
|
||||
/// does a conversion to the correct datatype.
|
||||
#[derive(Debug)]
|
||||
pub struct SegmentCompositeCollector {
|
||||
/// One DynArrayHeapMap per parent bucket.
|
||||
parent_buckets: Vec<DynArrayHeapMap<InternalValueRepr, CompositeBucketCollector>>,
|
||||
accessor_idx: usize,
|
||||
sub_agg: Option<CachedSubAggs<HighCardSubAggCache>>,
|
||||
bucket_id_provider: BucketIdProvider,
|
||||
/// Number of sources, needed when creating new DynArrayHeapMaps.
|
||||
num_sources: usize,
|
||||
}
|
||||
|
||||
impl SegmentAggregationCollector for SegmentCompositeCollector {
|
||||
fn add_intermediate_aggregation_result(
|
||||
&mut self,
|
||||
agg_data: &AggregationsSegmentCtx,
|
||||
results: &mut IntermediateAggregationResults,
|
||||
parent_bucket_id: BucketId,
|
||||
) -> crate::Result<()> {
|
||||
let name = agg_data
|
||||
.get_composite_req_data(self.accessor_idx)
|
||||
.name
|
||||
.clone();
|
||||
|
||||
let buckets = self.into_intermediate_bucket_result(agg_data, parent_bucket_id)?;
|
||||
results.push(
|
||||
name,
|
||||
IntermediateAggregationResult::Bucket(IntermediateBucketResult::Composite { buckets }),
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect(
|
||||
&mut self,
|
||||
parent_bucket_id: BucketId,
|
||||
docs: &[crate::DocId],
|
||||
agg_data: &mut AggregationsSegmentCtx,
|
||||
) -> crate::Result<()> {
|
||||
let mem_pre = self.get_memory_consumption();
|
||||
let composite_agg_data = agg_data.take_composite_req_data(self.accessor_idx);
|
||||
|
||||
for doc in docs {
|
||||
let mut sub_level_values = SmallVec::new();
|
||||
recursive_key_visitor(
|
||||
*doc,
|
||||
&composite_agg_data,
|
||||
0,
|
||||
&mut sub_level_values,
|
||||
&mut self.parent_buckets[parent_bucket_id as usize],
|
||||
true,
|
||||
&mut self.sub_agg,
|
||||
&mut self.bucket_id_provider,
|
||||
)?;
|
||||
}
|
||||
agg_data.put_back_composite_req_data(self.accessor_idx, composite_agg_data);
|
||||
|
||||
if let Some(sub_agg) = &mut self.sub_agg {
|
||||
sub_agg.check_flush_local(agg_data)?;
|
||||
}
|
||||
|
||||
let mem_delta = self.get_memory_consumption() - mem_pre;
|
||||
if mem_delta > 0 {
|
||||
agg_data.context.limits.add_memory_consumed(mem_delta)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
|
||||
if let Some(sub_agg) = &mut self.sub_agg {
|
||||
sub_agg.flush(agg_data)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn prepare_max_bucket(
|
||||
&mut self,
|
||||
max_bucket: BucketId,
|
||||
_agg_data: &AggregationsSegmentCtx,
|
||||
) -> crate::Result<()> {
|
||||
let required_len = max_bucket as usize + 1;
|
||||
while self.parent_buckets.len() < required_len {
|
||||
let map = DynArrayHeapMap::try_new(self.num_sources)?;
|
||||
self.parent_buckets.push(map);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentCompositeCollector {
|
||||
fn get_memory_consumption(&self) -> u64 {
|
||||
self.parent_buckets
|
||||
.iter()
|
||||
.map(|m| m.memory_consumption())
|
||||
.sum()
|
||||
}
|
||||
|
||||
pub(crate) fn from_req_and_validate(
|
||||
req_data: &mut AggregationsSegmentCtx,
|
||||
node: &AggRefNode,
|
||||
) -> crate::Result<Self> {
|
||||
validate_req(req_data, node.idx_in_req_data)?;
|
||||
|
||||
let has_sub_aggregations = !node.children.is_empty();
|
||||
let sub_agg = if has_sub_aggregations {
|
||||
let sub_agg_collector = build_segment_agg_collectors(req_data, &node.children)?;
|
||||
Some(CachedSubAggs::new(sub_agg_collector))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let composite_req_data = req_data.get_composite_req_data(node.idx_in_req_data);
|
||||
let num_sources = composite_req_data.req.sources.len();
|
||||
|
||||
Ok(SegmentCompositeCollector {
|
||||
parent_buckets: vec![DynArrayHeapMap::try_new(num_sources)?],
|
||||
accessor_idx: node.idx_in_req_data,
|
||||
sub_agg,
|
||||
bucket_id_provider: BucketIdProvider::default(),
|
||||
num_sources,
|
||||
})
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn into_intermediate_bucket_result(
|
||||
&mut self,
|
||||
agg_data: &AggregationsSegmentCtx,
|
||||
parent_bucket_id: BucketId,
|
||||
) -> crate::Result<IntermediateCompositeBucketResult> {
|
||||
let empty_map = DynArrayHeapMap::try_new(self.num_sources)?;
|
||||
let heap_map = mem::replace(
|
||||
&mut self.parent_buckets[parent_bucket_id as usize],
|
||||
empty_map,
|
||||
);
|
||||
|
||||
let mut dict: FxHashMap<Vec<CompositeIntermediateKey>, IntermediateCompositeBucketEntry> =
|
||||
Default::default();
|
||||
dict.reserve(heap_map.size());
|
||||
let composite_data = agg_data.get_composite_req_data(self.accessor_idx);
|
||||
for (key_internal_repr, agg) in heap_map.into_iter() {
|
||||
let key = resolve_key(&key_internal_repr, composite_data)?;
|
||||
let mut sub_aggregation_res = IntermediateAggregationResults::default();
|
||||
if let Some(sub_agg) = &mut self.sub_agg {
|
||||
sub_agg
|
||||
.get_sub_agg_collector()
|
||||
.add_intermediate_aggregation_result(
|
||||
agg_data,
|
||||
&mut sub_aggregation_res,
|
||||
agg.bucket_id,
|
||||
)?;
|
||||
}
|
||||
|
||||
dict.insert(
|
||||
key,
|
||||
IntermediateCompositeBucketEntry {
|
||||
doc_count: agg.count,
|
||||
sub_aggregation: sub_aggregation_res,
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
Ok(IntermediateCompositeBucketResult {
|
||||
entries: dict,
|
||||
target_size: composite_data.req.size,
|
||||
orders: composite_data
|
||||
.req
|
||||
.sources
|
||||
.iter()
|
||||
.map(|source| match source {
|
||||
CompositeAggregationSource::Terms(t) => (t.order, t.missing_order),
|
||||
CompositeAggregationSource::Histogram(h) => (h.order, h.missing_order),
|
||||
CompositeAggregationSource::DateHistogram(d) => (d.order, d.missing_order),
|
||||
})
|
||||
.collect(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn validate_req(req_data: &mut AggregationsSegmentCtx, accessor_idx: usize) -> crate::Result<()> {
|
||||
let composite_data = req_data.get_composite_req_data(accessor_idx);
|
||||
let req = &composite_data.req;
|
||||
if req.sources.is_empty() {
|
||||
return Err(TantivyError::InvalidArgument(
|
||||
"composite aggregation must have at least one source".to_string(),
|
||||
));
|
||||
}
|
||||
if req.size == 0 {
|
||||
return Err(TantivyError::InvalidArgument(
|
||||
"composite aggregation 'size' must be > 0".to_string(),
|
||||
));
|
||||
}
|
||||
let column_types_for_sources = composite_data.composite_accessors.iter().map(|item| {
|
||||
item.accessors
|
||||
.iter()
|
||||
.map(|a| a.column_type)
|
||||
.collect::<Vec<_>>()
|
||||
});
|
||||
|
||||
for column_types in column_types_for_sources {
|
||||
if column_types.len() > MAX_DYN_ARRAY_SIZE {
|
||||
return Err(TantivyError::InvalidArgument(format!(
|
||||
"composite aggregation source supports maximum {MAX_DYN_ARRAY_SIZE} sources",
|
||||
)));
|
||||
}
|
||||
if column_types.contains(&ColumnType::Bytes) {
|
||||
return Err(TantivyError::InvalidArgument(
|
||||
"composite aggregation does not support 'bytes' field type".to_string(),
|
||||
));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect_bucket_with_limit(
|
||||
doc_id: crate::DocId,
|
||||
limit_num_buckets: usize,
|
||||
buckets: &mut DynArrayHeapMap<InternalValueRepr, CompositeBucketCollector>,
|
||||
key: &[InternalValueRepr],
|
||||
sub_agg: &mut Option<CachedSubAggs<HighCardSubAggCache>>,
|
||||
bucket_id_provider: &mut BucketIdProvider,
|
||||
) {
|
||||
let mut record_in_bucket = |bucket: &mut CompositeBucketCollector| {
|
||||
bucket.count += 1;
|
||||
if let Some(sub_agg) = sub_agg {
|
||||
sub_agg.push(bucket.bucket_id, doc_id);
|
||||
}
|
||||
};
|
||||
|
||||
// We still have room for buckets, just insert
|
||||
if buckets.size() < limit_num_buckets {
|
||||
let bucket = buckets.get_or_insert_with(key, || CompositeBucketCollector {
|
||||
count: 0,
|
||||
bucket_id: bucket_id_provider.next_bucket_id(),
|
||||
});
|
||||
record_in_bucket(bucket);
|
||||
return;
|
||||
}
|
||||
|
||||
// Map is full, but we can still update the bucket if it already exists
|
||||
if let Some(bucket) = buckets.get_mut(key) {
|
||||
record_in_bucket(bucket);
|
||||
return;
|
||||
}
|
||||
|
||||
// Check if the item qualifies to enter the top-k, and evict the highest if it does
|
||||
if let Some(highest_key) = buckets.peek_highest() {
|
||||
if key < highest_key {
|
||||
buckets.evict_highest();
|
||||
let bucket = buckets.get_or_insert_with(key, || CompositeBucketCollector {
|
||||
count: 0,
|
||||
bucket_id: bucket_id_provider.next_bucket_id(),
|
||||
});
|
||||
record_in_bucket(bucket);
|
||||
}
|
||||
}
|
||||
}
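// Worked trace of the top-k behaviour above (illustrative): with
// `limit_num_buckets = 2`, inserting keys C, A, D, B in that order keeps
// {C, A} after the first two inserts, ignores D (D is not smaller than the
// current highest key C), then admits B by evicting C (B < C), leaving the
// two smallest keys {A, B} together with their doc counts.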
|
||||
|
||||
/// Converts the composite key from its internal column space representation
|
||||
/// (segment specific) into its intermediate form.
|
||||
fn resolve_key(
|
||||
internal_key: &[InternalValueRepr],
|
||||
agg_data: &CompositeAggReqData,
|
||||
) -> crate::Result<Vec<CompositeIntermediateKey>> {
|
||||
internal_key
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(idx, val)| {
|
||||
resolve_internal_value_repr(
|
||||
*val,
|
||||
&agg_data.req.sources[idx],
|
||||
&agg_data.composite_accessors[idx].accessors,
|
||||
)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn resolve_internal_value_repr(
|
||||
internal_value_repr: InternalValueRepr,
|
||||
source: &CompositeAggregationSource,
|
||||
composite_accessors: &[CompositeAccessor],
|
||||
) -> crate::Result<CompositeIntermediateKey> {
|
||||
let decoded_value_opt = match source {
|
||||
CompositeAggregationSource::Terms(source) => internal_value_repr.decode(source.order),
|
||||
CompositeAggregationSource::Histogram(source) => internal_value_repr.decode(source.order),
|
||||
CompositeAggregationSource::DateHistogram(source) => {
|
||||
internal_value_repr.decode(source.order)
|
||||
}
|
||||
};
|
||||
let Some((decoded_accessor_idx, val)) = decoded_value_opt else {
|
||||
return Ok(CompositeIntermediateKey::Null);
|
||||
};
|
||||
let key = match source {
|
||||
CompositeAggregationSource::Terms(_) => {
|
||||
let CompositeAccessor {
|
||||
column_type,
|
||||
str_dict_column,
|
||||
column,
|
||||
..
|
||||
} = &composite_accessors[decoded_accessor_idx as usize];
|
||||
resolve_term(val, column_type, str_dict_column, column)?
|
||||
}
|
||||
CompositeAggregationSource::Histogram(source) => {
|
||||
CompositeIntermediateKey::F64(i64::from_u64(val) as f64 * source.interval)
|
||||
}
|
||||
CompositeAggregationSource::DateHistogram(_) => {
|
||||
CompositeIntermediateKey::DateTime(i64::from_u64(val))
|
||||
}
|
||||
};
|
||||
|
||||
Ok(key)
|
||||
}
|
||||
|
||||
fn resolve_term(
|
||||
val: u64,
|
||||
column_type: &ColumnType,
|
||||
str_dict_column: &Option<StrColumn>,
|
||||
column: &Column,
|
||||
) -> crate::Result<CompositeIntermediateKey> {
|
||||
let key = if *column_type == ColumnType::Str {
|
||||
let fallback_dict = Dictionary::empty();
|
||||
let term_dict = str_dict_column
|
||||
.as_ref()
|
||||
.map(|el| el.dictionary())
|
||||
.unwrap_or_else(|| &fallback_dict);
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
term_dict.ord_to_term(val, &mut buffer)?;
|
||||
CompositeIntermediateKey::Str(
|
||||
String::from_utf8(buffer.to_vec()).expect("could not convert to String"),
|
||||
)
|
||||
} else if *column_type == ColumnType::DateTime {
|
||||
let val = i64::from_u64(val);
|
||||
CompositeIntermediateKey::DateTime(val)
|
||||
} else if *column_type == ColumnType::Bool {
|
||||
let val = bool::from_u64(val);
|
||||
CompositeIntermediateKey::Bool(val)
|
||||
} else if *column_type == ColumnType::IpAddr {
|
||||
let compact_space_accessor = column
|
||||
.values
|
||||
.clone()
|
||||
.downcast_arc::<CompactSpaceU64Accessor>()
|
||||
.map_err(|_| {
|
||||
TantivyError::AggregationError(crate::aggregation::AggregationError::InternalError(
|
||||
"Type mismatch: Could not downcast to CompactSpaceU64Accessor".to_string(),
|
||||
))
|
||||
})?;
|
||||
let val: u128 = compact_space_accessor.compact_to_u128(val as u32);
|
||||
let val = Ipv6Addr::from_u128(val);
|
||||
CompositeIntermediateKey::IpAddr(val)
|
||||
} else if *column_type == ColumnType::U64 {
|
||||
CompositeIntermediateKey::U64(val)
|
||||
} else if *column_type == ColumnType::I64 {
|
||||
CompositeIntermediateKey::I64(i64::from_u64(val))
|
||||
} else {
|
||||
let val = f64::from_u64(val);
|
||||
let val: NumericalValue = val.into();
|
||||
|
||||
match val.normalize() {
|
||||
NumericalValue::U64(val) => CompositeIntermediateKey::U64(val),
|
||||
NumericalValue::I64(val) => CompositeIntermediateKey::I64(val),
|
||||
NumericalValue::F64(val) => CompositeIntermediateKey::F64(val),
|
||||
}
|
||||
};
|
||||
Ok(key)
|
||||
}
|
||||
|
||||
/// Depth-first walk of the accessors to build the composite key combinations
|
||||
/// and update the buckets.
|
||||
fn recursive_key_visitor(
|
||||
doc_id: crate::DocId,
|
||||
composite_agg_data: &CompositeAggReqData,
|
||||
source_idx_for_recursion: usize,
|
||||
sub_level_values: &mut SmallVec<[InternalValueRepr; MAX_DYN_ARRAY_SIZE]>,
|
||||
buckets: &mut DynArrayHeapMap<InternalValueRepr, CompositeBucketCollector>,
|
||||
// whether we need to consider the after_key in the following levels
|
||||
is_on_after_key: bool,
|
||||
sub_agg: &mut Option<CachedSubAggs<HighCardSubAggCache>>,
|
||||
bucket_id_provider: &mut BucketIdProvider,
|
||||
) -> crate::Result<()> {
|
||||
if source_idx_for_recursion == composite_agg_data.req.sources.len() {
|
||||
if !is_on_after_key {
|
||||
collect_bucket_with_limit(
|
||||
doc_id,
|
||||
composite_agg_data.req.size as usize,
|
||||
buckets,
|
||||
sub_level_values,
|
||||
sub_agg,
|
||||
bucket_id_provider,
|
||||
);
|
||||
}
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let current_level_accessors = &composite_agg_data.composite_accessors[source_idx_for_recursion];
|
||||
let current_level_source = &composite_agg_data.req.sources[source_idx_for_recursion];
|
||||
let mut missing = true;
|
||||
for (accessor_idx, accessor) in current_level_accessors.accessors.iter().enumerate() {
|
||||
let values = accessor.column.values_for_doc(doc_id);
|
||||
for value in values {
|
||||
missing = false;
|
||||
match current_level_source {
|
||||
CompositeAggregationSource::Terms(_) => {
|
||||
let precedes_after_key_type =
accessor_idx < current_level_accessors.after_key_accessor_idx;
if is_on_after_key && precedes_after_key_type {
break;
}
|
||||
let matches_after_key_type =
|
||||
accessor_idx == current_level_accessors.after_key_accessor_idx;
|
||||
|
||||
if matches_after_key_type && is_on_after_key {
|
||||
let should_skip = match current_level_source.order() {
|
||||
Order::Asc => current_level_accessors.after_key.gt(value),
|
||||
Order::Desc => current_level_accessors.after_key.lt(value),
|
||||
};
|
||||
if should_skip {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
sub_level_values.push(InternalValueRepr::new_term(
|
||||
value,
|
||||
accessor_idx as u8,
|
||||
current_level_source.order(),
|
||||
));
|
||||
let still_on_after_key =
|
||||
matches_after_key_type && current_level_accessors.after_key.equals(value);
|
||||
recursive_key_visitor(
|
||||
doc_id,
|
||||
composite_agg_data,
|
||||
source_idx_for_recursion + 1,
|
||||
sub_level_values,
|
||||
buckets,
|
||||
is_on_after_key && still_on_after_key,
|
||||
sub_agg,
|
||||
bucket_id_provider,
|
||||
)?;
|
||||
sub_level_values.pop();
|
||||
}
|
||||
CompositeAggregationSource::Histogram(source) => {
|
||||
let float_value = match accessor.column_type {
|
||||
ColumnType::U64 => value as f64,
|
||||
ColumnType::I64 => i64::from_u64(value) as f64,
|
||||
ColumnType::DateTime => i64::from_u64(value) as f64 / 1_000_000.,
|
||||
ColumnType::F64 => f64::from_u64(value),
|
||||
_ => {
|
||||
panic!(
|
||||
"unexpected type {:?}. This should not happen",
|
||||
accessor.column_type
|
||||
)
|
||||
}
|
||||
};
|
||||
let bucket_index = (float_value / source.interval).floor() as i64;
|
||||
let bucket_value = i64::to_u64(bucket_index);
|
||||
if is_on_after_key {
|
||||
let should_skip = match current_level_source.order() {
|
||||
Order::Asc => current_level_accessors.after_key.gt(bucket_value),
|
||||
Order::Desc => current_level_accessors.after_key.lt(bucket_value),
|
||||
};
|
||||
if should_skip {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
sub_level_values.push(InternalValueRepr::new_histogram(
|
||||
bucket_value,
|
||||
current_level_source.order(),
|
||||
));
|
||||
let still_on_after_key = current_level_accessors.after_key.equals(bucket_value);
|
||||
recursive_key_visitor(
|
||||
doc_id,
|
||||
composite_agg_data,
|
||||
source_idx_for_recursion + 1,
|
||||
sub_level_values,
|
||||
buckets,
|
||||
is_on_after_key && still_on_after_key,
|
||||
sub_agg,
|
||||
bucket_id_provider,
|
||||
)?;
|
||||
sub_level_values.pop();
|
||||
}
|
||||
CompositeAggregationSource::DateHistogram(_) => {
|
||||
let value_ns = match accessor.column_type {
|
||||
ColumnType::DateTime => i64::from_u64(value),
|
||||
_ => {
|
||||
panic!(
|
||||
"unexpected type {:?}. This should not happen",
|
||||
accessor.column_type
|
||||
)
|
||||
}
|
||||
};
|
||||
let bucket_index = match accessor.date_histogram_interval {
|
||||
PrecomputedDateInterval::FixedNanoseconds(fixed_interval_ns) => {
|
||||
(value_ns / fixed_interval_ns) * fixed_interval_ns
|
||||
}
|
||||
PrecomputedDateInterval::Calendar(CalendarInterval::Year) => {
|
||||
calendar_interval::try_year_bucket(value_ns)?
|
||||
}
|
||||
PrecomputedDateInterval::Calendar(CalendarInterval::Month) => {
|
||||
calendar_interval::try_month_bucket(value_ns)?
|
||||
}
|
||||
PrecomputedDateInterval::Calendar(CalendarInterval::Week) => {
|
||||
calendar_interval::week_bucket(value_ns)
|
||||
}
|
||||
PrecomputedDateInterval::NotApplicable => {
|
||||
panic!("interval not precomputed for date histogram source")
|
||||
}
|
||||
};
|
||||
let bucket_value = i64::to_u64(bucket_index);
|
||||
if is_on_after_key {
|
||||
let should_skip = match current_level_source.order() {
|
||||
Order::Asc => current_level_accessors.after_key.gt(bucket_value),
|
||||
Order::Desc => current_level_accessors.after_key.lt(bucket_value),
|
||||
};
|
||||
if should_skip {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
sub_level_values.push(InternalValueRepr::new_histogram(
|
||||
bucket_value,
|
||||
current_level_source.order(),
|
||||
));
|
||||
let still_on_after_key = current_level_accessors.after_key.equals(bucket_value);
|
||||
recursive_key_visitor(
|
||||
doc_id,
|
||||
composite_agg_data,
|
||||
source_idx_for_recursion + 1,
|
||||
sub_level_values,
|
||||
buckets,
|
||||
is_on_after_key && still_on_after_key,
|
||||
sub_agg,
|
||||
bucket_id_provider,
|
||||
)?;
|
||||
sub_level_values.pop();
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
if missing && current_level_source.missing_bucket() {
|
||||
if is_on_after_key && current_level_accessors.skip_missing {
|
||||
return Ok(());
|
||||
}
|
||||
sub_level_values.push(InternalValueRepr::new_missing(
|
||||
current_level_source.order(),
|
||||
current_level_source.missing_order(),
|
||||
));
|
||||
recursive_key_visitor(
|
||||
doc_id,
|
||||
composite_agg_data,
|
||||
source_idx_for_recursion + 1,
|
||||
sub_level_values,
|
||||
buckets,
|
||||
is_on_after_key && current_level_accessors.is_after_key_explicit_missing,
|
||||
sub_agg,
|
||||
bucket_id_provider,
|
||||
)?;
|
||||
sub_level_values.pop();
|
||||
}
|
||||
Ok(())
|
||||
}
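// Example of the key expansion performed above (illustrative): for a document
// whose first source has the values {"a", "b"} and whose second source has the
// values {1, 2}, the depth-first walk emits the four composite keys
// ("a", 1), ("a", 2), ("b", 1) and ("b", 2), and each of them is counted in
// its own bucket via `collect_bucket_with_limit`.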
|
||||
src/aggregation/bucket/composite/map.rs (new file, 329 lines)
@@ -0,0 +1,329 @@
|
||||
use std::collections::BinaryHeap;
|
||||
use std::fmt::Debug;
|
||||
use std::hash::Hash;
|
||||
|
||||
use rustc_hash::FxHashMap;
|
||||
use smallvec::SmallVec;
|
||||
|
||||
use crate::TantivyError;
|
||||
|
||||
/// Map backed by a hash map for fast access and a binary heap to track the
|
||||
/// highest key. The key is an array of fixed size S.
|
||||
#[derive(Clone, Debug)]
|
||||
struct ArrayHeapMap<K: Ord, V, const S: usize> {
|
||||
pub(crate) buckets: FxHashMap<[K; S], V>,
|
||||
pub(crate) heap: BinaryHeap<[K; S]>,
|
||||
}
|
||||
|
||||
impl<K: Ord, V, const S: usize> Default for ArrayHeapMap<K, V, S> {
|
||||
fn default() -> Self {
|
||||
ArrayHeapMap {
|
||||
buckets: FxHashMap::default(),
|
||||
heap: BinaryHeap::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: Eq + Hash + Clone + Ord, V, const S: usize> ArrayHeapMap<K, V, S> {
|
||||
/// Panics if the length of `key` is not S.
|
||||
fn get_or_insert_with<F: FnOnce() -> V>(&mut self, key: &[K], f: F) -> &mut V {
|
||||
let key_array: &[K; S] = key.try_into().expect("Key length mismatch");
|
||||
self.buckets.entry(key_array.clone()).or_insert_with(|| {
|
||||
self.heap.push(key_array.clone());
|
||||
f()
|
||||
})
|
||||
}
|
||||
|
||||
/// Panics if the length of `key` is not S.
|
||||
fn get_mut(&mut self, key: &[K]) -> Option<&mut V> {
|
||||
let key_array: &[K; S] = key.try_into().expect("Key length mismatch");
|
||||
self.buckets.get_mut(key_array)
|
||||
}
|
||||
|
||||
fn peek_highest(&self) -> Option<&[K]> {
|
||||
self.heap.peek().map(|k_array| k_array.as_slice())
|
||||
}
|
||||
|
||||
fn evict_highest(&mut self) {
|
||||
if let Some(highest) = self.heap.pop() {
|
||||
self.buckets.remove(&highest);
|
||||
}
|
||||
}
|
||||
|
||||
fn memory_consumption(&self) -> u64 {
|
||||
let key_size = std::mem::size_of::<[K; S]>();
|
||||
let map_size = (key_size + std::mem::size_of::<V>()) * self.buckets.capacity();
|
||||
let heap_size = key_size * self.heap.capacity();
|
||||
(map_size + heap_size) as u64
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: Copy + Ord + Clone + 'static, V: 'static, const S: usize> ArrayHeapMap<K, V, S> {
|
||||
fn into_iter(self) -> Box<dyn Iterator<Item = (SmallVec<[K; MAX_DYN_ARRAY_SIZE]>, V)>> {
|
||||
Box::new(
|
||||
self.buckets
|
||||
.into_iter()
|
||||
.map(|(k, v)| (SmallVec::from_slice(&k), v)),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) const MAX_DYN_ARRAY_SIZE: usize = 16;
|
||||
const MAX_DYN_ARRAY_SIZE_PLUS_ONE: usize = MAX_DYN_ARRAY_SIZE + 1;
|
||||
|
||||
/// A map optimized for memory footprint, fast access and efficient eviction of
|
||||
/// the highest key.
|
||||
///
|
||||
/// Keys are inlined arrays of size 1 to [MAX_DYN_ARRAY_SIZE], but for a given
/// instance the key size is fixed. This avoids heap allocations for the keys.
|
||||
#[derive(Clone, Debug)]
|
||||
pub(super) struct DynArrayHeapMap<K: Ord, V>(DynArrayHeapMapInner<K, V>);
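// For example (illustrative), a composite aggregation with three sources calls
// `DynArrayHeapMap::try_new(3)`, and every key slice later passed to
// `get_or_insert_with` or `get_mut` must then have length 3.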
|
||||
|
||||
/// Wrapper around ArrayHeapMap to dynamically dispatch on the array size.
|
||||
#[derive(Clone, Debug)]
|
||||
enum DynArrayHeapMapInner<K: Ord, V> {
|
||||
Dim1(ArrayHeapMap<K, V, 1>),
|
||||
Dim2(ArrayHeapMap<K, V, 2>),
|
||||
Dim3(ArrayHeapMap<K, V, 3>),
|
||||
Dim4(ArrayHeapMap<K, V, 4>),
|
||||
Dim5(ArrayHeapMap<K, V, 5>),
|
||||
Dim6(ArrayHeapMap<K, V, 6>),
|
||||
Dim7(ArrayHeapMap<K, V, 7>),
|
||||
Dim8(ArrayHeapMap<K, V, 8>),
|
||||
Dim9(ArrayHeapMap<K, V, 9>),
|
||||
Dim10(ArrayHeapMap<K, V, 10>),
|
||||
Dim11(ArrayHeapMap<K, V, 11>),
|
||||
Dim12(ArrayHeapMap<K, V, 12>),
|
||||
Dim13(ArrayHeapMap<K, V, 13>),
|
||||
Dim14(ArrayHeapMap<K, V, 14>),
|
||||
Dim15(ArrayHeapMap<K, V, 15>),
|
||||
Dim16(ArrayHeapMap<K, V, 16>),
|
||||
}
|
||||
|
||||
impl<K: Ord, V> DynArrayHeapMap<K, V> {
|
||||
/// Creates a new heap map with dynamic array keys of size `key_dimension`.
|
||||
pub(super) fn try_new(key_dimension: usize) -> crate::Result<Self> {
|
||||
let inner = match key_dimension {
|
||||
0 => {
|
||||
return Err(TantivyError::InvalidArgument(
|
||||
"DynArrayHeapMap dimension must be at least 1".to_string(),
|
||||
))
|
||||
}
|
||||
1 => DynArrayHeapMapInner::Dim1(ArrayHeapMap::default()),
|
||||
2 => DynArrayHeapMapInner::Dim2(ArrayHeapMap::default()),
|
||||
3 => DynArrayHeapMapInner::Dim3(ArrayHeapMap::default()),
|
||||
4 => DynArrayHeapMapInner::Dim4(ArrayHeapMap::default()),
|
||||
5 => DynArrayHeapMapInner::Dim5(ArrayHeapMap::default()),
|
||||
6 => DynArrayHeapMapInner::Dim6(ArrayHeapMap::default()),
|
||||
7 => DynArrayHeapMapInner::Dim7(ArrayHeapMap::default()),
|
||||
8 => DynArrayHeapMapInner::Dim8(ArrayHeapMap::default()),
|
||||
9 => DynArrayHeapMapInner::Dim9(ArrayHeapMap::default()),
|
||||
10 => DynArrayHeapMapInner::Dim10(ArrayHeapMap::default()),
|
||||
11 => DynArrayHeapMapInner::Dim11(ArrayHeapMap::default()),
|
||||
12 => DynArrayHeapMapInner::Dim12(ArrayHeapMap::default()),
|
||||
13 => DynArrayHeapMapInner::Dim13(ArrayHeapMap::default()),
|
||||
14 => DynArrayHeapMapInner::Dim14(ArrayHeapMap::default()),
|
||||
15 => DynArrayHeapMapInner::Dim15(ArrayHeapMap::default()),
|
||||
16 => DynArrayHeapMapInner::Dim16(ArrayHeapMap::default()),
|
||||
MAX_DYN_ARRAY_SIZE_PLUS_ONE.. => {
|
||||
return Err(TantivyError::InvalidArgument(format!(
|
||||
"DynArrayHeapMap supports maximum {MAX_DYN_ARRAY_SIZE} dimensions, got \
|
||||
{key_dimension}",
|
||||
)))
|
||||
}
|
||||
};
|
||||
Ok(DynArrayHeapMap(inner))
|
||||
}
|
||||
|
||||
/// Number of elements in the map. This is not the dimension of the keys.
|
||||
pub(super) fn size(&self) -> usize {
|
||||
match &self.0 {
|
||||
DynArrayHeapMapInner::Dim1(map) => map.buckets.len(),
|
||||
DynArrayHeapMapInner::Dim2(map) => map.buckets.len(),
|
||||
DynArrayHeapMapInner::Dim3(map) => map.buckets.len(),
|
||||
DynArrayHeapMapInner::Dim4(map) => map.buckets.len(),
|
||||
DynArrayHeapMapInner::Dim5(map) => map.buckets.len(),
|
||||
DynArrayHeapMapInner::Dim6(map) => map.buckets.len(),
|
||||
DynArrayHeapMapInner::Dim7(map) => map.buckets.len(),
|
||||
DynArrayHeapMapInner::Dim8(map) => map.buckets.len(),
|
||||
DynArrayHeapMapInner::Dim9(map) => map.buckets.len(),
|
||||
DynArrayHeapMapInner::Dim10(map) => map.buckets.len(),
|
||||
DynArrayHeapMapInner::Dim11(map) => map.buckets.len(),
|
||||
DynArrayHeapMapInner::Dim12(map) => map.buckets.len(),
|
||||
DynArrayHeapMapInner::Dim13(map) => map.buckets.len(),
|
||||
DynArrayHeapMapInner::Dim14(map) => map.buckets.len(),
|
||||
DynArrayHeapMapInner::Dim15(map) => map.buckets.len(),
|
||||
DynArrayHeapMapInner::Dim16(map) => map.buckets.len(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: Ord + Hash + Clone, V> DynArrayHeapMap<K, V> {
|
||||
/// Get a mutable reference to the value corresponding to `key` or inserts a new
|
||||
/// value created by calling `f`.
|
||||
///
|
||||
/// Panics if the length of `key` does not match the key dimension of the map.
|
||||
pub(super) fn get_or_insert_with<F: FnOnce() -> V>(&mut self, key: &[K], f: F) -> &mut V {
|
||||
match &mut self.0 {
|
||||
DynArrayHeapMapInner::Dim1(map) => map.get_or_insert_with(key, f),
|
||||
DynArrayHeapMapInner::Dim2(map) => map.get_or_insert_with(key, f),
|
||||
DynArrayHeapMapInner::Dim3(map) => map.get_or_insert_with(key, f),
|
||||
DynArrayHeapMapInner::Dim4(map) => map.get_or_insert_with(key, f),
|
||||
DynArrayHeapMapInner::Dim5(map) => map.get_or_insert_with(key, f),
|
||||
DynArrayHeapMapInner::Dim6(map) => map.get_or_insert_with(key, f),
|
||||
DynArrayHeapMapInner::Dim7(map) => map.get_or_insert_with(key, f),
|
||||
DynArrayHeapMapInner::Dim8(map) => map.get_or_insert_with(key, f),
|
||||
DynArrayHeapMapInner::Dim9(map) => map.get_or_insert_with(key, f),
|
||||
DynArrayHeapMapInner::Dim10(map) => map.get_or_insert_with(key, f),
|
||||
DynArrayHeapMapInner::Dim11(map) => map.get_or_insert_with(key, f),
|
||||
DynArrayHeapMapInner::Dim12(map) => map.get_or_insert_with(key, f),
|
||||
DynArrayHeapMapInner::Dim13(map) => map.get_or_insert_with(key, f),
|
||||
DynArrayHeapMapInner::Dim14(map) => map.get_or_insert_with(key, f),
|
||||
DynArrayHeapMapInner::Dim15(map) => map.get_or_insert_with(key, f),
|
||||
DynArrayHeapMapInner::Dim16(map) => map.get_or_insert_with(key, f),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a mutable reference to the value corresponding to `key`.
|
||||
///
|
||||
/// Panics if the length of `key` does not match the key dimension of the map.
|
||||
pub fn get_mut(&mut self, key: &[K]) -> Option<&mut V> {
|
||||
match &mut self.0 {
|
||||
DynArrayHeapMapInner::Dim1(map) => map.get_mut(key),
|
||||
DynArrayHeapMapInner::Dim2(map) => map.get_mut(key),
|
||||
DynArrayHeapMapInner::Dim3(map) => map.get_mut(key),
|
||||
DynArrayHeapMapInner::Dim4(map) => map.get_mut(key),
|
||||
DynArrayHeapMapInner::Dim5(map) => map.get_mut(key),
|
||||
DynArrayHeapMapInner::Dim6(map) => map.get_mut(key),
|
||||
DynArrayHeapMapInner::Dim7(map) => map.get_mut(key),
|
||||
DynArrayHeapMapInner::Dim8(map) => map.get_mut(key),
|
||||
DynArrayHeapMapInner::Dim9(map) => map.get_mut(key),
|
||||
DynArrayHeapMapInner::Dim10(map) => map.get_mut(key),
|
||||
DynArrayHeapMapInner::Dim11(map) => map.get_mut(key),
|
||||
DynArrayHeapMapInner::Dim12(map) => map.get_mut(key),
|
||||
DynArrayHeapMapInner::Dim13(map) => map.get_mut(key),
|
||||
DynArrayHeapMapInner::Dim14(map) => map.get_mut(key),
|
||||
DynArrayHeapMapInner::Dim15(map) => map.get_mut(key),
|
||||
DynArrayHeapMapInner::Dim16(map) => map.get_mut(key),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a reference to the highest key in the map.
|
||||
pub(super) fn peek_highest(&self) -> Option<&[K]> {
|
||||
match &self.0 {
|
||||
DynArrayHeapMapInner::Dim1(map) => map.peek_highest(),
|
||||
DynArrayHeapMapInner::Dim2(map) => map.peek_highest(),
|
||||
DynArrayHeapMapInner::Dim3(map) => map.peek_highest(),
|
||||
DynArrayHeapMapInner::Dim4(map) => map.peek_highest(),
|
||||
DynArrayHeapMapInner::Dim5(map) => map.peek_highest(),
|
||||
DynArrayHeapMapInner::Dim6(map) => map.peek_highest(),
|
||||
DynArrayHeapMapInner::Dim7(map) => map.peek_highest(),
|
||||
DynArrayHeapMapInner::Dim8(map) => map.peek_highest(),
|
||||
DynArrayHeapMapInner::Dim9(map) => map.peek_highest(),
|
||||
DynArrayHeapMapInner::Dim10(map) => map.peek_highest(),
|
||||
DynArrayHeapMapInner::Dim11(map) => map.peek_highest(),
|
||||
DynArrayHeapMapInner::Dim12(map) => map.peek_highest(),
|
||||
DynArrayHeapMapInner::Dim13(map) => map.peek_highest(),
|
||||
DynArrayHeapMapInner::Dim14(map) => map.peek_highest(),
|
||||
DynArrayHeapMapInner::Dim15(map) => map.peek_highest(),
|
||||
DynArrayHeapMapInner::Dim16(map) => map.peek_highest(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Removes the entry with the highest key from the map.
|
||||
pub(super) fn evict_highest(&mut self) {
|
||||
match &mut self.0 {
|
||||
DynArrayHeapMapInner::Dim1(map) => map.evict_highest(),
|
||||
DynArrayHeapMapInner::Dim2(map) => map.evict_highest(),
|
||||
DynArrayHeapMapInner::Dim3(map) => map.evict_highest(),
|
||||
DynArrayHeapMapInner::Dim4(map) => map.evict_highest(),
|
||||
DynArrayHeapMapInner::Dim5(map) => map.evict_highest(),
|
||||
DynArrayHeapMapInner::Dim6(map) => map.evict_highest(),
|
||||
DynArrayHeapMapInner::Dim7(map) => map.evict_highest(),
|
||||
DynArrayHeapMapInner::Dim8(map) => map.evict_highest(),
|
||||
DynArrayHeapMapInner::Dim9(map) => map.evict_highest(),
|
||||
DynArrayHeapMapInner::Dim10(map) => map.evict_highest(),
|
||||
DynArrayHeapMapInner::Dim11(map) => map.evict_highest(),
|
||||
DynArrayHeapMapInner::Dim12(map) => map.evict_highest(),
|
||||
DynArrayHeapMapInner::Dim13(map) => map.evict_highest(),
|
||||
DynArrayHeapMapInner::Dim14(map) => map.evict_highest(),
|
||||
DynArrayHeapMapInner::Dim15(map) => map.evict_highest(),
|
||||
DynArrayHeapMapInner::Dim16(map) => map.evict_highest(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn memory_consumption(&self) -> u64 {
|
||||
match &self.0 {
|
||||
DynArrayHeapMapInner::Dim1(map) => map.memory_consumption(),
|
||||
DynArrayHeapMapInner::Dim2(map) => map.memory_consumption(),
|
||||
DynArrayHeapMapInner::Dim3(map) => map.memory_consumption(),
|
||||
DynArrayHeapMapInner::Dim4(map) => map.memory_consumption(),
|
||||
DynArrayHeapMapInner::Dim5(map) => map.memory_consumption(),
|
||||
DynArrayHeapMapInner::Dim6(map) => map.memory_consumption(),
|
||||
DynArrayHeapMapInner::Dim7(map) => map.memory_consumption(),
|
||||
DynArrayHeapMapInner::Dim8(map) => map.memory_consumption(),
|
||||
DynArrayHeapMapInner::Dim9(map) => map.memory_consumption(),
|
||||
DynArrayHeapMapInner::Dim10(map) => map.memory_consumption(),
|
||||
DynArrayHeapMapInner::Dim11(map) => map.memory_consumption(),
|
||||
DynArrayHeapMapInner::Dim12(map) => map.memory_consumption(),
|
||||
DynArrayHeapMapInner::Dim13(map) => map.memory_consumption(),
|
||||
DynArrayHeapMapInner::Dim14(map) => map.memory_consumption(),
|
||||
DynArrayHeapMapInner::Dim15(map) => map.memory_consumption(),
|
||||
DynArrayHeapMapInner::Dim16(map) => map.memory_consumption(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: Ord + Clone + Copy + 'static, V: 'static> DynArrayHeapMap<K, V> {
|
||||
/// Turns this map into an iterator over key-value pairs.
|
||||
pub fn into_iter(self) -> impl Iterator<Item = (SmallVec<[K; MAX_DYN_ARRAY_SIZE]>, V)> {
|
||||
match self.0 {
|
||||
DynArrayHeapMapInner::Dim1(map) => map.into_iter(),
|
||||
DynArrayHeapMapInner::Dim2(map) => map.into_iter(),
|
||||
DynArrayHeapMapInner::Dim3(map) => map.into_iter(),
|
||||
DynArrayHeapMapInner::Dim4(map) => map.into_iter(),
|
||||
DynArrayHeapMapInner::Dim5(map) => map.into_iter(),
|
||||
DynArrayHeapMapInner::Dim6(map) => map.into_iter(),
|
||||
DynArrayHeapMapInner::Dim7(map) => map.into_iter(),
|
||||
DynArrayHeapMapInner::Dim8(map) => map.into_iter(),
|
||||
DynArrayHeapMapInner::Dim9(map) => map.into_iter(),
|
||||
DynArrayHeapMapInner::Dim10(map) => map.into_iter(),
|
||||
DynArrayHeapMapInner::Dim11(map) => map.into_iter(),
|
||||
DynArrayHeapMapInner::Dim12(map) => map.into_iter(),
|
||||
DynArrayHeapMapInner::Dim13(map) => map.into_iter(),
|
||||
DynArrayHeapMapInner::Dim14(map) => map.into_iter(),
|
||||
DynArrayHeapMapInner::Dim15(map) => map.into_iter(),
|
||||
DynArrayHeapMapInner::Dim16(map) => map.into_iter(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_dyn_array_heap_map() {
|
||||
let mut map = DynArrayHeapMap::<u32, &str>::try_new(2).unwrap();
|
||||
// insert
|
||||
let key1 = [1u32, 2u32];
|
||||
let key2 = [2u32, 1u32];
|
||||
map.get_or_insert_with(&key1, || "a");
|
||||
map.get_or_insert_with(&key2, || "b");
|
||||
assert_eq!(map.size(), 2);
|
||||
|
||||
// evict highest
|
||||
assert_eq!(map.peek_highest(), Some(&key2[..]));
|
||||
map.evict_highest();
|
||||
assert_eq!(map.size(), 1);
|
||||
assert_eq!(map.peek_highest(), Some(&key1[..]));
|
||||
|
||||
// into_iter
|
||||
let mut iter = map.into_iter();
|
||||
let (k, v) = iter.next().unwrap();
|
||||
assert_eq!(k.as_slice(), &key1);
|
||||
assert_eq!(v, "c");
|
||||
assert_eq!(iter.next(), None);
|
||||
}
|
||||
}
|
||||
src/aggregation/bucket/composite/mod.rs (new file, 1848 lines)
File diff suppressed because it is too large
src/aggregation/bucket/composite/numeric_types.rs (new file, 460 lines)
@@ -0,0 +1,460 @@
|
||||
/// This module helps compare numerical values of different types (i64, u64
/// and f64).
|
||||
pub(super) mod num_cmp {
|
||||
use std::cmp::Ordering;
|
||||
|
||||
use crate::TantivyError;
|
||||
|
||||
pub fn cmp_i64_f64(left_i: i64, right_f: f64) -> crate::Result<Ordering> {
|
||||
if right_f.is_nan() {
|
||||
return Err(TantivyError::InvalidArgument(
|
||||
"NaN comparison is not supported".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
// If right_f is < i64::MIN then left_i > right_f (i64::MIN=-2^63 can be
|
||||
// exactly represented as f64)
|
||||
if right_f < i64::MIN as f64 {
|
||||
return Ok(Ordering::Greater);
|
||||
}
|
||||
// If right_f is >= i64::MAX then left_i < right_f (i64::MAX=2^63-1 cannot
|
||||
// be exactly represented as f64)
|
||||
if right_f >= i64::MAX as f64 {
|
||||
return Ok(Ordering::Less);
|
||||
}
|
||||
|
||||
// Now right_f is in (i64::MIN, i64::MAX), so `right_f as i64` is
|
||||
// well-defined (truncation toward 0)
|
||||
let right_as_i = right_f as i64;
|
||||
|
||||
let result = match left_i.cmp(&right_as_i) {
|
||||
Ordering::Less => Ordering::Less,
|
||||
Ordering::Greater => Ordering::Greater,
|
||||
Ordering::Equal => {
|
||||
// they have the same integer part, compare the fraction
|
||||
let rem = right_f - (right_as_i as f64);
|
||||
if rem == 0.0 {
|
||||
Ordering::Equal
|
||||
} else if right_f > 0.0 {
|
||||
Ordering::Less
|
||||
} else {
|
||||
Ordering::Greater
|
||||
}
|
||||
}
|
||||
};
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub fn cmp_u64_f64(left_u: u64, right_f: f64) -> crate::Result<Ordering> {
|
||||
if right_f.is_nan() {
|
||||
return Err(TantivyError::InvalidArgument(
|
||||
"NaN comparison is not supported".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
// Negative floats are always less than any u64 >= 0
|
||||
if right_f < 0.0 {
|
||||
return Ok(Ordering::Greater);
|
||||
}
|
||||
|
||||
// If right_f is larger than u64::MAX as f64 then left_u < right_f (u64::MAX=2^64-1 cannot be exactly represented as f64)
|
||||
let max_as_f = u64::MAX as f64;
|
||||
if right_f > max_as_f {
|
||||
return Ok(Ordering::Less);
|
||||
}
|
||||
|
||||
// Now right_f is in (0, u64::MAX), so `right_f as u64` is well-defined
|
||||
// (truncation toward 0)
|
||||
let right_as_u = right_f as u64;
|
||||
|
||||
let result = match left_u.cmp(&right_as_u) {
|
||||
Ordering::Less => Ordering::Less,
|
||||
Ordering::Greater => Ordering::Greater,
|
||||
Ordering::Equal => {
|
||||
// they have the same integer part, compare the fraction
|
||||
let rem = right_f - (right_as_u as f64);
|
||||
if rem == 0.0 {
|
||||
Ordering::Equal
|
||||
} else {
|
||||
Ordering::Less
|
||||
}
|
||||
}
|
||||
};
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub fn cmp_i64_u64(left_i: i64, right_u: u64) -> Ordering {
|
||||
if left_i < 0 {
|
||||
Ordering::Less
|
||||
} else {
|
||||
let left_as_u = left_i as u64;
|
||||
left_as_u.cmp(&right_u)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// This module helps project numerical values onto other numerical types.
/// When the target value space cannot exactly represent the source value, the
/// next representable value is returned (or AfterLast if the source value is
/// larger than the largest representable value).
///
/// All functions in this module assume that f64 values are not NaN.
|
||||
pub(super) mod num_proj {
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub enum ProjectedNumber<T> {
|
||||
Exact(T),
|
||||
Next(T),
|
||||
AfterLast,
|
||||
}
|
||||
|
||||
pub fn i64_to_u64(value: i64) -> ProjectedNumber<u64> {
|
||||
if value < 0 {
|
||||
ProjectedNumber::Next(0)
|
||||
} else {
|
||||
ProjectedNumber::Exact(value as u64)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn u64_to_i64(value: u64) -> ProjectedNumber<i64> {
|
||||
if value > i64::MAX as u64 {
|
||||
ProjectedNumber::AfterLast
|
||||
} else {
|
||||
ProjectedNumber::Exact(value as i64)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn f64_to_u64(value: f64) -> ProjectedNumber<u64> {
|
||||
if value < 0.0 {
|
||||
ProjectedNumber::Next(0)
|
||||
} else if value > u64::MAX as f64 {
|
||||
ProjectedNumber::AfterLast
|
||||
} else if value.fract() == 0.0 {
|
||||
ProjectedNumber::Exact(value as u64)
|
||||
} else {
|
||||
// casting f64 to u64 truncates toward zero
|
||||
ProjectedNumber::Next(value as u64 + 1)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn f64_to_i64(value: f64) -> ProjectedNumber<i64> {
|
||||
if value < (i64::MIN as f64) {
|
||||
return ProjectedNumber::Next(i64::MIN);
|
||||
} else if value >= (i64::MAX as f64) {
|
||||
return ProjectedNumber::AfterLast;
|
||||
} else if value.fract() == 0.0 {
|
||||
ProjectedNumber::Exact(value as i64)
|
||||
} else if value > 0.0 {
|
||||
// casting f64 to i64 truncates toward zero
|
||||
ProjectedNumber::Next(value as i64 + 1)
|
||||
} else {
|
||||
ProjectedNumber::Next(value as i64)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn i64_to_f64(value: i64) -> ProjectedNumber<f64> {
|
||||
let value_f = value as f64;
|
||||
let k_roundtrip = value_f as i64;
|
||||
if k_roundtrip == value {
|
||||
// between -2^53 and 2^53 all i64 are exactly represented as f64
|
||||
ProjectedNumber::Exact(value_f)
|
||||
} else {
|
||||
// for very large/small i64 values, it is approximated to the closest f64
|
||||
if k_roundtrip > value {
|
||||
ProjectedNumber::Next(value_f)
|
||||
} else {
|
||||
ProjectedNumber::Next(value_f.next_up())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn u64_to_f64(value: u64) -> ProjectedNumber<f64> {
|
||||
let value_f = value as f64;
|
||||
let k_roundtrip = value_f as u64;
|
||||
if k_roundtrip == value {
|
||||
// between 0 and 2^53 all u64 are exactly represented as f64
|
||||
ProjectedNumber::Exact(value_f)
|
||||
} else if k_roundtrip > value {
|
||||
ProjectedNumber::Next(value_f)
|
||||
} else {
|
||||
ProjectedNumber::Next(value_f.next_up())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod num_cmp_tests {
|
||||
use std::cmp::Ordering;
|
||||
|
||||
use super::num_cmp::*;
|
||||
|
||||
#[test]
|
||||
fn test_cmp_u64_f64() {
|
||||
// Basic comparisons
|
||||
assert_eq!(cmp_u64_f64(5, 5.0).unwrap(), Ordering::Equal);
|
||||
assert_eq!(cmp_u64_f64(5, 6.0).unwrap(), Ordering::Less);
|
||||
assert_eq!(cmp_u64_f64(6, 5.0).unwrap(), Ordering::Greater);
|
||||
assert_eq!(cmp_u64_f64(0, 0.0).unwrap(), Ordering::Equal);
|
||||
assert_eq!(cmp_u64_f64(0, 0.1).unwrap(), Ordering::Less);
|
||||
|
||||
// Negative float values should always be less than any u64
|
||||
assert_eq!(cmp_u64_f64(0, -0.1).unwrap(), Ordering::Greater);
|
||||
assert_eq!(cmp_u64_f64(5, -5.0).unwrap(), Ordering::Greater);
|
||||
assert_eq!(cmp_u64_f64(u64::MAX, -1e20).unwrap(), Ordering::Greater);
|
||||
|
||||
// Tests with extreme values
|
||||
assert_eq!(cmp_u64_f64(u64::MAX, 1e20).unwrap(), Ordering::Less);
|
||||
|
||||
// Precision edge cases: large u64 that loses precision when converted to f64
|
||||
// => 2^54, exactly represented as f64
|
||||
let large_f64 = 18_014_398_509_481_984.0;
|
||||
let large_u64 = 18_014_398_509_481_984;
|
||||
// prove that large_u64 is exactly represented as f64
|
||||
assert_eq!(large_u64 as f64, large_f64);
|
||||
assert_eq!(cmp_u64_f64(large_u64, large_f64).unwrap(), Ordering::Equal);
|
||||
// => (2^54 + 1) cannot be exactly represented in f64
|
||||
let large_u64_plus_1 = 18_014_398_509_481_985;
|
||||
// prove that it is represented as f64 by large_f64
|
||||
assert_eq!(large_u64_plus_1 as f64, large_f64);
|
||||
assert_eq!(
|
||||
cmp_u64_f64(large_u64_plus_1, large_f64).unwrap(),
|
||||
Ordering::Greater
|
||||
);
|
||||
// => (2^54 - 1) cannot be exactly represented in f64
|
||||
let large_u64_minus_1 = 18_014_398_509_481_983;
|
||||
// prove that it is also represented as f64 by large_f64
|
||||
assert_eq!(large_u64_minus_1 as f64, large_f64);
|
||||
assert_eq!(
|
||||
cmp_u64_f64(large_u64_minus_1, large_f64).unwrap(),
|
||||
Ordering::Less
|
||||
);
|
||||
|
||||
// NaN comparison results in an error
|
||||
assert!(cmp_u64_f64(0, f64::NAN).is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cmp_i64_f64() {
|
||||
// Basic comparisons
|
||||
assert_eq!(cmp_i64_f64(5, 5.0).unwrap(), Ordering::Equal);
|
||||
assert_eq!(cmp_i64_f64(5, 6.0).unwrap(), Ordering::Less);
|
||||
assert_eq!(cmp_i64_f64(6, 5.0).unwrap(), Ordering::Greater);
|
||||
assert_eq!(cmp_i64_f64(-5, -5.0).unwrap(), Ordering::Equal);
|
||||
assert_eq!(cmp_i64_f64(-5, -4.0).unwrap(), Ordering::Less);
|
||||
assert_eq!(cmp_i64_f64(-4, -5.0).unwrap(), Ordering::Greater);
|
||||
assert_eq!(cmp_i64_f64(-5, 5.0).unwrap(), Ordering::Less);
|
||||
assert_eq!(cmp_i64_f64(5, -5.0).unwrap(), Ordering::Greater);
|
||||
assert_eq!(cmp_i64_f64(0, -0.1).unwrap(), Ordering::Greater);
|
||||
assert_eq!(cmp_i64_f64(0, 0.1).unwrap(), Ordering::Less);
|
||||
assert_eq!(cmp_i64_f64(-1, -0.5).unwrap(), Ordering::Less);
|
||||
assert_eq!(cmp_i64_f64(-1, 0.0).unwrap(), Ordering::Less);
|
||||
assert_eq!(cmp_i64_f64(0, 0.0).unwrap(), Ordering::Equal);
|
||||
|
||||
// Tests with extreme values
|
||||
assert_eq!(cmp_i64_f64(i64::MAX, 1e20).unwrap(), Ordering::Less);
|
||||
assert_eq!(cmp_i64_f64(i64::MIN, -1e20).unwrap(), Ordering::Greater);
|
||||
|
||||
// Precision edge cases: large i64 that loses precision when converted to f64
|
||||
// => 2^54, exactly represented as f64
|
||||
let large_f64 = 18_014_398_509_481_984.0;
|
||||
let large_i64 = 18_014_398_509_481_984;
|
||||
// prove that large_i64 is exactly represented as f64
|
||||
assert_eq!(large_i64 as f64, large_f64);
|
||||
assert_eq!(cmp_i64_f64(large_i64, large_f64).unwrap(), Ordering::Equal);
|
||||
// => (1_i64 << 54) + 1 cannot be exactly represented in f64
|
||||
let large_i64_plus_1 = 18_014_398_509_481_985;
|
||||
// prove that it is represented as f64 by large_f64
|
||||
assert_eq!(large_i64_plus_1 as f64, large_f64);
|
||||
assert_eq!(
|
||||
cmp_i64_f64(large_i64_plus_1, large_f64).unwrap(),
|
||||
Ordering::Greater
|
||||
);
|
||||
// => (1_i64 << 54) - 1 cannot be exactly represented in f64
|
||||
let large_i64_minus_1 = 18_014_398_509_481_983;
|
||||
// prove that it is also represented as f64 by large_f64
|
||||
assert_eq!(large_i64_minus_1 as f64, large_f64);
|
||||
assert_eq!(
|
||||
cmp_i64_f64(large_i64_minus_1, large_f64).unwrap(),
|
||||
Ordering::Less
|
||||
);
|
||||
|
||||
// Same precision edge case but with negative values
|
||||
// => -2^54, exactly represented as f64
|
||||
let large_neg_f64 = -18_014_398_509_481_984.0;
|
||||
let large_neg_i64 = -18_014_398_509_481_984;
|
||||
// prove that large_neg_i64 is exactly represented as f64
|
||||
assert_eq!(large_neg_i64 as f64, large_neg_f64);
|
||||
assert_eq!(
|
||||
cmp_i64_f64(large_neg_i64, large_neg_f64).unwrap(),
|
||||
Ordering::Equal
|
||||
);
|
||||
// => (-2^54 - 1) cannot be exactly represented in f64
|
||||
let large_neg_i64_plus_1 = -18_014_398_509_481_985;
|
||||
// prove that it is represented as f64 by large_neg_f64
|
||||
assert_eq!(large_neg_i64_plus_1 as f64, large_neg_f64);
|
||||
assert_eq!(
|
||||
cmp_i64_f64(large_neg_i64_plus_1, large_neg_f64).unwrap(),
|
||||
Ordering::Less
|
||||
);
|
||||
// => (-2^54 + 1) cannot be exactly represented in f64
|
||||
let large_neg_i64_minus_1 = -18_014_398_509_481_983;
|
||||
// prove that it is also represented as f64 by large_neg_f64
|
||||
assert_eq!(large_neg_i64_minus_1 as f64, large_neg_f64);
|
||||
assert_eq!(
|
||||
cmp_i64_f64(large_neg_i64_minus_1, large_neg_f64).unwrap(),
|
||||
Ordering::Greater
|
||||
);
|
||||
|
||||
// NaN comparison results in an error
|
||||
assert!(cmp_i64_f64(0, f64::NAN).is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cmp_i64_u64() {
|
||||
// Test with negative i64 values (should always be less than any u64)
|
||||
assert_eq!(cmp_i64_u64(-1, 0), Ordering::Less);
|
||||
assert_eq!(cmp_i64_u64(i64::MIN, 0), Ordering::Less);
|
||||
assert_eq!(cmp_i64_u64(i64::MIN, u64::MAX), Ordering::Less);
|
||||
|
||||
// Test with positive i64 values
|
||||
assert_eq!(cmp_i64_u64(0, 0), Ordering::Equal);
|
||||
assert_eq!(cmp_i64_u64(1, 0), Ordering::Greater);
|
||||
assert_eq!(cmp_i64_u64(1, 1), Ordering::Equal);
|
||||
assert_eq!(cmp_i64_u64(0, 1), Ordering::Less);
|
||||
assert_eq!(cmp_i64_u64(5, 10), Ordering::Less);
|
||||
assert_eq!(cmp_i64_u64(10, 5), Ordering::Greater);
|
||||
|
||||
// Test with values near i64::MAX and u64 conversion
|
||||
assert_eq!(cmp_i64_u64(i64::MAX, i64::MAX as u64), Ordering::Equal);
|
||||
assert_eq!(cmp_i64_u64(i64::MAX, (i64::MAX as u64) + 1), Ordering::Less);
|
||||
assert_eq!(cmp_i64_u64(i64::MAX, u64::MAX), Ordering::Less);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod num_proj_tests {
|
||||
use super::num_proj::{self, ProjectedNumber};
|
||||
|
||||
#[test]
|
||||
fn test_i64_to_u64() {
|
||||
assert_eq!(num_proj::i64_to_u64(-1), ProjectedNumber::Next(0));
|
||||
assert_eq!(num_proj::i64_to_u64(i64::MIN), ProjectedNumber::Next(0));
|
||||
assert_eq!(num_proj::i64_to_u64(0), ProjectedNumber::Exact(0));
|
||||
assert_eq!(num_proj::i64_to_u64(42), ProjectedNumber::Exact(42));
|
||||
assert_eq!(
|
||||
num_proj::i64_to_u64(i64::MAX),
|
||||
ProjectedNumber::Exact(i64::MAX as u64)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_u64_to_i64() {
|
||||
assert_eq!(num_proj::u64_to_i64(0), ProjectedNumber::Exact(0));
|
||||
assert_eq!(num_proj::u64_to_i64(42), ProjectedNumber::Exact(42));
|
||||
assert_eq!(
|
||||
num_proj::u64_to_i64(i64::MAX as u64),
|
||||
ProjectedNumber::Exact(i64::MAX)
|
||||
);
|
||||
assert_eq!(
|
||||
num_proj::u64_to_i64((i64::MAX as u64) + 1),
|
||||
ProjectedNumber::AfterLast
|
||||
);
|
||||
assert_eq!(num_proj::u64_to_i64(u64::MAX), ProjectedNumber::AfterLast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_f64_to_u64() {
|
||||
assert_eq!(num_proj::f64_to_u64(-1e25), ProjectedNumber::Next(0));
|
||||
assert_eq!(num_proj::f64_to_u64(-0.1), ProjectedNumber::Next(0));
|
||||
assert_eq!(num_proj::f64_to_u64(1e20), ProjectedNumber::AfterLast);
|
||||
assert_eq!(
|
||||
num_proj::f64_to_u64(f64::INFINITY),
|
||||
ProjectedNumber::AfterLast
|
||||
);
|
||||
assert_eq!(num_proj::f64_to_u64(0.0), ProjectedNumber::Exact(0));
|
||||
assert_eq!(num_proj::f64_to_u64(42.0), ProjectedNumber::Exact(42));
|
||||
assert_eq!(num_proj::f64_to_u64(0.5), ProjectedNumber::Next(1));
|
||||
assert_eq!(num_proj::f64_to_u64(42.1), ProjectedNumber::Next(43));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_f64_to_i64() {
|
||||
assert_eq!(num_proj::f64_to_i64(-1e20), ProjectedNumber::Next(i64::MIN));
|
||||
assert_eq!(
|
||||
num_proj::f64_to_i64(f64::NEG_INFINITY),
|
||||
ProjectedNumber::Next(i64::MIN)
|
||||
);
|
||||
assert_eq!(num_proj::f64_to_i64(1e20), ProjectedNumber::AfterLast);
|
||||
assert_eq!(
|
||||
num_proj::f64_to_i64(f64::INFINITY),
|
||||
ProjectedNumber::AfterLast
|
||||
);
|
||||
assert_eq!(num_proj::f64_to_i64(0.0), ProjectedNumber::Exact(0));
|
||||
assert_eq!(num_proj::f64_to_i64(42.0), ProjectedNumber::Exact(42));
|
||||
assert_eq!(num_proj::f64_to_i64(-42.0), ProjectedNumber::Exact(-42));
|
||||
assert_eq!(num_proj::f64_to_i64(0.5), ProjectedNumber::Next(1));
|
||||
assert_eq!(num_proj::f64_to_i64(42.1), ProjectedNumber::Next(43));
|
||||
assert_eq!(num_proj::f64_to_i64(-0.5), ProjectedNumber::Next(0));
|
||||
assert_eq!(num_proj::f64_to_i64(-42.1), ProjectedNumber::Next(-42));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_i64_to_f64() {
|
||||
assert_eq!(num_proj::i64_to_f64(0), ProjectedNumber::Exact(0.0));
|
||||
assert_eq!(num_proj::i64_to_f64(42), ProjectedNumber::Exact(42.0));
|
||||
assert_eq!(num_proj::i64_to_f64(-42), ProjectedNumber::Exact(-42.0));
|
||||
|
||||
let max_exact = 9_007_199_254_740_992; // 2^53
|
||||
assert_eq!(
|
||||
num_proj::i64_to_f64(max_exact),
|
||||
ProjectedNumber::Exact(max_exact as f64)
|
||||
);
|
||||
|
||||
// Test values that cannot be exactly represented as f64 (integers above 2^53)
|
||||
let large_i64 = 9_007_199_254_740_993; // 2^53 + 1
|
||||
let closest_f64 = 9_007_199_254_740_992.0;
|
||||
assert_eq!(large_i64 as f64, closest_f64);
|
||||
if let ProjectedNumber::Next(val) = num_proj::i64_to_f64(large_i64) {
|
||||
// Verify that the returned float is different from the direct cast
|
||||
assert!(val > closest_f64);
|
||||
assert!(val - closest_f64 < 2. * f64::EPSILON * closest_f64);
|
||||
} else {
|
||||
panic!("Expected ProjectedNumber::Next for large_i64");
|
||||
}
|
||||
|
||||
// Test with very large negative value
|
||||
let large_neg_i64 = -9_007_199_254_740_993; // -(2^53 + 1)
|
||||
let closest_neg_f64 = -9_007_199_254_740_992.0;
|
||||
assert_eq!(large_neg_i64 as f64, closest_neg_f64);
|
||||
if let ProjectedNumber::Next(val) = num_proj::i64_to_f64(large_neg_i64) {
|
||||
// Verify that the returned float is the closest representable f64
|
||||
assert_eq!(val, closest_neg_f64);
|
||||
} else {
|
||||
panic!("Expected ProjectedNumber::Next for large_neg_i64");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_u64_to_f64() {
|
||||
assert_eq!(num_proj::u64_to_f64(0), ProjectedNumber::Exact(0.0));
|
||||
assert_eq!(num_proj::u64_to_f64(42), ProjectedNumber::Exact(42.0));
|
||||
|
||||
// Test the largest u64 value that can be exactly represented as f64 (2^53)
|
||||
let max_exact = 9_007_199_254_740_992; // 2^53
|
||||
assert_eq!(
|
||||
num_proj::u64_to_f64(max_exact),
|
||||
ProjectedNumber::Exact(max_exact as f64)
|
||||
);
|
||||
|
||||
// Test values that cannot be exactly represented as f64 (integers above 2^53)
|
||||
let large_u64 = 9_007_199_254_740_993; // 2^53 + 1
|
||||
let closest_f64 = 9_007_199_254_740_992.0;
|
||||
assert_eq!(large_u64 as f64, closest_f64);
|
||||
if let ProjectedNumber::Next(val) = num_proj::u64_to_f64(large_u64) {
|
||||
// Verify that the returned float is different from the direct cast
|
||||
assert!(val > closest_f64);
|
||||
assert!(val - closest_f64 < 2. * f64::EPSILON * closest_f64);
|
||||
} else {
|
||||
panic!("Expected ProjectedNumber::Next for large_u64");
|
||||
}
|
||||
}
|
||||
}
|
||||
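The `num_proj_tests` above pin down the projection semantics: values below the target domain project up to its first value as `Next`, values above it become `AfterLast`, and fractional values round up to the next representable integer. A hypothetical sketch of the f64-to-u64 case (the names and the generic shape are assumptions, not the actual definitions in `num_proj`):

// Result of projecting a number onto another numeric domain (sketch only).
#[derive(Debug, PartialEq)]
enum ProjectedNumberSketch<T> {
    // The input is exactly representable in the target domain.
    Exact(T),
    // Not representable; this is the smallest representable value above the input.
    Next(T),
    // The input is larger than every value of the target domain.
    AfterLast,
}

// Clamp below zero, bail out above the u64 range, otherwise round up
// whenever the input carries a fractional part.
fn f64_to_u64_sketch(val: f64) -> ProjectedNumberSketch<u64> {
    if val < 0.0 {
        return ProjectedNumberSketch::Next(0);
    }
    if val >= 18_446_744_073_709_551_616.0 {
        // 2^64: beyond the largest u64.
        return ProjectedNumberSketch::AfterLast;
    }
    let ceiled = val.ceil();
    if ceiled == val {
        ProjectedNumberSketch::Exact(val as u64)
    } else {
        ProjectedNumberSketch::Next(ceiled as u64)
    }
}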
@@ -1,5 +1,4 @@
use std::fmt::Debug;
use std::sync::Arc;

use common::BitSet;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
@@ -403,7 +402,7 @@ pub struct FilterAggReqData {
/// The filter aggregation
pub req: FilterAggregation,
/// The segment reader
pub segment_reader: Arc<dyn SegmentReader>,
pub segment_reader: SegmentReader,
/// Document evaluator for the filter query (precomputed BitSet)
/// This is built once when the request data is created
pub evaluator: DocumentQueryEvaluator,
@@ -417,7 +416,7 @@ impl FilterAggReqData {
pub(crate) fn get_memory_consumption(&self) -> usize {
// Estimate: name + segment reader reference + bitset + buffer capacity
self.name.len()
+ std::mem::size_of::<Arc<dyn SegmentReader>>()
+ std::mem::size_of::<SegmentReader>()
+ self.evaluator.bitset.len() / 8 // BitSet memory (bits to bytes)
+ self.matching_docs_buffer.capacity() * std::mem::size_of::<DocId>()
+ std::mem::size_of::<bool>()
@@ -439,7 +438,7 @@ impl DocumentQueryEvaluator {
pub(crate) fn new(
query: Box<dyn Query>,
schema: Schema,
segment_reader: &dyn SegmentReader,
segment_reader: &SegmentReader,
) -> crate::Result<Self> {
let max_doc = segment_reader.max_doc();

@@ -207,7 +207,7 @@ fn parse_offset_into_milliseconds(input: &str) -> Result<i64, AggregationError>
}
}

fn parse_into_milliseconds(input: &str) -> Result<i64, AggregationError> {
pub(crate) fn parse_into_milliseconds(input: &str) -> Result<i64, AggregationError> {
let split_boundary = input
.as_bytes()
.iter()

@@ -22,6 +22,7 @@
//! - [Range](RangeAggregation)
//! - [Terms](TermsAggregation)

mod composite;
mod filter;
mod histogram;
mod range;
@@ -31,6 +32,7 @@ mod term_missing_agg;
use std::collections::HashMap;
use std::fmt;

pub use composite::*;
pub use filter::*;
pub use histogram::*;
pub use range::*;

@@ -66,7 +66,7 @@ impl Collector for DistributedAggregationCollector {
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_local_id: crate::SegmentOrdinal,
|
||||
reader: &dyn SegmentReader,
|
||||
reader: &crate::SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
AggregationSegmentCollector::from_agg_req_and_reader(
|
||||
&self.agg,
|
||||
@@ -96,7 +96,7 @@ impl Collector for AggregationCollector {
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_local_id: crate::SegmentOrdinal,
|
||||
reader: &dyn SegmentReader,
|
||||
reader: &crate::SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
AggregationSegmentCollector::from_agg_req_and_reader(
|
||||
&self.agg,
|
||||
@@ -145,7 +145,7 @@ impl AggregationSegmentCollector {
|
||||
/// reader. Also includes validation, e.g. checking field types and existence.
|
||||
pub fn from_agg_req_and_reader(
|
||||
agg: &Aggregations,
|
||||
reader: &dyn SegmentReader,
|
||||
reader: &SegmentReader,
|
||||
segment_ordinal: SegmentOrdinal,
|
||||
context: &AggContextParams,
|
||||
) -> crate::Result<Self> {
|
||||
|
||||
@@ -15,8 +15,9 @@ use serde::{Deserialize, Serialize};
|
||||
use super::agg_req::{Aggregation, AggregationVariants, Aggregations};
|
||||
use super::agg_result::{AggregationResult, BucketResult, MetricResult, RangeBucketEntry};
|
||||
use super::bucket::{
|
||||
cut_off_buckets, get_agg_name_and_property, intermediate_histogram_buckets_to_final_buckets,
|
||||
GetDocCount, Order, OrderTarget, RangeAggregation, TermsAggregation,
|
||||
composite_intermediate_key_ordering, cut_off_buckets, get_agg_name_and_property,
|
||||
intermediate_histogram_buckets_to_final_buckets, CompositeAggregation, GetDocCount,
|
||||
MissingOrder, Order, OrderTarget, RangeAggregation, TermsAggregation,
|
||||
};
|
||||
use super::metric::{
|
||||
IntermediateAverage, IntermediateCount, IntermediateExtendedStats, IntermediateMax,
|
||||
@@ -25,7 +26,7 @@ use super::metric::{
|
||||
use super::segment_agg_result::AggregationLimitsGuard;
|
||||
use super::{format_date, AggregationError, Key, SerializedKey};
|
||||
use crate::aggregation::agg_result::{
|
||||
AggregationResults, BucketEntries, BucketEntry, FilterBucketResult,
|
||||
AggregationResults, BucketEntries, BucketEntry, CompositeBucketEntry, FilterBucketResult,
|
||||
};
|
||||
use crate::aggregation::bucket::TermsAggregationInternal;
|
||||
use crate::aggregation::metric::CardinalityCollector;
|
||||
@@ -280,6 +281,11 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
|
||||
doc_count: 0,
|
||||
sub_aggregations: IntermediateAggregationResults::default(),
|
||||
}),
|
||||
Composite(_) => {
|
||||
IntermediateAggregationResult::Bucket(IntermediateBucketResult::Composite {
|
||||
buckets: IntermediateCompositeBucketResult::default(),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -473,6 +479,11 @@ pub enum IntermediateBucketResult {
|
||||
/// Sub-aggregation results
|
||||
sub_aggregations: IntermediateAggregationResults,
|
||||
},
|
||||
/// Composite aggregation
|
||||
Composite {
|
||||
/// The composite buckets
|
||||
buckets: IntermediateCompositeBucketResult,
|
||||
},
|
||||
}
|
||||
|
||||
impl IntermediateBucketResult {
|
||||
@@ -568,6 +579,13 @@ impl IntermediateBucketResult {
|
||||
sub_aggregations: final_sub_aggregations,
|
||||
}))
|
||||
}
|
||||
IntermediateBucketResult::Composite { buckets } => {
|
||||
let composite_req = req
|
||||
.agg
|
||||
.as_composite()
|
||||
.expect("unexpected aggregation, expected composite aggregation");
|
||||
buckets.into_final_result(composite_req, req.sub_aggregation(), limits)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -634,6 +652,16 @@ impl IntermediateBucketResult {
|
||||
*doc_count_left += doc_count_right;
|
||||
sub_aggs_left.merge_fruits(sub_aggs_right)?;
|
||||
}
|
||||
(
|
||||
IntermediateBucketResult::Composite {
|
||||
buckets: composite_left,
|
||||
},
|
||||
IntermediateBucketResult::Composite {
|
||||
buckets: composite_right,
|
||||
},
|
||||
) => {
|
||||
composite_left.merge_fruits(composite_right)?;
|
||||
}
|
||||
(IntermediateBucketResult::Range(_), _) => {
|
||||
panic!("try merge on different types")
|
||||
}
|
||||
@@ -646,6 +674,9 @@ impl IntermediateBucketResult {
|
||||
(IntermediateBucketResult::Filter { .. }, _) => {
|
||||
panic!("try merge on different types")
|
||||
}
|
||||
(IntermediateBucketResult::Composite { .. }, _) => {
|
||||
panic!("try merge on different types")
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -914,6 +945,176 @@ impl MergeFruits for IntermediateHistogramBucketEntry {
|
||||
}
|
||||
}
|
||||
|
||||
/// Entry for the composite bucket.
|
||||
pub type IntermediateCompositeBucketEntry = IntermediateTermBucketEntry;
|
||||
|
||||
/// The fully typed key for composite aggregation
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub enum CompositeIntermediateKey {
|
||||
/// Bool key
|
||||
Bool(bool),
|
||||
/// String key
|
||||
Str(String),
|
||||
/// Float key
|
||||
F64(f64),
|
||||
/// Signed integer key
|
||||
I64(i64),
|
||||
/// Unsigned integer key
|
||||
U64(u64),
|
||||
/// DateTime key, nanoseconds since epoch
|
||||
DateTime(i64),
|
||||
/// IP Address key
|
||||
IpAddr(Ipv6Addr),
|
||||
/// Missing value key
|
||||
Null,
|
||||
}
|
||||
|
||||
impl Eq for CompositeIntermediateKey {}
|
||||
|
||||
impl std::hash::Hash for CompositeIntermediateKey {
|
||||
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
|
||||
core::mem::discriminant(self).hash(state);
|
||||
match self {
|
||||
CompositeIntermediateKey::Bool(val) => val.hash(state),
|
||||
CompositeIntermediateKey::Str(text) => text.hash(state),
|
||||
CompositeIntermediateKey::F64(val) => val.to_bits().hash(state),
|
||||
CompositeIntermediateKey::U64(val) => val.hash(state),
|
||||
CompositeIntermediateKey::I64(val) => val.hash(state),
|
||||
CompositeIntermediateKey::DateTime(val) => val.hash(state),
|
||||
CompositeIntermediateKey::IpAddr(val) => val.hash(state),
|
||||
CompositeIntermediateKey::Null => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Composite aggregation page.
|
||||
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IntermediateCompositeBucketResult {
|
||||
pub(crate) entries: FxHashMap<Vec<CompositeIntermediateKey>, IntermediateCompositeBucketEntry>,
|
||||
pub(crate) target_size: u32,
|
||||
pub(crate) orders: Vec<(Order, MissingOrder)>,
|
||||
}
|
||||
|
||||
impl IntermediateCompositeBucketResult {
|
||||
pub(crate) fn into_final_result(
|
||||
self,
|
||||
req: &CompositeAggregation,
|
||||
sub_aggregation_req: &Aggregations,
|
||||
limits: &mut AggregationLimitsGuard,
|
||||
) -> crate::Result<BucketResult> {
|
||||
let trimmed_entry_vec =
|
||||
trim_composite_buckets(self.entries, &self.orders, self.target_size)?;
|
||||
let after_key = if trimmed_entry_vec.len() == req.size as usize {
|
||||
trimmed_entry_vec
|
||||
.last()
|
||||
.map(|bucket| {
|
||||
let (intermediate_key, _entry) = bucket;
|
||||
intermediate_key
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(idx, intermediate_key)| {
|
||||
let source = &req.sources[idx];
|
||||
(source.name().to_string(), intermediate_key.clone().into())
|
||||
})
|
||||
.collect()
|
||||
})
|
||||
.unwrap()
|
||||
} else {
|
||||
FxHashMap::default()
|
||||
};
|
||||
|
||||
let buckets = trimmed_entry_vec
|
||||
.into_iter()
|
||||
.map(|(intermediate_key, entry)| {
|
||||
let key = intermediate_key
|
||||
.into_iter()
|
||||
.enumerate()
|
||||
.map(|(idx, intermediate_key)| {
|
||||
let source = &req.sources[idx];
|
||||
(source.name().to_string(), intermediate_key.into())
|
||||
})
|
||||
.collect();
|
||||
Ok(CompositeBucketEntry {
|
||||
key,
|
||||
doc_count: entry.doc_count as u64,
|
||||
sub_aggregation: entry
|
||||
.sub_aggregation
|
||||
.into_final_result_internal(sub_aggregation_req, limits)?,
|
||||
})
|
||||
})
|
||||
.collect::<crate::Result<Vec<_>>>()?;
|
||||
|
||||
Ok(BucketResult::Composite { after_key, buckets })
|
||||
}
|
||||
|
||||
fn merge_fruits(&mut self, other: IntermediateCompositeBucketResult) -> crate::Result<()> {
|
||||
merge_maps(&mut self.entries, other.entries)?;
|
||||
if self.entries.len() as u32 > 2 * self.target_size {
|
||||
self.trim()?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
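// (Reasoning sketch, not part of the diff: letting the merged map grow to twice the
// target size before trimming keeps merging amortized, since every trim is paid for
// by at least `target_size` new entries, instead of re-sorting after every merge.)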
|
||||
/// Trim the composite buckets to the target size, according to the ordering.
|
||||
pub(crate) fn trim(&mut self) -> crate::Result<()> {
|
||||
if self.entries.len() as u32 <= self.target_size {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let sorted_entries = trim_composite_buckets(
|
||||
std::mem::take(&mut self.entries),
|
||||
&self.orders,
|
||||
self.target_size,
|
||||
)?;
|
||||
|
||||
self.entries = sorted_entries.into_iter().collect();
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn trim_composite_buckets(
|
||||
entries: FxHashMap<Vec<CompositeIntermediateKey>, IntermediateCompositeBucketEntry>,
|
||||
orders: &[(Order, MissingOrder)],
|
||||
target_size: u32,
|
||||
) -> crate::Result<
|
||||
Vec<(
|
||||
Vec<CompositeIntermediateKey>,
|
||||
IntermediateCompositeBucketEntry,
|
||||
)>,
|
||||
> {
|
||||
let mut entries: Vec<_> = entries.into_iter().collect();
|
||||
let mut sort_error: Option<TantivyError> = None;
|
||||
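// `sort_by` comparators cannot return a `Result`, so the first ordering error is
// stashed in `sort_error` and surfaced once the sort has finished.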
entries.sort_by(|(left_key, _), (right_key, _)| {
|
||||
if sort_error.is_some() {
|
||||
return Ordering::Equal;
|
||||
}
|
||||
|
||||
for idx in 0..orders.len() {
|
||||
match composite_intermediate_key_ordering(
|
||||
&left_key[idx],
|
||||
&right_key[idx],
|
||||
orders[idx].0,
|
||||
orders[idx].1,
|
||||
) {
|
||||
Ok(ordering) if ordering != Ordering::Equal => return ordering,
|
||||
Ok(_) => continue,
|
||||
Err(err) => {
|
||||
sort_error = Some(err);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ordering::Equal
|
||||
});
|
||||
|
||||
if let Some(err) = sort_error {
|
||||
return Err(err);
|
||||
}
|
||||
|
||||
entries.truncate(target_size as usize);
|
||||
Ok(entries)
|
||||
}
|
||||
|
||||
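To make the paging behaviour of `into_final_result` concrete: when a page comes back full, the key of its last bucket is echoed as the `after_key`, and a client resumes the next page from there. A hypothetical request pair, with field names that are assumptions modeled on Elasticsearch's composite aggregation rather than the exact schema defined in this changeset:

use serde_json::json;

// First page: ask for up to 1000 composite buckets over one terms source.
let first_page = json!({
    "my_buckets": {
        "composite": {
            "size": 1000,
            "sources": [
                { "status": { "terms": { "field": "status" } } }
            ]
        }
    }
});

// Next page: feed back the `after_key` returned with the previous (full) page.
let next_page = json!({
    "my_buckets": {
        "composite": {
            "size": 1000,
            "sources": [
                { "status": { "terms": { "field": "status" } } }
            ],
            "after": { "status": "running" }
        }
    }
});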
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::HashMap;
|
||||
|
||||
@@ -1,12 +1,11 @@
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
use std::hash::{BuildHasher, Hasher};
|
||||
use std::hash::Hash;
|
||||
|
||||
use columnar::column_values::CompactSpaceU64Accessor;
|
||||
use columnar::{Column, ColumnType, Dictionary, StrColumn};
|
||||
use common::f64_to_u64;
|
||||
use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
|
||||
use datasketches::hll::{HllSketch, HllType, HllUnion};
|
||||
use rustc_hash::FxHashSet;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
|
||||
use crate::aggregation::agg_data::AggregationsSegmentCtx;
|
||||
use crate::aggregation::intermediate_agg_result::{
|
||||
@@ -16,29 +15,17 @@ use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
|
||||
use crate::aggregation::*;
|
||||
use crate::TantivyError;
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
struct BuildSaltedHasher {
|
||||
salt: u8,
|
||||
}
|
||||
|
||||
impl BuildHasher for BuildSaltedHasher {
|
||||
type Hasher = DefaultHasher;
|
||||
|
||||
fn build_hasher(&self) -> Self::Hasher {
|
||||
let mut hasher = DefaultHasher::new();
|
||||
hasher.write_u8(self.salt);
|
||||
|
||||
hasher
|
||||
}
|
||||
}
|
||||
/// Log2 of the number of registers for the HLL sketch.
|
||||
/// 2^11 = 2048 registers, giving ~2.3% relative error and ~1KB per sketch (Hll4).
|
||||
const LG_K: u8 = 11;
|
||||
|
||||
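// The ~2.3% figure above is the standard HyperLogLog error bound 1.04 / sqrt(m)
// with m = 2^LG_K registers (a quick sanity check, not part of the diff):
//
//     let m = (1u64 << LG_K) as f64;   // 2048 registers
//     let rse = 1.04 / m.sqrt();       // 1.04 / 45.25 ~= 0.023, i.e. ~2.3%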
/// # Cardinality
|
||||
///
|
||||
/// The cardinality aggregation allows for computing an estimate
|
||||
/// of the number of different values in a data set based on the
|
||||
/// HyperLogLog++ algorithm. This is particularly useful for understanding the
|
||||
/// uniqueness of values in a large dataset where counting each unique value
|
||||
/// individually would be computationally expensive.
|
||||
/// Apache DataSketches HyperLogLog algorithm. This is particularly useful for
|
||||
/// understanding the uniqueness of values in a large dataset where counting
|
||||
/// each unique value individually would be computationally expensive.
|
||||
///
|
||||
/// For example, you might use a cardinality aggregation to estimate the number
|
||||
/// of unique visitors to a website by aggregating on a field that contains
|
||||
@@ -184,7 +171,7 @@ impl SegmentCardinalityCollectorBucket {
|
||||
|
||||
term_ids.sort_unstable();
|
||||
dict.sorted_ords_to_term_cb(term_ids.iter().map(|term| *term as u64), |term| {
|
||||
self.cardinality.sketch.insert_any(&term);
|
||||
self.cardinality.insert(term);
|
||||
Ok(())
|
||||
})?;
|
||||
if has_missing {
|
||||
@@ -195,17 +182,17 @@ impl SegmentCardinalityCollectorBucket {
|
||||
);
|
||||
match missing_key {
|
||||
Key::Str(missing) => {
|
||||
self.cardinality.sketch.insert_any(&missing);
|
||||
self.cardinality.insert(missing.as_str());
|
||||
}
|
||||
Key::F64(val) => {
|
||||
let val = f64_to_u64(*val);
|
||||
self.cardinality.sketch.insert_any(&val);
|
||||
self.cardinality.insert(val);
|
||||
}
|
||||
Key::U64(val) => {
|
||||
self.cardinality.sketch.insert_any(&val);
|
||||
self.cardinality.insert(*val);
|
||||
}
|
||||
Key::I64(val) => {
|
||||
self.cardinality.sketch.insert_any(&val);
|
||||
self.cardinality.insert(*val);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -296,11 +283,11 @@ impl SegmentAggregationCollector for SegmentCardinalityCollector {
|
||||
})?;
|
||||
for val in col_block_accessor.iter_vals() {
|
||||
let val: u128 = compact_space_accessor.compact_to_u128(val as u32);
|
||||
bucket.cardinality.sketch.insert_any(&val);
|
||||
bucket.cardinality.insert(val);
|
||||
}
|
||||
} else {
|
||||
for val in col_block_accessor.iter_vals() {
|
||||
bucket.cardinality.sketch.insert_any(&val);
|
||||
bucket.cardinality.insert(val);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -321,11 +308,18 @@ impl SegmentAggregationCollector for SegmentCardinalityCollector {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
/// The percentiles collector used during segment collection and for merging results.
|
||||
#[derive(Clone, Debug)]
|
||||
/// The cardinality collector used during segment collection and for merging results.
|
||||
/// Uses Apache DataSketches HLL (lg_k=11, Hll4) for compact binary serialization
|
||||
/// and cross-language compatibility (e.g. Java `datasketches` library).
|
||||
pub struct CardinalityCollector {
|
||||
sketch: HyperLogLogPlus<u64, BuildSaltedHasher>,
|
||||
sketch: HllSketch,
|
||||
/// Salt derived from `ColumnType`, used to differentiate values of different column types
|
||||
/// that map to the same u64 (e.g. bool `false` = 0 vs i64 `0`).
|
||||
/// Not serialized — only needed during insertion, not after sketch registers are populated.
|
||||
salt: u8,
|
||||
}
|
||||
|
||||
impl Default for CardinalityCollector {
|
||||
fn default() -> Self {
|
||||
Self::new(0)
|
||||
@@ -338,25 +332,52 @@ impl PartialEq for CardinalityCollector {
|
||||
}
|
||||
}
|
||||
|
||||
impl CardinalityCollector {
|
||||
/// Compute the final cardinality estimate.
|
||||
pub fn finalize(self) -> Option<f64> {
|
||||
Some(self.sketch.clone().count().trunc())
|
||||
impl Serialize for CardinalityCollector {
|
||||
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
|
||||
let bytes = self.sketch.serialize();
|
||||
serializer.serialize_bytes(&bytes)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for CardinalityCollector {
|
||||
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
|
||||
let bytes: Vec<u8> = Deserialize::deserialize(deserializer)?;
|
||||
let sketch = HllSketch::deserialize(&bytes).map_err(serde::de::Error::custom)?;
|
||||
Ok(Self { sketch, salt: 0 })
|
||||
}
|
||||
}
|
||||
|
||||
impl CardinalityCollector {
|
||||
fn new(salt: u8) -> Self {
|
||||
Self {
|
||||
sketch: HyperLogLogPlus::new(16, BuildSaltedHasher { salt }).unwrap(),
|
||||
sketch: HllSketch::new(LG_K, HllType::Hll4),
|
||||
salt,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn merge_fruits(&mut self, right: CardinalityCollector) -> crate::Result<()> {
|
||||
self.sketch.merge(&right.sketch).map_err(|err| {
|
||||
TantivyError::AggregationError(AggregationError::InternalError(format!(
|
||||
"Error while merging cardinality {err:?}"
|
||||
)))
|
||||
})?;
|
||||
/// Insert a value into the HLL sketch, salted by the column type.
|
||||
/// The salt ensures that identical u64 values from different column types
|
||||
/// (e.g. bool `false` vs i64 `0`) are counted as distinct.
|
||||
pub(crate) fn insert<T: Hash>(&mut self, value: T) {
|
||||
self.sketch.update((self.salt, value));
|
||||
}
|
||||
|
||||
/// Compute the final cardinality estimate.
|
||||
pub fn finalize(self) -> Option<f64> {
|
||||
Some(self.sketch.estimate().trunc())
|
||||
}
|
||||
|
||||
/// Serialize the HLL sketch to its compact binary representation.
|
||||
/// The format is cross-language compatible with Apache DataSketches (Java, C++, Python).
|
||||
pub fn to_sketch_bytes(&self) -> Vec<u8> {
|
||||
self.sketch.serialize()
|
||||
}
|
||||
|
||||
pub(crate) fn merge_fruits(&mut self, right: CardinalityCollector) -> crate::Result<()> {
|
||||
let mut union = HllUnion::new(LG_K);
|
||||
union.update(&self.sketch);
|
||||
union.update(&right.sketch);
|
||||
self.sketch = union.get_result(HllType::Hll4);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -518,4 +539,75 @@ mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn cardinality_collector_serde_roundtrip() {
|
||||
use super::CardinalityCollector;
|
||||
|
||||
let mut collector = CardinalityCollector::default();
|
||||
collector.insert("hello");
|
||||
collector.insert("world");
|
||||
collector.insert("hello"); // duplicate
|
||||
|
||||
let serialized = serde_json::to_vec(&collector).unwrap();
|
||||
let deserialized: CardinalityCollector = serde_json::from_slice(&serialized).unwrap();
|
||||
|
||||
let original_estimate = collector.finalize().unwrap();
|
||||
let roundtrip_estimate = deserialized.finalize().unwrap();
|
||||
assert_eq!(original_estimate, roundtrip_estimate);
|
||||
assert_eq!(original_estimate, 2.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn cardinality_collector_merge() {
|
||||
use super::CardinalityCollector;
|
||||
|
||||
let mut left = CardinalityCollector::default();
|
||||
left.insert("a");
|
||||
left.insert("b");
|
||||
|
||||
let mut right = CardinalityCollector::default();
|
||||
right.insert("b");
|
||||
right.insert("c");
|
||||
|
||||
left.merge_fruits(right).unwrap();
|
||||
let estimate = left.finalize().unwrap();
|
||||
assert_eq!(estimate, 3.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn cardinality_collector_serialize_deserialize_binary() {
|
||||
use datasketches::hll::HllSketch;
|
||||
|
||||
use super::CardinalityCollector;
|
||||
|
||||
let mut collector = CardinalityCollector::default();
|
||||
collector.insert("apple");
|
||||
collector.insert("banana");
|
||||
collector.insert("cherry");
|
||||
|
||||
let bytes = collector.to_sketch_bytes();
|
||||
let deserialized = HllSketch::deserialize(&bytes).unwrap();
|
||||
assert!((deserialized.estimate() - 3.0).abs() < 0.01);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn cardinality_collector_salt_differentiates_types() {
|
||||
use super::CardinalityCollector;
|
||||
|
||||
// Without salt, same u64 value from different column types would collide
|
||||
let mut collector_bool = CardinalityCollector::new(5); // e.g. ColumnType::Bool
|
||||
collector_bool.insert(0u64); // false
|
||||
collector_bool.insert(1u64); // true
|
||||
|
||||
let mut collector_i64 = CardinalityCollector::new(2); // e.g. ColumnType::I64
|
||||
collector_i64.insert(0u64);
|
||||
collector_i64.insert(1u64);
|
||||
|
||||
// Merge them
|
||||
collector_bool.merge_fruits(collector_i64).unwrap();
|
||||
let estimate = collector_bool.finalize().unwrap();
|
||||
// Should be 4 because salt makes (5, 0) != (2, 0) and (5, 1) != (2, 1)
|
||||
assert_eq!(estimate, 4.0);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -222,6 +222,12 @@ impl PercentilesCollector {
|
||||
self.sketch.add(val);
|
||||
}
|
||||
|
||||
/// Encode the underlying DDSketch to Java-compatible binary format
|
||||
/// for cross-language serialization with Java consumers.
|
||||
pub fn to_sketch_bytes(&self) -> Vec<u8> {
|
||||
self.sketch.to_java_bytes()
|
||||
}
|
||||
|
||||
pub(crate) fn merge_fruits(&mut self, right: PercentilesCollector) -> crate::Result<()> {
|
||||
self.sketch.merge(&right.sketch).map_err(|err| {
|
||||
TantivyError::AggregationError(AggregationError::InternalError(format!(
|
||||
@@ -325,7 +331,7 @@ mod tests {
|
||||
use crate::aggregation::AggregationCollector;
|
||||
use crate::query::AllQuery;
|
||||
use crate::schema::{Schema, FAST};
|
||||
use crate::Index;
|
||||
use crate::{assert_nearly_equals, Index};
|
||||
|
||||
#[test]
|
||||
fn test_aggregation_percentiles_empty_index() -> crate::Result<()> {
|
||||
@@ -608,12 +614,16 @@ mod tests {
|
||||
let res = exec_request_with_query(agg_req, &index, None)?;
|
||||
assert_eq!(res["range_with_stats"]["buckets"][0]["doc_count"], 3);
|
||||
|
||||
assert_eq!(
|
||||
res["range_with_stats"]["buckets"][0]["percentiles"]["values"]["1.0"],
|
||||
assert_nearly_equals!(
|
||||
res["range_with_stats"]["buckets"][0]["percentiles"]["values"]["1.0"]
|
||||
.as_f64()
|
||||
.unwrap(),
|
||||
5.0028295751107414
|
||||
);
|
||||
assert_eq!(
|
||||
res["range_with_stats"]["buckets"][0]["percentiles"]["values"]["99.0"],
|
||||
assert_nearly_equals!(
|
||||
res["range_with_stats"]["buckets"][0]["percentiles"]["values"]["99.0"]
|
||||
.as_f64()
|
||||
.unwrap(),
|
||||
10.07469668951144
|
||||
);
|
||||
|
||||
@@ -659,8 +669,14 @@ mod tests {
|
||||
|
||||
let res = exec_request_with_query(agg_req, &index, None)?;
|
||||
|
||||
assert_eq!(res["percentiles"]["values"]["1.0"], 5.0028295751107414);
|
||||
assert_eq!(res["percentiles"]["values"]["99.0"], 10.07469668951144);
|
||||
assert_nearly_equals!(
|
||||
res["percentiles"]["values"]["1.0"].as_f64().unwrap(),
|
||||
5.0028295751107414
|
||||
);
|
||||
assert_nearly_equals!(
|
||||
res["percentiles"]["values"]["99.0"].as_f64().unwrap(),
|
||||
10.07469668951144
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
170
src/codec/mod.rs
@@ -1,170 +0,0 @@
|
||||
/// Codec specific to postings data.
|
||||
pub mod postings;
|
||||
|
||||
/// Standard tantivy codec. This is the codec you use by default.
|
||||
pub mod standard;
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
pub use standard::StandardCodec;
|
||||
|
||||
use crate::codec::postings::PostingsCodec;
|
||||
use crate::directory::Directory;
|
||||
use crate::fastfield::AliveBitSet;
|
||||
use crate::query::score_combiner::DoNothingCombiner;
|
||||
use crate::query::term_query::TermScorer;
|
||||
use crate::query::{box_scorer, BufferedUnionScorer, Scorer, SumCombiner};
|
||||
use crate::schema::Schema;
|
||||
use crate::{DocId, Score, SegmentMeta, SegmentReader, TantivySegmentReader};
|
||||
|
||||
/// Codecs describe how data is laid out on disk.
///
/// For the moment, only the postings codec can be customized.
|
||||
pub trait Codec: Clone + std::fmt::Debug + Send + Sync + 'static {
|
||||
/// The specific postings type used by this codec.
|
||||
type PostingsCodec: PostingsCodec;
|
||||
|
||||
/// ID of the codec. It should be unique to your codec.
|
||||
/// Make it human-readable, descriptive, short and unique.
|
||||
const ID: &'static str;
|
||||
|
||||
/// Load codec based on the codec configuration.
|
||||
fn from_json_props(json_value: &serde_json::Value) -> crate::Result<Self>;
|
||||
|
||||
/// Get codec configuration.
|
||||
fn to_json_props(&self) -> serde_json::Value;
|
||||
|
||||
/// Returns the postings codec.
|
||||
fn postings_codec(&self) -> &Self::PostingsCodec;
|
||||
|
||||
/// Loads postings using the codec's concrete postings type.
|
||||
fn load_postings_typed(
|
||||
&self,
|
||||
reader: &dyn crate::index::InvertedIndexReader,
|
||||
term_info: &crate::postings::TermInfo,
|
||||
option: crate::schema::IndexRecordOption,
|
||||
) -> std::io::Result<<Self::PostingsCodec as crate::codec::postings::PostingsCodec>::Postings>
|
||||
{
|
||||
let postings_data = reader.read_raw_postings_data(term_info, option)?;
|
||||
self.postings_codec()
|
||||
.load_postings(term_info.doc_freq, postings_data)
|
||||
}
|
||||
|
||||
/// Opens a segment reader using this codec.
|
||||
///
|
||||
/// Override this if your codec uses a custom segment reader implementation.
|
||||
fn open_segment_reader(
|
||||
&self,
|
||||
directory: &dyn Directory,
|
||||
segment_meta: &SegmentMeta,
|
||||
schema: Schema,
|
||||
custom_bitset: Option<AliveBitSet>,
|
||||
) -> crate::Result<Arc<dyn SegmentReader>> {
|
||||
let codec: Arc<dyn ObjectSafeCodec> = Arc::new(self.clone());
|
||||
let reader = TantivySegmentReader::open_with_custom_alive_set_from_directory(
|
||||
directory,
|
||||
segment_meta,
|
||||
schema,
|
||||
codec,
|
||||
custom_bitset,
|
||||
)?;
|
||||
Ok(Arc::new(reader))
|
||||
}
|
||||
}
|
||||
|
||||
/// Object-safe codec is a Codec that can be used in a trait object.
|
||||
///
|
||||
/// The point of it is to offer a way to use a codec without a proliferation of generics.
|
||||
pub trait ObjectSafeCodec: 'static + Send + Sync {
|
||||
/// Performs a for_each_pruning operation on the given scorer.
|
||||
///
|
||||
/// The function will go through matching documents and call the callback
|
||||
/// function for all docs with a score exceeding the threshold.
|
||||
///
|
||||
/// The callback returns a new (possibly larger) threshold value,
/// which is used to update the pruning threshold.
|
||||
///
|
||||
/// If the codec and the scorer allow it, this function can rely on
|
||||
/// optimizations like the block-max wand.
|
||||
fn for_each_pruning(
|
||||
&self,
|
||||
threshold: Score,
|
||||
scorer: Box<dyn Scorer>,
|
||||
callback: &mut dyn FnMut(DocId, Score) -> Score,
|
||||
);
|
||||
|
||||
/// Builds a union scorer possibly specialized if
|
||||
/// all scorers are `Term<Self::Postings>`.
|
||||
fn build_union_scorer_with_sum_combiner(
|
||||
&self,
|
||||
scorers: Vec<Box<dyn Scorer>>,
|
||||
num_docs: DocId,
|
||||
score_combiner_type: SumOrDoNothingCombiner,
|
||||
) -> Box<dyn Scorer>;
|
||||
}
|
||||
|
||||
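For illustration, a minimal fallback that satisfies the contract documented on `for_each_pruning` above, sketched against tantivy's `DocSet`/`Scorer` API (this helper is not part of the changeset):

// Naive fallback: visit every matching document, call the callback only when the
// score beats the current threshold, and let its return value raise the threshold
// for the remaining documents.
fn for_each_pruning_fallback(
    mut threshold: Score,
    scorer: &mut dyn Scorer,
    callback: &mut dyn FnMut(DocId, Score) -> Score,
) {
    let mut doc = scorer.doc();
    while doc != TERMINATED {
        let score = scorer.score();
        if score > threshold {
            threshold = callback(doc, score);
        }
        doc = scorer.advance();
    }
}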
impl<TCodec: Codec> ObjectSafeCodec for TCodec {
|
||||
fn build_union_scorer_with_sum_combiner(
|
||||
&self,
|
||||
scorers: Vec<Box<dyn Scorer>>,
|
||||
num_docs: DocId,
|
||||
sum_or_do_nothing_combiner: SumOrDoNothingCombiner,
|
||||
) -> Box<dyn Scorer> {
|
||||
if !scorers.iter().all(|scorer| {
|
||||
scorer.is::<TermScorer<<<Self as Codec>::PostingsCodec as PostingsCodec>::Postings>>()
|
||||
}) {
|
||||
return box_scorer(BufferedUnionScorer::build(
|
||||
scorers,
|
||||
SumCombiner::default,
|
||||
num_docs,
|
||||
));
|
||||
}
|
||||
let specialized_scorers: Vec<
|
||||
TermScorer<<<Self as Codec>::PostingsCodec as PostingsCodec>::Postings>,
|
||||
> = scorers
|
||||
.into_iter()
|
||||
.map(|scorer| {
|
||||
*scorer.downcast::<TermScorer<_>>().ok().expect(
|
||||
"Downcast failed despite the fact we already checked the type was correct",
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
match sum_or_do_nothing_combiner {
|
||||
SumOrDoNothingCombiner::Sum => box_scorer(BufferedUnionScorer::build(
|
||||
specialized_scorers,
|
||||
SumCombiner::default,
|
||||
num_docs,
|
||||
)),
|
||||
SumOrDoNothingCombiner::DoNothing => box_scorer(BufferedUnionScorer::build(
|
||||
specialized_scorers,
|
||||
DoNothingCombiner::default,
|
||||
num_docs,
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
fn for_each_pruning(
|
||||
&self,
|
||||
threshold: Score,
|
||||
scorer: Box<dyn Scorer>,
|
||||
callback: &mut dyn FnMut(DocId, Score) -> Score,
|
||||
) {
|
||||
let accerelerated_foreach_pruning_res =
|
||||
<TCodec as Codec>::PostingsCodec::try_accelerated_for_each_pruning(
|
||||
threshold, scorer, callback,
|
||||
);
|
||||
if let Err(mut scorer) = accerelerated_foreach_pruning_res {
|
||||
// No acceleration available. We need to do things manually.
|
||||
scorer.for_each_pruning(threshold, callback);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// SumCombiner or DoNothingCombiner
|
||||
#[derive(Copy, Clone)]
|
||||
pub enum SumOrDoNothingCombiner {
|
||||
/// Sum scores together
|
||||
Sum,
|
||||
/// Do not track any score.
|
||||
DoNothing,
|
||||
}
|
||||
@@ -1,75 +0,0 @@
|
||||
/// Block-max WAND algorithm.
|
||||
pub mod block_wand;
|
||||
use std::io;
|
||||
|
||||
use common::OwnedBytes;
|
||||
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::postings::Postings;
|
||||
use crate::query::{Bm25Weight, Scorer};
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::{DocId, Score};
|
||||
|
||||
/// Postings codec (read path).
|
||||
pub trait PostingsCodec: Send + Sync + 'static {
|
||||
/// Postings type for the postings codec.
|
||||
type Postings: Postings + Clone;
|
||||
|
||||
/// Load postings from raw bytes and metadata.
|
||||
fn load_postings(
|
||||
&self,
|
||||
doc_freq: u32,
|
||||
postings_data: RawPostingsData,
|
||||
) -> io::Result<Self::Postings>;
|
||||
|
||||
/// If your codec supports different ways to accelerate `for_each_pruning`, this is
/// where you should implement them.
///
/// Returning `Err(scorer)` without mutating the scorer or calling the callback function
/// is never "wrong". It just leaves the responsibility to the caller to call a fallback
/// implementation on the scorer.
///
/// If your codec supports Block-Max WAND, you just need to have your
/// postings implement `PostingsWithBlockMax` and copy what is done in the StandardPostings
/// codec to enable it.
|
||||
fn try_accelerated_for_each_pruning(
|
||||
_threshold: Score,
|
||||
scorer: Box<dyn Scorer>,
|
||||
_callback: &mut dyn FnMut(DocId, Score) -> Score,
|
||||
) -> Result<(), Box<dyn Scorer>> {
|
||||
Err(scorer)
|
||||
}
|
||||
}
|
||||
|
||||
/// Raw postings bytes and metadata read from storage.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RawPostingsData {
|
||||
/// Raw postings bytes for the term.
|
||||
pub postings_data: OwnedBytes,
|
||||
/// Raw positions bytes for the term, if positions are available.
|
||||
pub positions_data: Option<OwnedBytes>,
|
||||
/// Record option of the indexed field.
|
||||
pub record_option: IndexRecordOption,
|
||||
/// Effective record option after downgrading to the indexed field capability.
|
||||
pub effective_option: IndexRecordOption,
|
||||
}
|
||||
|
||||
/// A light complement interface to Postings to allow block-max wand acceleration.
|
||||
pub trait PostingsWithBlockMax: Postings {
|
||||
/// Moves the postings to the block containing `target_doc` and returns
/// an upper bound of the score for documents in the block.
///
/// `Warning`: Calling this method may leave the postings in an invalid state.
/// Callers are required to call `seek` before calling any other of the
/// `Postings` methods (like `doc` / `advance`, etc.).
|
||||
fn seek_block_max(
|
||||
&mut self,
|
||||
target_doc: crate::DocId,
|
||||
fieldnorm_reader: &FieldNormReader,
|
||||
similarity_weight: &Bm25Weight,
|
||||
) -> Score;
|
||||
|
||||
/// Returns the last document in the current block (or Terminated if this
|
||||
/// is the last block).
|
||||
fn last_doc_in_block(&self) -> crate::DocId;
|
||||
}
|
||||
@@ -1,35 +0,0 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::codec::standard::postings::StandardPostingsCodec;
|
||||
use crate::codec::Codec;
|
||||
|
||||
/// Tantivy's default postings codec.
|
||||
pub mod postings;
|
||||
|
||||
/// Tantivy's default codec.
|
||||
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
|
||||
pub struct StandardCodec;
|
||||
|
||||
impl Codec for StandardCodec {
|
||||
type PostingsCodec = StandardPostingsCodec;
|
||||
|
||||
const ID: &'static str = "tantivy-default";
|
||||
|
||||
fn from_json_props(json_value: &serde_json::Value) -> crate::Result<Self> {
|
||||
if !json_value.is_null() {
|
||||
return Err(crate::TantivyError::InvalidArgument(format!(
|
||||
"Codec property for the StandardCodec are unexpected. expected null, got {}",
|
||||
json_value.as_str().unwrap_or("null")
|
||||
)));
|
||||
}
|
||||
Ok(StandardCodec)
|
||||
}
|
||||
|
||||
fn to_json_props(&self) -> serde_json::Value {
|
||||
serde_json::Value::Null
|
||||
}
|
||||
|
||||
fn postings_codec(&self) -> &Self::PostingsCodec {
|
||||
&StandardPostingsCodec
|
||||
}
|
||||
}
|
||||
@@ -1,171 +0,0 @@
|
||||
use std::io;
|
||||
|
||||
use common::BitSet;
|
||||
|
||||
use crate::codec::postings::block_wand::{block_wand, block_wand_single_scorer};
|
||||
use crate::codec::postings::{PostingsCodec, RawPostingsData};
|
||||
use crate::codec::standard::postings::block_segment_postings::BlockSegmentPostings;
|
||||
pub use crate::codec::standard::postings::segment_postings::SegmentPostings;
|
||||
use crate::positions::PositionReader;
|
||||
use crate::query::term_query::TermScorer;
|
||||
use crate::query::{BufferedUnionScorer, Scorer, SumCombiner};
|
||||
use crate::{DocSet as _, Score, TERMINATED};
|
||||
|
||||
mod block_segment_postings;
|
||||
mod segment_postings;
|
||||
|
||||
pub use segment_postings::SegmentPostings as StandardPostings;
|
||||
|
||||
/// The default postings codec for tantivy.
|
||||
pub struct StandardPostingsCodec;
|
||||
|
||||
#[expect(clippy::enum_variant_names)]
|
||||
#[derive(Debug, PartialEq, Clone, Copy, Eq)]
|
||||
pub(crate) enum FreqReadingOption {
|
||||
NoFreq,
|
||||
SkipFreq,
|
||||
ReadFreq,
|
||||
}
|
||||
|
||||
impl PostingsCodec for StandardPostingsCodec {
|
||||
type Postings = SegmentPostings;
|
||||
|
||||
fn load_postings(
|
||||
&self,
|
||||
doc_freq: u32,
|
||||
postings_data: RawPostingsData,
|
||||
) -> io::Result<Self::Postings> {
|
||||
load_postings_from_raw_data(doc_freq, postings_data)
|
||||
}
|
||||
|
||||
fn try_accelerated_for_each_pruning(
|
||||
mut threshold: Score,
|
||||
mut scorer: Box<dyn Scorer>,
|
||||
callback: &mut dyn FnMut(crate::DocId, Score) -> Score,
|
||||
) -> Result<(), Box<dyn Scorer>> {
|
||||
scorer = match scorer.downcast::<TermScorer<Self::Postings>>() {
|
||||
Ok(term_scorer) => {
|
||||
block_wand_single_scorer(*term_scorer, threshold, callback);
|
||||
return Ok(());
|
||||
}
|
||||
Err(scorer) => scorer,
|
||||
};
|
||||
let mut union_scorer =
|
||||
scorer.downcast::<BufferedUnionScorer<TermScorer<Self::Postings>, SumCombiner>>()?;
|
||||
let doc = union_scorer.doc();
|
||||
if doc == TERMINATED {
|
||||
return Ok(());
|
||||
}
|
||||
let score = union_scorer.score();
|
||||
if score > threshold {
|
||||
threshold = callback(doc, score);
|
||||
}
|
||||
let scorers: Vec<TermScorer<Self::Postings>> = union_scorer.into_scorers();
|
||||
block_wand(scorers, threshold, callback);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
pub(crate) fn load_postings_from_raw_data(
|
||||
doc_freq: u32,
|
||||
postings_data: RawPostingsData,
|
||||
) -> io::Result<SegmentPostings> {
|
||||
let RawPostingsData {
|
||||
postings_data,
|
||||
positions_data: positions_data_opt,
|
||||
record_option,
|
||||
effective_option,
|
||||
} = postings_data;
|
||||
let requested_option = effective_option;
|
||||
let block_segment_postings =
|
||||
BlockSegmentPostings::open(doc_freq, postings_data, record_option, requested_option)?;
|
||||
let position_reader = positions_data_opt.map(PositionReader::open).transpose()?;
|
||||
Ok(SegmentPostings::from_block_postings(
|
||||
block_segment_postings,
|
||||
position_reader,
|
||||
))
|
||||
}
|
||||
|
||||
pub(crate) fn fill_bitset_from_raw_data(
|
||||
doc_freq: u32,
|
||||
postings_data: RawPostingsData,
|
||||
doc_bitset: &mut BitSet,
|
||||
) -> io::Result<()> {
|
||||
let RawPostingsData {
|
||||
postings_data,
|
||||
record_option,
|
||||
effective_option,
|
||||
..
|
||||
} = postings_data;
|
||||
let mut block_postings =
|
||||
BlockSegmentPostings::open(doc_freq, postings_data, record_option, effective_option)?;
|
||||
loop {
|
||||
let docs = block_postings.docs();
|
||||
if docs.is_empty() {
|
||||
break;
|
||||
}
|
||||
for &doc in docs {
|
||||
doc_bitset.insert(doc);
|
||||
}
|
||||
block_postings.advance();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use common::OwnedBytes;
|
||||
|
||||
use super::*;
|
||||
use crate::postings::serializer::PostingsSerializer;
|
||||
use crate::postings::Postings as _;
|
||||
use crate::schema::IndexRecordOption;
|
||||
|
||||
fn test_segment_postings_tf_aux(num_docs: u32, include_term_freq: bool) -> SegmentPostings {
|
||||
let mut postings_serializer =
|
||||
PostingsSerializer::new(1.0f32, IndexRecordOption::WithFreqs, None);
|
||||
let mut buffer = Vec::new();
|
||||
postings_serializer.new_term(num_docs, include_term_freq);
|
||||
for i in 0..num_docs {
|
||||
postings_serializer.write_doc(i, 2);
|
||||
}
|
||||
postings_serializer
|
||||
.close_term(num_docs, &mut buffer)
|
||||
.unwrap();
|
||||
load_postings_from_raw_data(
|
||||
num_docs,
|
||||
RawPostingsData {
|
||||
postings_data: OwnedBytes::new(buffer),
|
||||
positions_data: None,
|
||||
record_option: IndexRecordOption::WithFreqs,
|
||||
effective_option: IndexRecordOption::WithFreqs,
|
||||
},
|
||||
)
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_segment_postings_small_block_with_and_without_freq() {
|
||||
let small_block_without_term_freq = test_segment_postings_tf_aux(1, false);
|
||||
assert!(!small_block_without_term_freq.has_freq());
|
||||
assert_eq!(small_block_without_term_freq.doc(), 0);
|
||||
assert_eq!(small_block_without_term_freq.term_freq(), 1);
|
||||
|
||||
let small_block_with_term_freq = test_segment_postings_tf_aux(1, true);
|
||||
assert!(small_block_with_term_freq.has_freq());
|
||||
assert_eq!(small_block_with_term_freq.doc(), 0);
|
||||
assert_eq!(small_block_with_term_freq.term_freq(), 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_segment_postings_large_block_with_and_without_freq() {
|
||||
let large_block_without_term_freq = test_segment_postings_tf_aux(128, false);
|
||||
assert!(!large_block_without_term_freq.has_freq());
|
||||
assert_eq!(large_block_without_term_freq.doc(), 0);
|
||||
assert_eq!(large_block_without_term_freq.term_freq(), 1);
|
||||
|
||||
let large_block_with_term_freq = test_segment_postings_tf_aux(128, true);
|
||||
assert!(large_block_with_term_freq.has_freq());
|
||||
assert_eq!(large_block_with_term_freq.doc(), 0);
|
||||
assert_eq!(large_block_with_term_freq.term_freq(), 2);
|
||||
}
|
||||
}
|
||||
@@ -43,7 +43,7 @@ impl Collector for Count {
|
||||
fn for_segment(
|
||||
&self,
|
||||
_: SegmentOrdinal,
|
||||
_: &dyn SegmentReader,
|
||||
_: &SegmentReader,
|
||||
) -> crate::Result<SegmentCountCollector> {
|
||||
Ok(SegmentCountCollector::default())
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use std::collections::HashSet;
|
||||
|
||||
use super::{Collector, SegmentCollector};
|
||||
use crate::{DocAddress, DocId, Score, SegmentReader};
|
||||
use crate::{DocAddress, DocId, Score};
|
||||
|
||||
/// Collectors that returns the set of DocAddress that matches the query.
|
||||
///
|
||||
@@ -15,7 +15,7 @@ impl Collector for DocSetCollector {
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_local_id: crate::SegmentOrdinal,
|
||||
_segment: &dyn SegmentReader,
|
||||
_segment: &crate::SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
Ok(DocSetChildCollector {
|
||||
segment_local_id,
|
||||
|
||||
@@ -265,7 +265,7 @@ impl Collector for FacetCollector {
|
||||
fn for_segment(
|
||||
&self,
|
||||
_: SegmentOrdinal,
|
||||
reader: &dyn SegmentReader,
|
||||
reader: &SegmentReader,
|
||||
) -> crate::Result<FacetSegmentCollector> {
|
||||
let facet_reader = reader.facet_reader(&self.field_name)?;
|
||||
let facet_dict = facet_reader.facet_dict();
|
||||
|
||||
@@ -113,7 +113,7 @@ where
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_local_id: u32,
|
||||
segment_reader: &dyn SegmentReader,
|
||||
segment_reader: &SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
let column_opt = segment_reader.fast_fields().column_opt(&self.field)?;
|
||||
|
||||
@@ -287,7 +287,7 @@ where
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_local_id: u32,
|
||||
segment_reader: &dyn SegmentReader,
|
||||
segment_reader: &SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
let column_opt = segment_reader.fast_fields().bytes(&self.field)?;
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ use fastdivide::DividerU64;
|
||||
use crate::collector::{Collector, SegmentCollector};
|
||||
use crate::fastfield::{FastFieldNotAvailableError, FastValue};
|
||||
use crate::schema::Type;
|
||||
use crate::{DocId, Score, SegmentReader};
|
||||
use crate::{DocId, Score};
|
||||
|
||||
/// Histogram builds a histogram of the values of a fastfield for the
|
||||
/// collected DocSet.
|
||||
@@ -110,7 +110,7 @@ impl Collector for HistogramCollector {
|
||||
fn for_segment(
|
||||
&self,
|
||||
_segment_local_id: crate::SegmentOrdinal,
|
||||
segment: &dyn SegmentReader,
|
||||
segment: &crate::SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
let column_opt = segment.fast_fields().u64_lenient(&self.field)?;
|
||||
let (column, _column_type) = column_opt.ok_or_else(|| FastFieldNotAvailableError {
|
||||
|
||||
@@ -156,7 +156,7 @@ pub trait Collector: Sync + Send {
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_local_id: SegmentOrdinal,
|
||||
segment: &dyn SegmentReader,
|
||||
segment: &SegmentReader,
|
||||
) -> crate::Result<Self::Child>;
|
||||
|
||||
/// Returns true iff the collector requires to compute scores for documents.
|
||||
@@ -174,7 +174,7 @@ pub trait Collector: Sync + Send {
|
||||
&self,
|
||||
weight: &dyn Weight,
|
||||
segment_ord: u32,
|
||||
reader: &dyn SegmentReader,
|
||||
reader: &SegmentReader,
|
||||
) -> crate::Result<<Self::Child as SegmentCollector>::Fruit> {
|
||||
let with_scoring = self.requires_scoring();
|
||||
let mut segment_collector = self.for_segment(segment_ord, reader)?;
|
||||
@@ -186,7 +186,7 @@ pub trait Collector: Sync + Send {
|
||||
pub(crate) fn default_collect_segment_impl<TSegmentCollector: SegmentCollector>(
|
||||
segment_collector: &mut TSegmentCollector,
|
||||
weight: &dyn Weight,
|
||||
reader: &dyn SegmentReader,
|
||||
reader: &SegmentReader,
|
||||
with_scoring: bool,
|
||||
) -> crate::Result<()> {
|
||||
match (reader.alive_bitset(), with_scoring) {
|
||||
@@ -255,7 +255,7 @@ impl<TCollector: Collector> Collector for Option<TCollector> {
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_local_id: SegmentOrdinal,
|
||||
segment: &dyn SegmentReader,
|
||||
segment: &SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
Ok(if let Some(inner) = self {
|
||||
let inner_segment_collector = inner.for_segment(segment_local_id, segment)?;
|
||||
@@ -336,7 +336,7 @@ where
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_local_id: u32,
|
||||
segment: &dyn SegmentReader,
|
||||
segment: &SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
let left = self.0.for_segment(segment_local_id, segment)?;
|
||||
let right = self.1.for_segment(segment_local_id, segment)?;
|
||||
@@ -407,7 +407,7 @@ where
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_local_id: u32,
|
||||
segment: &dyn SegmentReader,
|
||||
segment: &SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
let one = self.0.for_segment(segment_local_id, segment)?;
|
||||
let two = self.1.for_segment(segment_local_id, segment)?;
|
||||
@@ -487,7 +487,7 @@ where
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_local_id: u32,
|
||||
segment: &dyn SegmentReader,
|
||||
segment: &SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
let one = self.0.for_segment(segment_local_id, segment)?;
|
||||
let two = self.1.for_segment(segment_local_id, segment)?;
|
||||
|
||||
@@ -24,7 +24,7 @@ impl<TCollector: Collector> Collector for CollectorWrapper<TCollector> {
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_local_id: u32,
|
||||
reader: &dyn SegmentReader,
|
||||
reader: &SegmentReader,
|
||||
) -> crate::Result<Box<dyn BoxableSegmentCollector>> {
|
||||
let child = self.0.for_segment(segment_local_id, reader)?;
|
||||
Ok(Box::new(SegmentCollectorWrapper(child)))
|
||||
@@ -209,7 +209,7 @@ impl Collector for MultiCollector<'_> {
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_local_id: SegmentOrdinal,
|
||||
segment: &dyn SegmentReader,
|
||||
segment: &SegmentReader,
|
||||
) -> crate::Result<MultiCollectorChild> {
|
||||
let children = self
|
||||
.collector_wrappers
|
||||
|
||||
@@ -5,7 +5,7 @@ use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::collector::{SegmentSortKeyComputer, SortKeyComputer};
|
||||
use crate::schema::{OwnedValue, Schema};
|
||||
use crate::{DocId, Order, Score, SegmentReader};
|
||||
use crate::{DocId, Order, Score};
|
||||
|
||||
fn compare_owned_value<const NULLS_FIRST: bool>(lhs: &OwnedValue, rhs: &OwnedValue) -> Ordering {
|
||||
match (lhs, rhs) {
|
||||
@@ -430,7 +430,7 @@ where
|
||||
|
||||
fn segment_sort_key_computer(
|
||||
&self,
|
||||
segment_reader: &dyn SegmentReader,
|
||||
segment_reader: &crate::SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
let child = self.0.segment_sort_key_computer(segment_reader)?;
|
||||
Ok(SegmentSortKeyComputerWithComparator {
|
||||
@@ -468,7 +468,7 @@ where
|
||||
|
||||
fn segment_sort_key_computer(
|
||||
&self,
|
||||
segment_reader: &dyn SegmentReader,
|
||||
segment_reader: &crate::SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
let child = self.0.segment_sort_key_computer(segment_reader)?;
|
||||
Ok(SegmentSortKeyComputerWithComparator {
|
||||
|
||||
@@ -32,7 +32,7 @@ impl SortKeyComputer for SortByBytes {
|
||||
|
||||
fn segment_sort_key_computer(
|
||||
&self,
|
||||
segment_reader: &dyn crate::SegmentReader,
|
||||
segment_reader: &crate::SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
let bytes_column_opt = segment_reader.fast_fields().bytes(&self.column_name)?;
|
||||
Ok(ByBytesColumnSegmentSortKeyComputer { bytes_column_opt })
|
||||
|
||||
@@ -6,7 +6,7 @@ use crate::collector::sort_key::{
|
||||
use crate::collector::{SegmentSortKeyComputer, SortKeyComputer};
|
||||
use crate::fastfield::FastFieldNotAvailableError;
|
||||
use crate::schema::OwnedValue;
|
||||
use crate::{DateTime, DocId, Score, SegmentReader};
|
||||
use crate::{DateTime, DocId, Score};
|
||||
|
||||
/// Sort by the boxed / OwnedValue representation of either a fast field, or of the score.
|
||||
///
|
||||
@@ -86,7 +86,7 @@ impl SortKeyComputer for SortByErasedType {
|
||||
|
||||
fn segment_sort_key_computer(
|
||||
&self,
|
||||
segment_reader: &dyn SegmentReader,
|
||||
segment_reader: &crate::SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
let inner: Box<dyn ErasedSegmentSortKeyComputer> = match self {
|
||||
Self::Field(column_name) => {
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use crate::collector::sort_key::NaturalComparator;
|
||||
use crate::collector::{SegmentSortKeyComputer, SortKeyComputer, TopNComputer};
|
||||
use crate::{DocAddress, DocId, Score, SegmentReader};
|
||||
use crate::{DocAddress, DocId, Score};
|
||||
|
||||
/// Sort by similarity score.
|
||||
#[derive(Clone, Debug, Copy)]
|
||||
@@ -19,7 +19,7 @@ impl SortKeyComputer for SortBySimilarityScore {
|
||||
|
||||
fn segment_sort_key_computer(
|
||||
&self,
|
||||
_segment_reader: &dyn SegmentReader,
|
||||
_segment_reader: &crate::SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
Ok(SortBySimilarityScore)
|
||||
}
|
||||
@@ -29,7 +29,7 @@ impl SortKeyComputer for SortBySimilarityScore {
|
||||
&self,
|
||||
k: usize,
|
||||
weight: &dyn crate::query::Weight,
|
||||
reader: &dyn SegmentReader,
|
||||
reader: &crate::SegmentReader,
|
||||
segment_ord: u32,
|
||||
) -> crate::Result<Vec<(Self::SortKey, DocAddress)>> {
|
||||
let mut top_n: TopNComputer<Score, DocId, Self::Comparator> =
|
||||
|
||||
@@ -61,7 +61,7 @@ impl<T: FastValue> SortKeyComputer for SortByStaticFastValue<T> {
|
||||
|
||||
fn segment_sort_key_computer(
|
||||
&self,
|
||||
segment_reader: &dyn SegmentReader,
|
||||
segment_reader: &SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
let sort_column_opt = segment_reader.fast_fields().u64_lenient(&self.field)?;
|
||||
let (sort_column, _sort_column_type) =
|
||||
|
||||
@@ -3,7 +3,7 @@ use columnar::StrColumn;
|
||||
use crate::collector::sort_key::NaturalComparator;
|
||||
use crate::collector::{SegmentSortKeyComputer, SortKeyComputer};
|
||||
use crate::termdict::TermOrdinal;
|
||||
use crate::{DocId, Score, SegmentReader};
|
||||
use crate::{DocId, Score};
|
||||
|
||||
/// Sort by the first value of a string column.
|
||||
///
|
||||
@@ -35,7 +35,7 @@ impl SortKeyComputer for SortByString {
|
||||
|
||||
fn segment_sort_key_computer(
|
||||
&self,
|
||||
segment_reader: &dyn SegmentReader,
|
||||
segment_reader: &crate::SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
let str_column_opt = segment_reader.fast_fields().str(&self.column_name)?;
|
||||
Ok(ByStringColumnSegmentSortKeyComputer { str_column_opt })
|
||||
|
||||
@@ -119,7 +119,7 @@ pub trait SortKeyComputer: Sync {
         &self,
         k: usize,
         weight: &dyn crate::query::Weight,
-        reader: &dyn SegmentReader,
+        reader: &crate::SegmentReader,
         segment_ord: u32,
     ) -> crate::Result<Vec<(Self::SortKey, DocAddress)>> {
         let with_scoring = self.requires_scoring();

@@ -135,7 +135,7 @@ pub trait SortKeyComputer: Sync {
     }

     /// Builds a child sort key computer for a specific segment.
-    fn segment_sort_key_computer(&self, segment_reader: &dyn SegmentReader) -> Result<Self::Child>;
+    fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result<Self::Child>;
 }

 impl<HeadSortKeyComputer, TailSortKeyComputer> SortKeyComputer

@@ -156,7 +156,7 @@ where
         (self.0.comparator(), self.1.comparator())
     }

-    fn segment_sort_key_computer(&self, segment_reader: &dyn SegmentReader) -> Result<Self::Child> {
+    fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result<Self::Child> {
         Ok((
             self.0.segment_sort_key_computer(segment_reader)?,
             self.1.segment_sort_key_computer(segment_reader)?,

@@ -357,7 +357,7 @@ where
         )
     }

-    fn segment_sort_key_computer(&self, segment_reader: &dyn SegmentReader) -> Result<Self::Child> {
+    fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result<Self::Child> {
         let sort_key_computer1 = self.0.segment_sort_key_computer(segment_reader)?;
         let sort_key_computer2 = self.1.segment_sort_key_computer(segment_reader)?;
         let sort_key_computer3 = self.2.segment_sort_key_computer(segment_reader)?;

@@ -420,7 +420,7 @@ where
         SortKeyComputer4::Comparator,
     );

-    fn segment_sort_key_computer(&self, segment_reader: &dyn SegmentReader) -> Result<Self::Child> {
+    fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result<Self::Child> {
         let sort_key_computer1 = self.0.segment_sort_key_computer(segment_reader)?;
         let sort_key_computer2 = self.1.segment_sort_key_computer(segment_reader)?;
         let sort_key_computer3 = self.2.segment_sort_key_computer(segment_reader)?;

@@ -454,7 +454,7 @@ where

 impl<F, SegmentF, TSortKey> SortKeyComputer for F
 where
-    F: 'static + Send + Sync + Fn(&dyn SegmentReader) -> SegmentF,
+    F: 'static + Send + Sync + Fn(&SegmentReader) -> SegmentF,
     SegmentF: 'static + FnMut(DocId) -> TSortKey,
     TSortKey: 'static + PartialOrd + Clone + Send + Sync + std::fmt::Debug,
 {

@@ -462,7 +462,7 @@ where
     type Child = SegmentF;
     type Comparator = NaturalComparator;

-    fn segment_sort_key_computer(&self, segment_reader: &dyn SegmentReader) -> Result<Self::Child> {
+    fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result<Self::Child> {
         Ok((self)(segment_reader))
     }
 }

@@ -509,10 +509,10 @@ mod tests {

     #[test]
     fn test_lazy_score_computer() {
-        let score_computer_primary = |_segment_reader: &dyn SegmentReader| |_doc: DocId| 200u32;
+        let score_computer_primary = |_segment_reader: &SegmentReader| |_doc: DocId| 200u32;
         let call_count = Arc::new(AtomicUsize::new(0));
         let call_count_clone = call_count.clone();
-        let score_computer_secondary = move |_segment_reader: &dyn SegmentReader| {
+        let score_computer_secondary = move |_segment_reader: &SegmentReader| {
             let call_count_new_clone = call_count_clone.clone();
             move |_doc: DocId| {
                 call_count_new_clone.fetch_add(1, AtomicOrdering::SeqCst);

@@ -572,10 +572,10 @@ mod tests {

     #[test]
     fn test_lazy_score_computer_dynamic_ordering() {
-        let score_computer_primary = |_segment_reader: &dyn SegmentReader| |_doc: DocId| 200u32;
+        let score_computer_primary = |_segment_reader: &SegmentReader| |_doc: DocId| 200u32;
         let call_count = Arc::new(AtomicUsize::new(0));
         let call_count_clone = call_count.clone();
-        let score_computer_secondary = move |_segment_reader: &dyn SegmentReader| {
+        let score_computer_secondary = move |_segment_reader: &SegmentReader| {
             let call_count_new_clone = call_count_clone.clone();
             move |_doc: DocId| {
                 call_count_new_clone.fetch_add(1, AtomicOrdering::SeqCst);
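
Note: every hunk above narrows the per-segment factory argument from `&dyn SegmentReader` to the concrete `&SegmentReader` struct. A minimal sketch of a closure-based sort key computer under the new signature (not part of the patch; it mirrors the `order_by` tests elsewhere in this diff, and the helper name is made up):

    use tantivy::collector::TopDocs;
    use tantivy::query::Query;
    use tantivy::{DocAddress, DocId, Searcher, SegmentReader};

    // Sketch: the outer closure receives a concrete `&SegmentReader` and returns the
    // per-document key function; here the key is simply the DocId itself.
    fn top_docs_by_doc_id(
        searcher: &Searcher,
        query: &dyn Query,
    ) -> tantivy::Result<Vec<(DocId, DocAddress)>> {
        let collector = TopDocs::with_limit(10)
            .order_by(move |_segment_reader: &SegmentReader| move |doc: DocId| doc);
        searcher.search(query, &collector)
    }
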
@@ -32,11 +32,7 @@ where TSortKeyComputer: SortKeyComputer + Send + Sync + 'static
|
||||
self.sort_key_computer.check_schema(schema)
|
||||
}
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_ord: u32,
|
||||
segment_reader: &dyn SegmentReader,
|
||||
) -> Result<Self::Child> {
|
||||
fn for_segment(&self, segment_ord: u32, segment_reader: &SegmentReader) -> Result<Self::Child> {
|
||||
let segment_sort_key_computer = self
|
||||
.sort_key_computer
|
||||
.segment_sort_key_computer(segment_reader)?;
|
||||
@@ -67,7 +63,7 @@ where TSortKeyComputer: SortKeyComputer + Send + Sync + 'static
|
||||
&self,
|
||||
weight: &dyn Weight,
|
||||
segment_ord: u32,
|
||||
reader: &dyn SegmentReader,
|
||||
reader: &SegmentReader,
|
||||
) -> crate::Result<Vec<(TSortKeyComputer::SortKey, DocAddress)>> {
|
||||
let k = self.doc_range.end;
|
||||
let docs = self
|
||||
|
||||
@@ -5,7 +5,7 @@ use crate::query::{AllQuery, QueryParser};
|
||||
use crate::schema::{Schema, FAST, TEXT};
|
||||
use crate::time::format_description::well_known::Rfc3339;
|
||||
use crate::time::OffsetDateTime;
|
||||
use crate::{DateTime, DocAddress, Index, Searcher, SegmentReader, TantivyDocument};
|
||||
use crate::{DateTime, DocAddress, Index, Searcher, TantivyDocument};
|
||||
|
||||
pub const TEST_COLLECTOR_WITH_SCORE: TestCollector = TestCollector {
|
||||
compute_score: true,
|
||||
@@ -109,7 +109,7 @@ impl Collector for TestCollector {
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_id: SegmentOrdinal,
|
||||
_reader: &dyn SegmentReader,
|
||||
_reader: &SegmentReader,
|
||||
) -> crate::Result<TestSegmentCollector> {
|
||||
Ok(TestSegmentCollector {
|
||||
segment_id,
|
||||
@@ -180,7 +180,7 @@ impl Collector for FastFieldTestCollector {
|
||||
fn for_segment(
|
||||
&self,
|
||||
_: SegmentOrdinal,
|
||||
segment_reader: &dyn SegmentReader,
|
||||
segment_reader: &SegmentReader,
|
||||
) -> crate::Result<FastFieldSegmentCollector> {
|
||||
let reader = segment_reader
|
||||
.fast_fields()
|
||||
@@ -243,7 +243,7 @@ impl Collector for BytesFastFieldTestCollector {
|
||||
fn for_segment(
|
||||
&self,
|
||||
_segment_local_id: u32,
|
||||
segment_reader: &dyn SegmentReader,
|
||||
segment_reader: &SegmentReader,
|
||||
) -> crate::Result<BytesFastFieldSegmentCollector> {
|
||||
let column_opt = segment_reader.fast_fields().bytes(&self.field)?;
|
||||
Ok(BytesFastFieldSegmentCollector {
|
||||
|
||||
@@ -393,7 +393,7 @@ impl TopDocs {
|
||||
/// // This is where we build our collector with our custom score.
|
||||
/// let top_docs_by_custom_score = TopDocs
|
||||
/// ::with_limit(10)
|
||||
/// .tweak_score(move |segment_reader: &dyn SegmentReader| {
|
||||
/// .tweak_score(move |segment_reader: &SegmentReader| {
|
||||
/// // The argument is a function that returns our scoring
|
||||
/// // function.
|
||||
/// //
|
||||
@@ -442,7 +442,7 @@ pub struct TweakScoreFn<F>(F);
|
||||
|
||||
impl<F, TTweakScoreSortKeyFn, TSortKey> SortKeyComputer for TweakScoreFn<F>
|
||||
where
|
||||
F: 'static + Send + Sync + Fn(&dyn SegmentReader) -> TTweakScoreSortKeyFn,
|
||||
F: 'static + Send + Sync + Fn(&SegmentReader) -> TTweakScoreSortKeyFn,
|
||||
TTweakScoreSortKeyFn: 'static + Fn(DocId, Score) -> TSortKey,
|
||||
TweakScoreSegmentSortKeyComputer<TTweakScoreSortKeyFn>:
|
||||
SegmentSortKeyComputer<SortKey = TSortKey, SegmentSortKey = TSortKey>,
|
||||
@@ -458,7 +458,7 @@ where
|
||||
|
||||
fn segment_sort_key_computer(
|
||||
&self,
|
||||
segment_reader: &dyn SegmentReader,
|
||||
segment_reader: &SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
Ok({
|
||||
TweakScoreSegmentSortKeyComputer {
|
||||
@@ -1525,7 +1525,7 @@ mod tests {
|
||||
let text_query = query_parser.parse_query("droopy tax")?;
|
||||
let collector = TopDocs::with_limit(2)
|
||||
.and_offset(1)
|
||||
.order_by(move |_segment_reader: &dyn SegmentReader| move |doc: DocId| doc);
|
||||
.order_by(move |_segment_reader: &SegmentReader| move |doc: DocId| doc);
|
||||
let score_docs: Vec<(u32, DocAddress)> =
|
||||
index.reader()?.searcher().search(&text_query, &collector)?;
|
||||
assert_eq!(
|
||||
@@ -1543,7 +1543,7 @@ mod tests {
|
||||
let text_query = query_parser.parse_query("droopy tax").unwrap();
|
||||
let collector = TopDocs::with_limit(2)
|
||||
.and_offset(1)
|
||||
.order_by(move |_segment_reader: &dyn SegmentReader| move |doc: DocId| doc);
|
||||
.order_by(move |_segment_reader: &SegmentReader| move |doc: DocId| doc);
|
||||
let score_docs: Vec<(u32, DocAddress)> = index
|
||||
.reader()
|
||||
.unwrap()
|
||||
|
||||
@@ -4,7 +4,7 @@ use common::{replace_in_place, JsonPathWriter};
|
||||
use rustc_hash::FxHashMap;
|
||||
|
||||
use crate::indexer::indexing_term::IndexingTerm;
|
||||
use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter as _, PostingsWriterEnum};
|
||||
use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
|
||||
use crate::schema::document::{ReferenceValue, ReferenceValueLeaf, Value};
|
||||
use crate::schema::{Type, DATE_TIME_PRECISION_INDEXED};
|
||||
use crate::time::format_description::well_known::Rfc3339;
|
||||
@@ -80,7 +80,7 @@ fn index_json_object<'a, V: Value<'a>>(
|
||||
text_analyzer: &mut TextAnalyzer,
|
||||
term_buffer: &mut IndexingTerm,
|
||||
json_path_writer: &mut JsonPathWriter,
|
||||
postings_writer: &mut PostingsWriterEnum,
|
||||
postings_writer: &mut dyn PostingsWriter,
|
||||
ctx: &mut IndexingContext,
|
||||
positions_per_path: &mut IndexingPositionsPerPath,
|
||||
) {
|
||||
@@ -110,7 +110,7 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
|
||||
text_analyzer: &mut TextAnalyzer,
|
||||
term_buffer: &mut IndexingTerm,
|
||||
json_path_writer: &mut JsonPathWriter,
|
||||
postings_writer: &mut PostingsWriterEnum,
|
||||
postings_writer: &mut dyn PostingsWriter,
|
||||
ctx: &mut IndexingContext,
|
||||
positions_per_path: &mut IndexingPositionsPerPath,
|
||||
) {
|
||||
|
||||
@@ -8,7 +8,7 @@ use std::path::Path;
 use once_cell::sync::Lazy;

 pub use self::executor::Executor;
-pub use self::searcher::{Searcher, SearcherContext, SearcherGeneration};
+pub use self::searcher::{Searcher, SearcherGeneration};

 /// The meta file contains all the information about the list of segments and the schema
 /// of the index.

@@ -4,13 +4,13 @@ use std::{fmt, io};
|
||||
|
||||
use crate::collector::Collector;
|
||||
use crate::core::Executor;
|
||||
use crate::index::{Index, SegmentId, SegmentReader};
|
||||
use crate::index::{SegmentId, SegmentReader};
|
||||
use crate::query::{Bm25StatisticsProvider, EnableScoring, Query};
|
||||
use crate::schema::{Field, FieldType, Schema, TantivyDocument, Term};
|
||||
use crate::schema::document::DocumentDeserialize;
|
||||
use crate::schema::{Schema, Term};
|
||||
use crate::space_usage::SearcherSpaceUsage;
|
||||
use crate::store::{CacheStats, StoreReader, DOCSTORE_CACHE_CAPACITY};
|
||||
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
|
||||
use crate::{DocAddress, Inventory, Opstamp, TantivyError, TrackedObject};
|
||||
use crate::store::{CacheStats, StoreReader};
|
||||
use crate::{DocAddress, Index, Opstamp, TrackedObject};
|
||||
|
||||
/// Identifies the searcher generation accessed by a [`Searcher`].
|
||||
///
|
||||
@@ -36,7 +36,7 @@ pub struct SearcherGeneration {
|
||||
|
||||
impl SearcherGeneration {
|
||||
pub(crate) fn from_segment_readers(
|
||||
segment_readers: &[Arc<dyn SegmentReader>],
|
||||
segment_readers: &[SegmentReader],
|
||||
generation_id: u64,
|
||||
) -> Self {
|
||||
let mut segment_id_to_del_opstamp = BTreeMap::new();
|
||||
@@ -61,103 +61,6 @@ impl SearcherGeneration {
|
||||
}
|
||||
}
|
||||
|
||||
/// Search-time context required by a [`Searcher`].
|
||||
#[derive(Clone)]
|
||||
pub struct SearcherContext {
|
||||
schema: Schema,
|
||||
executor: Executor,
|
||||
tokenizers: TokenizerManager,
|
||||
fast_field_tokenizers: TokenizerManager,
|
||||
}
|
||||
|
||||
impl SearcherContext {
|
||||
/// Creates a context from explicit search-time components.
|
||||
pub fn new(
|
||||
schema: Schema,
|
||||
executor: Executor,
|
||||
tokenizers: TokenizerManager,
|
||||
fast_field_tokenizers: TokenizerManager,
|
||||
) -> SearcherContext {
|
||||
SearcherContext {
|
||||
schema,
|
||||
executor,
|
||||
tokenizers,
|
||||
fast_field_tokenizers,
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a context from an index.
|
||||
pub fn from_index<C: crate::codec::Codec>(index: &Index<C>) -> SearcherContext {
|
||||
SearcherContext::new(
|
||||
index.schema(),
|
||||
index.search_executor().clone(),
|
||||
index.tokenizers().clone(),
|
||||
index.fast_field_tokenizer().clone(),
|
||||
)
|
||||
}
|
||||
|
||||
/// Access the schema associated with this context.
|
||||
pub fn schema(&self) -> &Schema {
|
||||
&self.schema
|
||||
}
|
||||
|
||||
/// Access the executor associated with this context.
|
||||
pub fn search_executor(&self) -> &Executor {
|
||||
&self.executor
|
||||
}
|
||||
|
||||
/// Access the tokenizer manager associated with this context.
|
||||
pub fn tokenizers(&self) -> &TokenizerManager {
|
||||
&self.tokenizers
|
||||
}
|
||||
|
||||
/// Access the fast field tokenizer manager associated with this context.
|
||||
pub fn fast_field_tokenizer(&self) -> &TokenizerManager {
|
||||
&self.fast_field_tokenizers
|
||||
}
|
||||
|
||||
/// Get the tokenizer associated with a specific field.
|
||||
pub fn tokenizer_for_field(&self, field: Field) -> crate::Result<TextAnalyzer> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let field_type = field_entry.field_type();
|
||||
let indexing_options_opt = match field_type {
|
||||
FieldType::JsonObject(options) => options.get_text_indexing_options(),
|
||||
FieldType::Str(options) => options.get_indexing_options(),
|
||||
_ => {
|
||||
return Err(TantivyError::SchemaError(format!(
|
||||
"{:?} is not a text field.",
|
||||
field_entry.name()
|
||||
)))
|
||||
}
|
||||
};
|
||||
let indexing_options = indexing_options_opt.ok_or_else(|| {
|
||||
TantivyError::InvalidArgument(format!(
|
||||
"No indexing options set for field {field_entry:?}"
|
||||
))
|
||||
})?;
|
||||
|
||||
self.tokenizers
|
||||
.get(indexing_options.tokenizer())
|
||||
.ok_or_else(|| {
|
||||
TantivyError::InvalidArgument(format!(
|
||||
"No Tokenizer found for field {field_entry:?}"
|
||||
))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<C: crate::codec::Codec> From<&Index<C>> for SearcherContext {
|
||||
fn from(index: &Index<C>) -> Self {
|
||||
SearcherContext::from_index(index)
|
||||
}
|
||||
}
|
||||
|
||||
impl<C: crate::codec::Codec> From<Index<C>> for SearcherContext {
|
||||
fn from(index: Index<C>) -> Self {
|
||||
SearcherContext::from(&index)
|
||||
}
|
||||
}
|
||||
|
||||
/// Holds a list of `SegmentReader`s ready for search.
|
||||
///
|
||||
/// It guarantees that the `Segment` will not be removed before
|
||||
@@ -168,66 +71,9 @@ pub struct Searcher {
|
||||
}
|
||||
|
||||
impl Searcher {
|
||||
/// Creates a `Searcher` from an arbitrary list of segment readers.
|
||||
///
|
||||
/// This is useful when segment readers are not opened from
|
||||
/// `IndexReader` / `meta.json` (e.g. external segment sources).
|
||||
/// The generated [`SearcherGeneration`] uses `generation_id = 0`.
|
||||
pub fn from_segment_readers<Ctx: Into<SearcherContext>>(
|
||||
context: Ctx,
|
||||
segment_readers: Vec<Arc<dyn SegmentReader>>,
|
||||
) -> crate::Result<Searcher> {
|
||||
Self::from_segment_readers_with_generation_id(context, segment_readers, 0)
|
||||
}
|
||||
|
||||
/// Same as [`Searcher::from_segment_readers`] but allows setting
|
||||
/// a custom generation id.
|
||||
pub fn from_segment_readers_with_generation_id<Ctx: Into<SearcherContext>>(
|
||||
context: Ctx,
|
||||
segment_readers: Vec<Arc<dyn SegmentReader>>,
|
||||
generation_id: u64,
|
||||
) -> crate::Result<Searcher> {
|
||||
let context = context.into();
|
||||
let generation = SearcherGeneration::from_segment_readers(&segment_readers, generation_id);
|
||||
let tracked_generation = Inventory::default().track(generation);
|
||||
let inner = SearcherInner::new(
|
||||
context,
|
||||
segment_readers,
|
||||
tracked_generation,
|
||||
DOCSTORE_CACHE_CAPACITY,
|
||||
)?;
|
||||
Ok(Arc::new(inner).into())
|
||||
}
|
||||
|
||||
/// Returns the search context associated with the `Searcher`.
|
||||
pub fn context(&self) -> &SearcherContext {
|
||||
&self.inner.context
|
||||
}
|
||||
|
||||
/// Deprecated alias for [`Searcher::context`].
|
||||
#[deprecated(note = "use Searcher::context()")]
|
||||
pub fn index(&self) -> &SearcherContext {
|
||||
self.context()
|
||||
}
|
||||
|
||||
/// Access the search executor associated with this searcher.
|
||||
pub fn search_executor(&self) -> &Executor {
|
||||
self.context().search_executor()
|
||||
}
|
||||
|
||||
/// Access the tokenizer manager associated with this searcher.
|
||||
pub fn tokenizers(&self) -> &TokenizerManager {
|
||||
self.context().tokenizers()
|
||||
}
|
||||
|
||||
/// Access the fast field tokenizer manager associated with this searcher.
|
||||
pub fn fast_field_tokenizer(&self) -> &TokenizerManager {
|
||||
self.context().fast_field_tokenizer()
|
||||
}
|
||||
|
||||
/// Get the tokenizer associated with a specific field.
|
||||
pub fn tokenizer_for_field(&self, field: Field) -> crate::Result<TextAnalyzer> {
|
||||
self.context().tokenizer_for_field(field)
|
||||
/// Returns the `Index` associated with the `Searcher`
|
||||
pub fn index(&self) -> &Index {
|
||||
&self.inner.index
|
||||
}
|
||||
|
||||
/// [`SearcherGeneration`] which identifies the version of the snapshot held by this `Searcher`.
|
||||
@@ -239,7 +85,7 @@ impl Searcher {
     ///
     /// The searcher uses the segment ordinal to route the
     /// request to the right `Segment`.
-    pub fn doc(&self, doc_address: DocAddress) -> crate::Result<TantivyDocument> {
+    pub fn doc<D: DocumentDeserialize>(&self, doc_address: DocAddress) -> crate::Result<D> {
         let store_reader = &self.inner.store_readers[doc_address.segment_ord as usize];
         store_reader.get(doc_address.doc_id)
     }

@@ -259,15 +105,18 @@ impl Searcher {

     /// Fetches a document in an asynchronous manner.
     #[cfg(feature = "quickwit")]
-    pub async fn doc_async(&self, doc_address: DocAddress) -> crate::Result<TantivyDocument> {
-        let executor = self.search_executor();
+    pub async fn doc_async<D: DocumentDeserialize>(
+        &self,
+        doc_address: DocAddress,
+    ) -> crate::Result<D> {
+        let executor = self.inner.index.search_executor();
         let store_reader = &self.inner.store_readers[doc_address.segment_ord as usize];
         store_reader.get_async(doc_address.doc_id, executor).await
     }

     /// Access the schema associated with the index of this searcher.
     pub fn schema(&self) -> &Schema {
-        self.context().schema()
+        &self.inner.schema
     }

     /// Returns the overall number of documents in the index.
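
The two hunks above make document retrieval generic over the deserialized type instead of hard-wiring `TantivyDocument`. A short sketch of the resulting call site (mirroring the `doc::<TantivyDocument>` calls in the test hunks further down; the helper name is made up):

    use tantivy::{DocAddress, Searcher, TantivyDocument};

    // Sketch: with `doc<D: DocumentDeserialize>` the caller names the concrete document
    // type explicitly; using `TantivyDocument` keeps the old behaviour.
    fn fetch_first_doc(searcher: &Searcher) -> tantivy::Result<TantivyDocument> {
        searcher.doc::<TantivyDocument>(DocAddress::new(0u32, 0u32))
    }
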
@@ -305,13 +154,13 @@ impl Searcher {
|
||||
}
|
||||
|
||||
/// Return the list of segment readers
|
||||
pub fn segment_readers(&self) -> &[Arc<dyn SegmentReader>] {
|
||||
pub fn segment_readers(&self) -> &[SegmentReader] {
|
||||
&self.inner.segment_readers
|
||||
}
|
||||
|
||||
/// Returns the segment_reader associated with the given segment_ord
|
||||
pub fn segment_reader(&self, segment_ord: u32) -> &dyn SegmentReader {
|
||||
self.inner.segment_readers[segment_ord as usize].as_ref()
|
||||
pub fn segment_reader(&self, segment_ord: u32) -> &SegmentReader {
|
||||
&self.inner.segment_readers[segment_ord as usize]
|
||||
}
|
||||
|
||||
/// Runs a query on the segment readers wrapped by the searcher.
|
||||
@@ -352,7 +201,7 @@ impl Searcher {
|
||||
} else {
|
||||
EnableScoring::disabled_from_searcher(self)
|
||||
};
|
||||
let executor = self.search_executor();
|
||||
let executor = self.inner.index.search_executor();
|
||||
self.search_with_executor(query, collector, executor, enabled_scoring)
|
||||
}
|
||||
|
||||
@@ -380,11 +229,7 @@ impl Searcher {
|
||||
let segment_readers = self.segment_readers();
|
||||
let fruits = executor.map(
|
||||
|(segment_ord, segment_reader)| {
|
||||
collector.collect_segment(
|
||||
weight.as_ref(),
|
||||
segment_ord as u32,
|
||||
segment_reader.as_ref(),
|
||||
)
|
||||
collector.collect_segment(weight.as_ref(), segment_ord as u32, segment_reader)
|
||||
},
|
||||
segment_readers.iter().enumerate(),
|
||||
)?;
|
||||
@@ -412,17 +257,19 @@ impl From<Arc<SearcherInner>> for Searcher {
|
||||
/// It guarantees that the `Segment` will not be removed before
|
||||
/// the destruction of the `Searcher`.
|
||||
pub(crate) struct SearcherInner {
|
||||
context: SearcherContext,
|
||||
segment_readers: Vec<Arc<dyn SegmentReader>>,
|
||||
store_readers: Vec<Box<dyn StoreReader>>,
|
||||
schema: Schema,
|
||||
index: Index,
|
||||
segment_readers: Vec<SegmentReader>,
|
||||
store_readers: Vec<StoreReader>,
|
||||
generation: TrackedObject<SearcherGeneration>,
|
||||
}
|
||||
|
||||
impl SearcherInner {
|
||||
/// Creates a new `Searcher`
|
||||
pub(crate) fn new(
|
||||
context: SearcherContext,
|
||||
segment_readers: Vec<Arc<dyn SegmentReader>>,
|
||||
schema: Schema,
|
||||
index: Index,
|
||||
segment_readers: Vec<SegmentReader>,
|
||||
generation: TrackedObject<SearcherGeneration>,
|
||||
doc_store_cache_num_blocks: usize,
|
||||
) -> io::Result<SearcherInner> {
|
||||
@@ -434,13 +281,14 @@ impl SearcherInner {
|
||||
generation.segments(),
|
||||
"Set of segments referenced by this Searcher and its SearcherGeneration must match"
|
||||
);
|
||||
let store_readers: Vec<Box<dyn StoreReader>> = segment_readers
|
||||
let store_readers: Vec<StoreReader> = segment_readers
|
||||
.iter()
|
||||
.map(|segment_reader| segment_reader.get_store_reader(doc_store_cache_num_blocks))
|
||||
.collect::<io::Result<Vec<_>>>()?;
|
||||
|
||||
Ok(SearcherInner {
|
||||
context,
|
||||
schema,
|
||||
index,
|
||||
segment_readers,
|
||||
store_readers,
|
||||
generation,
|
||||
@@ -453,7 +301,7 @@ impl fmt::Debug for Searcher {
|
||||
let segment_ids = self
|
||||
.segment_readers()
|
||||
.iter()
|
||||
.map(|segment_reader| segment_reader.segment_id())
|
||||
.map(SegmentReader::segment_id)
|
||||
.collect::<Vec<_>>();
|
||||
write!(f, "Searcher({segment_ids:?})")
|
||||
}
|
||||
|
||||
@@ -7,8 +7,8 @@ use crate::query::TermQuery;
|
||||
use crate::schema::{Field, IndexRecordOption, Schema, INDEXED, STRING, TEXT};
|
||||
use crate::tokenizer::TokenizerManager;
|
||||
use crate::{
|
||||
Directory, DocSet, Executor, Index, IndexBuilder, IndexReader, IndexSettings, IndexWriter,
|
||||
ReloadPolicy, Searcher, SearcherContext, TantivyDocument, Term,
|
||||
Directory, DocSet, Index, IndexBuilder, IndexReader, IndexSettings, IndexWriter, ReloadPolicy,
|
||||
TantivyDocument, Term,
|
||||
};
|
||||
|
||||
#[test]
|
||||
@@ -300,40 +300,6 @@ fn test_single_segment_index_writer() -> crate::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_searcher_from_external_segment_readers() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
let mut writer: IndexWriter = index.writer_for_tests()?;
|
||||
writer.add_document(doc!(text_field => "hello"))?;
|
||||
writer.add_document(doc!(text_field => "hello"))?;
|
||||
writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
let segment_readers = searcher.segment_readers().to_vec();
|
||||
let context = SearcherContext::new(
|
||||
schema,
|
||||
Executor::single_thread(),
|
||||
TokenizerManager::default(),
|
||||
TokenizerManager::default(),
|
||||
);
|
||||
let custom_searcher =
|
||||
Searcher::from_segment_readers_with_generation_id(context, segment_readers, 42)?;
|
||||
|
||||
let term_query = TermQuery::new(
|
||||
Term::from_field_text(text_field, "hello"),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
let count = custom_searcher.search(&term_query, &Count)?;
|
||||
assert_eq!(count, 2);
|
||||
assert_eq!(custom_searcher.generation().generation_id(), 42);
|
||||
assert_eq!(custom_searcher.segment_readers().len(), 1);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merging_segment_update_docfreq() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
@@ -167,9 +167,6 @@ impl CompositeFile {
             .map(|byte_range| self.data.slice(byte_range.clone()))
     }

-    /// Returns per-field byte usage for all slices stored in this composite file.
-    ///
-    /// The provided `schema` is used to resolve field ids into field names.
     pub fn space_usage(&self, schema: &Schema) -> PerFieldSpaceUsage {
         let mut fields = Vec::new();
         for (&field_addr, byte_range) in &self.offsets_index {

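The hunk above only touches the doc comment of `CompositeFile::space_usage`; the per-field accounting itself is unchanged. As a rough, hedged illustration of where these figures surface (assuming the `Searcher::space_usage` accessor of current tantivy; the exact report shape may differ in this branch):

    use tantivy::Searcher;

    // Sketch: per-segment, per-field byte usage rolled up for the whole searcher.
    fn report_space_usage(searcher: &Searcher) -> std::io::Result<()> {
        let usage = searcher.space_usage()?;
        println!("total index space: {:?}", usage.total());
        Ok(())
    }
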
@@ -1,7 +1,4 @@
-use std::borrow::BorrowMut;
-use std::ops::{Deref as _, DerefMut as _};
-
-use common::BitSet;
+use std::borrow::{Borrow, BorrowMut};

 use crate::fastfield::AliveBitSet;
 use crate::DocId;

@@ -133,19 +130,6 @@ pub trait DocSet: Send {
|
||||
buffer.len()
|
||||
}
|
||||
|
||||
/// Fills the given bitset with the documents in the docset.
|
||||
///
|
||||
/// If the docset max_doc is smaller than the largest doc, this function might not consume the
|
||||
/// docset entirely.
|
||||
fn fill_bitset(&mut self, bitset: &mut BitSet) {
|
||||
let bitset_max_value: u32 = bitset.max_value();
|
||||
let mut doc = self.doc();
|
||||
while doc < bitset_max_value {
|
||||
bitset.insert(doc);
|
||||
doc = self.advance();
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the current document
|
||||
/// Right after creating a new `DocSet`, the docset points to the first document.
|
||||
///
|
||||
@@ -249,59 +233,51 @@ impl DocSet for &mut dyn DocSet {
|
||||
fn count_including_deleted(&mut self) -> u32 {
|
||||
(**self).count_including_deleted()
|
||||
}
|
||||
|
||||
fn fill_bitset(&mut self, bitset: &mut BitSet) {
|
||||
(**self).fill_bitset(bitset);
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
|
||||
#[inline]
|
||||
fn advance(&mut self) -> DocId {
|
||||
self.deref_mut().advance()
|
||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
||||
unboxed.advance()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn seek(&mut self, target: DocId) -> DocId {
|
||||
self.deref_mut().seek(target)
|
||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
||||
unboxed.seek(target)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn seek_danger(&mut self, target: DocId) -> SeekDangerResult {
|
||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
||||
unboxed.seek_danger(target)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize {
|
||||
self.deref_mut().fill_buffer(buffer)
|
||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
||||
unboxed.fill_buffer(buffer)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn doc(&self) -> DocId {
|
||||
self.deref().doc()
|
||||
let unboxed: &TDocSet = self.borrow();
|
||||
unboxed.doc()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.deref().size_hint()
|
||||
let unboxed: &TDocSet = self.borrow();
|
||||
unboxed.size_hint()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn cost(&self) -> u64 {
|
||||
self.deref().cost()
|
||||
let unboxed: &TDocSet = self.borrow();
|
||||
unboxed.cost()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
|
||||
self.deref_mut().count(alive_bitset)
|
||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
||||
unboxed.count(alive_bitset)
|
||||
}
|
||||
|
||||
fn count_including_deleted(&mut self) -> u32 {
|
||||
self.deref_mut().count_including_deleted()
|
||||
}
|
||||
|
||||
fn fill_bitset(&mut self, bitset: &mut BitSet) {
|
||||
self.deref_mut().fill_bitset(bitset);
|
||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
||||
unboxed.count_including_deleted()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -84,7 +84,9 @@ mod tests {
         let mut facet = Facet::default();
         facet_reader.facet_from_ord(0, &mut facet).unwrap();
         assert_eq!(facet.to_path_string(), "/a/b");
-        let doc = searcher.doc(DocAddress::new(0u32, 0u32)).unwrap();
+        let doc = searcher
+            .doc::<TantivyDocument>(DocAddress::new(0u32, 0u32))
+            .unwrap();
         let value = doc
             .get_first(facet_field)
             .and_then(|v| v.as_value().as_facet());

@@ -143,7 +145,7 @@ mod tests {
         let mut facet_ords = Vec::new();
         facet_ords.extend(facet_reader.facet_ords(0u32));
         assert_eq!(&facet_ords, &[0u64]);
-        let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
+        let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0u32, 0u32))?;
         let value: Option<Facet> = doc
             .get_first(facet_field)
             .and_then(|v| v.as_facet())

@@ -96,7 +96,7 @@ mod tests {
 };
 use crate::time::OffsetDateTime;
 use crate::tokenizer::{LowerCaser, RawTokenizer, TextAnalyzer, TokenizerManager};
-use crate::{Index, IndexWriter};
+use crate::{Index, IndexWriter, SegmentReader};

 pub static SCHEMA: Lazy<Schema> = Lazy::new(|| {
     let mut schema_builder = Schema::builder();

@@ -430,7 +430,7 @@ mod tests {
             .searcher()
             .segment_readers()
             .iter()
-            .map(|segment_reader| segment_reader.segment_id())
+            .map(SegmentReader::segment_id)
             .collect();
         assert_eq!(segment_ids.len(), 2);
         index_writer.merge(&segment_ids[..]).wait().unwrap();

@@ -25,8 +25,7 @@ pub struct FastFieldReaders {
|
||||
}
|
||||
|
||||
impl FastFieldReaders {
|
||||
/// Opens the segment fast-field container and binds it to a schema.
|
||||
pub fn open(fast_field_file: FileSlice, schema: Schema) -> io::Result<FastFieldReaders> {
|
||||
pub(crate) fn open(fast_field_file: FileSlice, schema: Schema) -> io::Result<FastFieldReaders> {
|
||||
let columnar = Arc::new(ColumnarReader::open(fast_field_file)?);
|
||||
Ok(FastFieldReaders { columnar, schema })
|
||||
}
|
||||
@@ -40,8 +39,7 @@ impl FastFieldReaders {
|
||||
self.resolve_column_name_given_default_field(column_name, default_field_opt)
|
||||
}
|
||||
|
||||
/// Returns per-field space usage for all loaded fast-field columns.
|
||||
pub fn space_usage(&self) -> io::Result<PerFieldSpaceUsage> {
|
||||
pub(crate) fn space_usage(&self) -> io::Result<PerFieldSpaceUsage> {
|
||||
let mut per_field_usages: Vec<FieldUsage> = Default::default();
|
||||
for (mut field_name, column_handle) in self.columnar.iter_columns()? {
|
||||
json_path_sep_to_dot(&mut field_name);
|
||||
@@ -53,8 +51,7 @@ impl FastFieldReaders {
|
||||
Ok(PerFieldSpaceUsage::new(per_field_usages))
|
||||
}
|
||||
|
||||
/// Returns the underlying `ColumnarReader`.
|
||||
pub fn columnar(&self) -> &ColumnarReader {
|
||||
pub(crate) fn columnar(&self) -> &ColumnarReader {
|
||||
self.columnar.as_ref()
|
||||
}
|
||||
|
||||
|
||||
@@ -1,49 +0,0 @@
|
||||
use std::borrow::Cow;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::codec::{Codec, StandardCodec};
|
||||
|
||||
/// A Codec configuration is just a serializable object.
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct CodecConfiguration {
|
||||
codec_id: Cow<'static, str>,
|
||||
#[serde(default, skip_serializing_if = "serde_json::Value::is_null")]
|
||||
props: serde_json::Value,
|
||||
}
|
||||
|
||||
impl CodecConfiguration {
|
||||
/// Returns true if the codec is the standard codec.
|
||||
pub fn is_standard(&self) -> bool {
|
||||
self.codec_id == StandardCodec::ID && self.props.is_null()
|
||||
}
|
||||
|
||||
/// Creates a codec instance from the configuration.
|
||||
///
|
||||
/// If the codec id does not match the code's name, an error is returned.
|
||||
pub fn to_codec<C: Codec>(&self) -> crate::Result<C> {
|
||||
if self.codec_id != C::ID {
|
||||
return Err(crate::TantivyError::InvalidArgument(format!(
|
||||
"Codec id mismatch: expected {}, got {}",
|
||||
C::ID,
|
||||
self.codec_id
|
||||
)));
|
||||
}
|
||||
C::from_json_props(&self.props)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, C: Codec> From<&'a C> for CodecConfiguration {
|
||||
fn from(codec: &'a C) -> Self {
|
||||
CodecConfiguration {
|
||||
codec_id: Cow::Borrowed(C::ID),
|
||||
props: codec.to_json_props(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for CodecConfiguration {
|
||||
fn default() -> Self {
|
||||
CodecConfiguration::from(&StandardCodec)
|
||||
}
|
||||
}
|
||||
@@ -8,14 +8,12 @@ use std::thread::available_parallelism;
|
||||
use super::segment::Segment;
|
||||
use super::segment_reader::merge_field_meta_data;
|
||||
use super::{FieldMetadata, IndexSettings};
|
||||
use crate::codec::StandardCodec;
|
||||
use crate::core::{Executor, META_FILEPATH};
|
||||
use crate::directory::error::OpenReadError;
|
||||
#[cfg(feature = "mmap")]
|
||||
use crate::directory::MmapDirectory;
|
||||
use crate::directory::{Directory, ManagedDirectory, RamDirectory, INDEX_WRITER_LOCK};
|
||||
use crate::error::{DataCorruption, TantivyError};
|
||||
use crate::index::codec_configuration::CodecConfiguration;
|
||||
use crate::index::{IndexMeta, SegmentId, SegmentMeta, SegmentMetaInventory};
|
||||
use crate::indexer::index_writer::{
|
||||
IndexWriterOptions, MAX_NUM_THREAD, MEMORY_BUDGET_NUM_BYTES_MIN,
|
||||
@@ -26,6 +24,7 @@ use crate::reader::{IndexReader, IndexReaderBuilder};
|
||||
use crate::schema::document::Document;
|
||||
use crate::schema::{Field, FieldType, Schema};
|
||||
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
|
||||
use crate::SegmentReader;
|
||||
|
||||
fn load_metas(
|
||||
directory: &dyn Directory,
|
||||
@@ -60,7 +59,6 @@ fn save_new_metas(
|
||||
schema: Schema,
|
||||
index_settings: IndexSettings,
|
||||
directory: &dyn Directory,
|
||||
codec: CodecConfiguration,
|
||||
) -> crate::Result<()> {
|
||||
save_metas(
|
||||
&IndexMeta {
|
||||
@@ -69,7 +67,6 @@ fn save_new_metas(
|
||||
schema,
|
||||
opstamp: 0u64,
|
||||
payload: None,
|
||||
codec,
|
||||
},
|
||||
directory,
|
||||
)?;
|
||||
@@ -104,21 +101,18 @@ fn save_new_metas(
|
||||
/// };
|
||||
/// let index = Index::builder().schema(schema).settings(settings).create_in_ram();
|
||||
/// ```
|
||||
pub struct IndexBuilder<Codec: crate::codec::Codec = StandardCodec> {
|
||||
pub struct IndexBuilder {
|
||||
schema: Option<Schema>,
|
||||
index_settings: IndexSettings,
|
||||
tokenizer_manager: TokenizerManager,
|
||||
fast_field_tokenizer_manager: TokenizerManager,
|
||||
codec: Codec,
|
||||
}
|
||||
|
||||
impl Default for IndexBuilder<StandardCodec> {
|
||||
impl Default for IndexBuilder {
|
||||
fn default() -> Self {
|
||||
IndexBuilder::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl IndexBuilder<StandardCodec> {
|
||||
impl IndexBuilder {
|
||||
/// Creates a new `IndexBuilder`
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
@@ -126,21 +120,6 @@ impl IndexBuilder<StandardCodec> {
|
||||
index_settings: IndexSettings::default(),
|
||||
tokenizer_manager: TokenizerManager::default(),
|
||||
fast_field_tokenizer_manager: TokenizerManager::default(),
|
||||
codec: StandardCodec,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<Codec: crate::codec::Codec> IndexBuilder<Codec> {
|
||||
/// Set the codec
|
||||
#[must_use]
|
||||
pub fn codec<NewCodec: crate::codec::Codec>(self, codec: NewCodec) -> IndexBuilder<NewCodec> {
|
||||
IndexBuilder {
|
||||
schema: self.schema,
|
||||
index_settings: self.index_settings,
|
||||
tokenizer_manager: self.tokenizer_manager,
|
||||
fast_field_tokenizer_manager: self.fast_field_tokenizer_manager,
|
||||
codec,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -175,7 +154,7 @@ impl<Codec: crate::codec::Codec> IndexBuilder<Codec> {
|
||||
/// The index will be allocated in anonymous memory.
|
||||
/// This is useful for indexing small set of documents
|
||||
/// for instances like unit test or temporary in memory index.
|
||||
pub fn create_in_ram(self) -> Result<Index<Codec>, TantivyError> {
|
||||
pub fn create_in_ram(self) -> Result<Index, TantivyError> {
|
||||
let ram_directory = RamDirectory::create();
|
||||
self.create(ram_directory)
|
||||
}
|
||||
@@ -186,7 +165,7 @@ impl<Codec: crate::codec::Codec> IndexBuilder<Codec> {
|
||||
/// If a previous index was in this directory, it returns an
|
||||
/// [`TantivyError::IndexAlreadyExists`] error.
|
||||
#[cfg(feature = "mmap")]
|
||||
pub fn create_in_dir<P: AsRef<Path>>(self, directory_path: P) -> crate::Result<Index<Codec>> {
|
||||
pub fn create_in_dir<P: AsRef<Path>>(self, directory_path: P) -> crate::Result<Index> {
|
||||
let mmap_directory: Box<dyn Directory> = Box::new(MmapDirectory::open(directory_path)?);
|
||||
if Index::exists(&*mmap_directory)? {
|
||||
return Err(TantivyError::IndexAlreadyExists);
|
||||
@@ -207,7 +186,7 @@ impl<Codec: crate::codec::Codec> IndexBuilder<Codec> {
|
||||
self,
|
||||
dir: impl Into<Box<dyn Directory>>,
|
||||
mem_budget: usize,
|
||||
) -> crate::Result<SingleSegmentIndexWriter<Codec, D>> {
|
||||
) -> crate::Result<SingleSegmentIndexWriter<D>> {
|
||||
let index = self.create(dir)?;
|
||||
let index_simple_writer = SingleSegmentIndexWriter::new(index, mem_budget)?;
|
||||
Ok(index_simple_writer)
|
||||
@@ -223,7 +202,7 @@ impl<Codec: crate::codec::Codec> IndexBuilder<Codec> {
|
||||
/// For other unit tests, prefer the [`RamDirectory`], see:
|
||||
/// [`IndexBuilder::create_in_ram()`].
|
||||
#[cfg(feature = "mmap")]
|
||||
pub fn create_from_tempdir(self) -> crate::Result<Index<Codec>> {
|
||||
pub fn create_from_tempdir(self) -> crate::Result<Index> {
|
||||
let mmap_directory: Box<dyn Directory> = Box::new(MmapDirectory::create_from_tempdir()?);
|
||||
self.create(mmap_directory)
|
||||
}
|
||||
@@ -236,15 +215,12 @@ impl<Codec: crate::codec::Codec> IndexBuilder<Codec> {
|
||||
}
|
||||
|
||||
/// Opens or creates a new index in the provided directory
|
||||
pub fn open_or_create<T: Into<Box<dyn Directory>>>(
|
||||
self,
|
||||
dir: T,
|
||||
) -> crate::Result<Index<Codec>> {
|
||||
pub fn open_or_create<T: Into<Box<dyn Directory>>>(self, dir: T) -> crate::Result<Index> {
|
||||
let dir: Box<dyn Directory> = dir.into();
|
||||
if !Index::exists(&*dir)? {
|
||||
return self.create(dir);
|
||||
}
|
||||
let mut index: Index<Codec> = Index::<Codec>::open_with_codec(dir)?;
|
||||
let mut index = Index::open(dir)?;
|
||||
index.set_tokenizers(self.tokenizer_manager.clone());
|
||||
if index.schema() == self.get_expect_schema()? {
|
||||
Ok(index)
|
||||
@@ -268,25 +244,18 @@ impl<Codec: crate::codec::Codec> IndexBuilder<Codec> {
|
||||
/// Creates a new index given an implementation of the trait `Directory`.
|
||||
///
|
||||
/// If a directory previously existed, it will be erased.
|
||||
pub fn create<T: Into<Box<dyn Directory>>>(self, dir: T) -> crate::Result<Index<Codec>> {
|
||||
self.create_avoid_monomorphization(dir.into())
|
||||
}
|
||||
|
||||
fn create_avoid_monomorphization(self, dir: Box<dyn Directory>) -> crate::Result<Index<Codec>> {
|
||||
fn create<T: Into<Box<dyn Directory>>>(self, dir: T) -> crate::Result<Index> {
|
||||
self.validate()?;
|
||||
let dir = dir.into();
|
||||
let directory = ManagedDirectory::wrap(dir)?;
|
||||
let codec: CodecConfiguration = CodecConfiguration::from(&self.codec);
|
||||
save_new_metas(
|
||||
self.get_expect_schema()?,
|
||||
self.index_settings.clone(),
|
||||
&directory,
|
||||
codec,
|
||||
)?;
|
||||
let schema = self.get_expect_schema()?;
|
||||
let mut metas = IndexMeta::with_schema_and_codec(schema, &self.codec);
|
||||
let mut metas = IndexMeta::with_schema(self.get_expect_schema()?);
|
||||
metas.index_settings = self.index_settings;
|
||||
let mut index: Index<Codec> =
|
||||
Index::<Codec>::open_from_metas(directory, &metas, SegmentMetaInventory::default())?;
|
||||
let mut index = Index::open_from_metas(directory, &metas, SegmentMetaInventory::default());
|
||||
index.set_tokenizers(self.tokenizer_manager);
|
||||
index.set_fast_field_tokenizers(self.fast_field_tokenizer_manager);
|
||||
Ok(index)
|
||||
@@ -295,7 +264,7 @@ impl<Codec: crate::codec::Codec> IndexBuilder<Codec> {
|
||||
|
||||
/// Search Index
|
||||
#[derive(Clone)]
|
||||
pub struct Index<Codec: crate::codec::Codec = crate::codec::StandardCodec> {
|
||||
pub struct Index {
|
||||
directory: ManagedDirectory,
|
||||
schema: Schema,
|
||||
settings: IndexSettings,
|
||||
@@ -303,7 +272,6 @@ pub struct Index<Codec: crate::codec::Codec = crate::codec::StandardCodec> {
|
||||
tokenizers: TokenizerManager,
|
||||
fast_field_tokenizers: TokenizerManager,
|
||||
inventory: SegmentMetaInventory,
|
||||
codec: Codec,
|
||||
}
|
||||
|
||||
impl Index {
|
||||
@@ -311,6 +279,41 @@ impl Index {
|
||||
pub fn builder() -> IndexBuilder {
|
||||
IndexBuilder::new()
|
||||
}
|
||||
/// Examines the directory to see if it contains an index.
|
||||
///
|
||||
/// Effectively, it only checks for the presence of the `meta.json` file.
|
||||
pub fn exists(dir: &dyn Directory) -> Result<bool, OpenReadError> {
|
||||
dir.exists(&META_FILEPATH)
|
||||
}
|
||||
|
||||
/// Accessor to the search executor.
|
||||
///
|
||||
/// This pool is used by default when calling `searcher.search(...)`
|
||||
/// to perform search on the individual segments.
|
||||
///
|
||||
/// By default the executor is single thread, and simply runs in the calling thread.
|
||||
pub fn search_executor(&self) -> &Executor {
|
||||
&self.executor
|
||||
}
|
||||
|
||||
/// Replace the default single thread search executor pool
|
||||
/// by a thread pool with a given number of threads.
|
||||
pub fn set_multithread_executor(&mut self, num_threads: usize) -> crate::Result<()> {
|
||||
self.executor = Executor::multi_thread(num_threads, "tantivy-search-")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Custom thread pool by a outer thread pool.
|
||||
pub fn set_executor(&mut self, executor: Executor) {
|
||||
self.executor = executor;
|
||||
}
|
||||
|
||||
/// Replace the default single thread search executor pool
|
||||
/// by a thread pool with as many threads as there are CPUs on the system.
|
||||
pub fn set_default_multithread_executor(&mut self) -> crate::Result<()> {
|
||||
let default_num_threads = available_parallelism()?.get();
|
||||
self.set_multithread_executor(default_num_threads)
|
||||
}
|
||||
|
||||
/// Creates a new index using the [`RamDirectory`].
|
||||
///
|
||||
@@ -321,13 +324,6 @@ impl Index {
|
||||
IndexBuilder::new().schema(schema).create_in_ram().unwrap()
|
||||
}
|
||||
|
||||
/// Examines the directory to see if it contains an index.
|
||||
///
|
||||
/// Effectively, it only checks for the presence of the `meta.json` file.
|
||||
pub fn exists(directory: &dyn Directory) -> Result<bool, OpenReadError> {
|
||||
directory.exists(&META_FILEPATH)
|
||||
}
|
||||
|
||||
/// Creates a new index in a given filepath.
|
||||
/// The index will use the [`MmapDirectory`].
|
||||
///
|
||||
@@ -374,92 +370,20 @@ impl Index {
|
||||
schema: Schema,
|
||||
settings: IndexSettings,
|
||||
) -> crate::Result<Index> {
|
||||
Self::create_to_avoid_monomorphization(dir.into(), schema, settings)
|
||||
}
|
||||
|
||||
fn create_to_avoid_monomorphization(
|
||||
dir: Box<dyn Directory>,
|
||||
schema: Schema,
|
||||
settings: IndexSettings,
|
||||
) -> crate::Result<Index> {
|
||||
let dir: Box<dyn Directory> = dir.into();
|
||||
let mut builder = IndexBuilder::new().schema(schema);
|
||||
builder = builder.settings(settings);
|
||||
builder.create(dir)
|
||||
}
|
||||
|
||||
/// Opens a new directory from an index path.
|
||||
#[cfg(feature = "mmap")]
|
||||
pub fn open_in_dir<P: AsRef<Path>>(directory_path: P) -> crate::Result<Index> {
|
||||
Self::open_in_dir_to_avoid_monomorphization(directory_path.as_ref())
|
||||
}
|
||||
|
||||
#[cfg(feature = "mmap")]
|
||||
#[inline(never)]
|
||||
fn open_in_dir_to_avoid_monomorphization(directory_path: &Path) -> crate::Result<Index> {
|
||||
let mmap_directory = MmapDirectory::open(directory_path)?;
|
||||
Index::open(mmap_directory)
|
||||
}
|
||||
|
||||
/// Open the index using the provided directory
|
||||
pub fn open<T: Into<Box<dyn Directory>>>(directory: T) -> crate::Result<Index> {
|
||||
Index::<StandardCodec>::open_with_codec(directory.into())
|
||||
}
|
||||
}
|
||||
|
||||
impl<Codec: crate::codec::Codec> Index<Codec> {
|
||||
/// Open the index using the provided directory
|
||||
#[inline(never)]
|
||||
pub fn open_with_codec(directory: Box<dyn Directory>) -> crate::Result<Index<Codec>> {
|
||||
let directory = ManagedDirectory::wrap(directory)?;
|
||||
let inventory = SegmentMetaInventory::default();
|
||||
let metas = load_metas(&directory, &inventory)?;
|
||||
let index: Index<Codec> = Index::<Codec>::open_from_metas(directory, &metas, inventory)?;
|
||||
Ok(index)
|
||||
}
|
||||
|
||||
/// Accessor to the codec.
|
||||
pub fn codec(&self) -> &Codec {
|
||||
&self.codec
|
||||
}
|
||||
|
||||
/// Accessor to the search executor.
|
||||
///
|
||||
/// This pool is used by default when calling `searcher.search(...)`
|
||||
/// to perform search on the individual segments.
|
||||
///
|
||||
/// By default the executor is single thread, and simply runs in the calling thread.
|
||||
pub fn search_executor(&self) -> &Executor {
|
||||
&self.executor
|
||||
}
|
||||
|
||||
/// Replace the default single thread search executor pool
|
||||
/// by a thread pool with a given number of threads.
|
||||
pub fn set_multithread_executor(&mut self, num_threads: usize) -> crate::Result<()> {
|
||||
self.executor = Executor::multi_thread(num_threads, "tantivy-search-")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Custom thread pool by a outer thread pool.
|
||||
pub fn set_executor(&mut self, executor: Executor) {
|
||||
self.executor = executor;
|
||||
}
|
||||
|
||||
/// Replace the default single thread search executor pool
|
||||
/// by a thread pool with as many threads as there are CPUs on the system.
|
||||
pub fn set_default_multithread_executor(&mut self) -> crate::Result<()> {
|
||||
let default_num_threads = available_parallelism()?.get();
|
||||
self.set_multithread_executor(default_num_threads)
|
||||
}
|
||||
|
||||
/// Creates a new index given a directory and an [`IndexMeta`].
|
||||
fn open_from_metas<C: crate::codec::Codec>(
|
||||
fn open_from_metas(
|
||||
directory: ManagedDirectory,
|
||||
metas: &IndexMeta,
|
||||
inventory: SegmentMetaInventory,
|
||||
) -> crate::Result<Index<C>> {
|
||||
) -> Index {
|
||||
let schema = metas.schema.clone();
|
||||
let codec = metas.codec.to_codec::<C>()?;
|
||||
Ok(Index {
|
||||
Index {
|
||||
settings: metas.index_settings.clone(),
|
||||
directory,
|
||||
schema,
|
||||
@@ -467,8 +391,7 @@ impl<Codec: crate::codec::Codec> Index<Codec> {
|
||||
fast_field_tokenizers: TokenizerManager::default(),
|
||||
executor: Executor::single_thread(),
|
||||
inventory,
|
||||
codec,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Setter for the tokenizer manager.
|
||||
@@ -524,7 +447,7 @@ impl<Codec: crate::codec::Codec> Index<Codec> {
|
||||
/// Create a default [`IndexReader`] for the given index.
|
||||
///
|
||||
/// See [`Index.reader_builder()`].
|
||||
pub fn reader(&self) -> crate::Result<IndexReader<Codec>> {
|
||||
pub fn reader(&self) -> crate::Result<IndexReader> {
|
||||
self.reader_builder().try_into()
|
||||
}
|
||||
|
||||
@@ -532,10 +455,17 @@ impl<Codec: crate::codec::Codec> Index<Codec> {
|
||||
///
|
||||
/// Most project should create at most one reader for a given index.
|
||||
/// This method is typically called only once per `Index` instance.
|
||||
pub fn reader_builder(&self) -> IndexReaderBuilder<Codec> {
|
||||
pub fn reader_builder(&self) -> IndexReaderBuilder {
|
||||
IndexReaderBuilder::new(self.clone())
|
||||
}
|
||||
|
||||
/// Opens a new directory from an index path.
|
||||
#[cfg(feature = "mmap")]
|
||||
pub fn open_in_dir<P: AsRef<Path>>(directory_path: P) -> crate::Result<Index> {
|
||||
let mmap_directory = MmapDirectory::open(directory_path)?;
|
||||
Index::open(mmap_directory)
|
||||
}
|
||||
|
||||
/// Returns the list of the segment metas tracked by the index.
|
||||
///
|
||||
/// Such segments can of course be part of the index,
|
||||
@@ -562,15 +492,7 @@ impl<Codec: crate::codec::Codec> Index<Codec> {
|
||||
let segments = self.searchable_segments()?;
|
||||
let fields_metadata: Vec<Vec<FieldMetadata>> = segments
|
||||
.into_iter()
|
||||
.map(|segment| {
|
||||
let segment_reader = segment.index().codec().open_segment_reader(
|
||||
segment.index().directory(),
|
||||
segment.meta(),
|
||||
segment.schema(),
|
||||
None,
|
||||
)?;
|
||||
segment_reader.fields_metadata()
|
||||
})
|
||||
.map(|segment| SegmentReader::open(&segment)?.fields_metadata())
|
||||
.collect::<Result<_, _>>()?;
|
||||
Ok(merge_field_meta_data(fields_metadata))
|
||||
}
|
||||
@@ -584,6 +506,16 @@ impl<Codec: crate::codec::Codec> Index<Codec> {
|
||||
self.inventory.new_segment_meta(segment_id, max_doc)
|
||||
}
|
||||
|
||||
/// Open the index using the provided directory
|
||||
pub fn open<T: Into<Box<dyn Directory>>>(directory: T) -> crate::Result<Index> {
|
||||
let directory = directory.into();
|
||||
let directory = ManagedDirectory::wrap(directory)?;
|
||||
let inventory = SegmentMetaInventory::default();
|
||||
let metas = load_metas(&directory, &inventory)?;
|
||||
let index = Index::open_from_metas(directory, &metas, inventory);
|
||||
Ok(index)
|
||||
}
|
||||
|
||||
/// Reads the index meta file from the directory.
|
||||
pub fn load_metas(&self) -> crate::Result<IndexMeta> {
|
||||
load_metas(self.directory(), &self.inventory)
|
||||
@@ -607,7 +539,7 @@ impl<Codec: crate::codec::Codec> Index<Codec> {
|
||||
pub fn writer_with_options<D: Document>(
|
||||
&self,
|
||||
options: IndexWriterOptions,
|
||||
) -> crate::Result<IndexWriter<Codec, D>> {
|
||||
) -> crate::Result<IndexWriter<D>> {
|
||||
let directory_lock = self
|
||||
.directory
|
||||
.acquire_lock(&INDEX_WRITER_LOCK)
|
||||
@@ -649,7 +581,7 @@ impl<Codec: crate::codec::Codec> Index<Codec> {
|
||||
&self,
|
||||
num_threads: usize,
|
||||
overall_memory_budget_in_bytes: usize,
|
||||
) -> crate::Result<IndexWriter<Codec, D>> {
|
||||
) -> crate::Result<IndexWriter<D>> {
|
||||
let memory_arena_in_bytes_per_thread = overall_memory_budget_in_bytes / num_threads;
|
||||
let options = IndexWriterOptions::builder()
|
||||
.num_worker_threads(num_threads)
|
||||
@@ -663,7 +595,7 @@ impl<Codec: crate::codec::Codec> Index<Codec> {
|
||||
/// That index writer only simply has a single thread and a memory budget of 15 MB.
|
||||
/// Using a single thread gives us a deterministic allocation of DocId.
|
||||
#[cfg(test)]
|
||||
pub fn writer_for_tests<D: Document>(&self) -> crate::Result<IndexWriter<Codec, D>> {
|
||||
pub fn writer_for_tests<D: Document>(&self) -> crate::Result<IndexWriter<D>> {
|
||||
self.writer_with_num_threads(1, MEMORY_BUDGET_NUM_BYTES_MIN)
|
||||
}
|
||||
|
||||
@@ -681,7 +613,7 @@ impl<Codec: crate::codec::Codec> Index<Codec> {
|
||||
pub fn writer<D: Document>(
|
||||
&self,
|
||||
memory_budget_in_bytes: usize,
|
||||
) -> crate::Result<IndexWriter<Codec, D>> {
|
||||
) -> crate::Result<IndexWriter<D>> {
|
||||
let mut num_threads = std::cmp::min(available_parallelism()?.get(), MAX_NUM_THREAD);
|
||||
let memory_budget_num_bytes_per_thread = memory_budget_in_bytes / num_threads;
|
||||
if memory_budget_num_bytes_per_thread < MEMORY_BUDGET_NUM_BYTES_MIN {
|
||||
@@ -708,7 +640,7 @@ impl<Codec: crate::codec::Codec> Index<Codec> {
|
||||
}
|
||||
|
||||
/// Returns the list of segments that are searchable
|
||||
pub fn searchable_segments(&self) -> crate::Result<Vec<Segment<Codec>>> {
|
||||
pub fn searchable_segments(&self) -> crate::Result<Vec<Segment>> {
|
||||
Ok(self
|
||||
.searchable_segment_metas()?
|
||||
.into_iter()
|
||||
@@ -717,12 +649,12 @@ impl<Codec: crate::codec::Codec> Index<Codec> {
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub fn segment(&self, segment_meta: SegmentMeta) -> Segment<Codec> {
|
||||
pub fn segment(&self, segment_meta: SegmentMeta) -> Segment {
|
||||
Segment::for_index(self.clone(), segment_meta)
|
||||
}
|
||||
|
||||
/// Creates a new segment.
|
||||
pub fn new_segment(&self) -> Segment<Codec> {
|
||||
pub fn new_segment(&self) -> Segment {
|
||||
let segment_meta = self
|
||||
.inventory
|
||||
.new_segment_meta(SegmentId::generate_random(), 0);
|
||||
@@ -776,7 +708,7 @@ impl<Codec: crate::codec::Codec> Index<Codec> {
|
||||
}
|
||||
|
||||
impl fmt::Debug for Index {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "Index({:?})", self.directory)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,8 +5,7 @@ use std::path::PathBuf;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::SegmentComponent;
|
||||
use crate::codec::Codec;
|
||||
use crate::index::{CodecConfiguration, SegmentId};
|
||||
use crate::index::SegmentId;
|
||||
use crate::schema::Schema;
|
||||
use crate::store::Compressor;
|
||||
use crate::{Inventory, Opstamp, TrackedObject};
|
||||
@@ -287,10 +286,8 @@ pub struct IndexMeta {
|
||||
/// This payload is entirely unused by tantivy.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub payload: Option<String>,
|
||||
/// Codec configuration for the index.
|
||||
#[serde(skip_serializing_if = "CodecConfiguration::is_standard")]
|
||||
pub codec: CodecConfiguration,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Debug)]
|
||||
struct UntrackedIndexMeta {
|
||||
pub segments: Vec<InnerSegmentMeta>,
|
||||
@@ -300,8 +297,6 @@ struct UntrackedIndexMeta {
|
||||
pub opstamp: Opstamp,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub payload: Option<String>,
|
||||
#[serde(default)]
|
||||
pub codec: CodecConfiguration,
|
||||
}
|
||||
|
||||
impl UntrackedIndexMeta {
|
||||
@@ -316,7 +311,6 @@ impl UntrackedIndexMeta {
|
||||
schema: self.schema,
|
||||
opstamp: self.opstamp,
|
||||
payload: self.payload,
|
||||
codec: self.codec,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -327,14 +321,13 @@ impl IndexMeta {
|
||||
///
|
||||
/// This new index does not contains any segments.
|
||||
/// Opstamp will the value `0u64`.
|
||||
pub fn with_schema_and_codec<C: Codec>(schema: Schema, codec: &C) -> IndexMeta {
|
||||
pub fn with_schema(schema: Schema) -> IndexMeta {
|
||||
IndexMeta {
|
||||
index_settings: IndexSettings::default(),
|
||||
segments: vec![],
|
||||
schema,
|
||||
opstamp: 0u64,
|
||||
payload: None,
|
||||
codec: CodecConfiguration::from(codec),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -385,38 +378,14 @@ mod tests {
|
||||
schema,
|
||||
opstamp: 0u64,
|
||||
payload: None,
|
||||
codec: Default::default(),
|
||||
};
|
||||
let json_value: serde_json::Value =
|
||||
serde_json::to_value(&index_metas).expect("serialization failed");
|
||||
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
|
||||
assert_eq!(
|
||||
&json_value,
|
||||
&serde_json::json!(
|
||||
{
|
||||
"index_settings": {
|
||||
"docstore_compression": "none",
|
||||
"docstore_blocksize": 16384
|
||||
},
|
||||
"segments": [],
|
||||
"schema": [
|
||||
{
|
||||
"name": "text",
|
||||
"type": "text",
|
||||
"options": {
|
||||
"indexing": {
|
||||
"record": "position",
|
||||
"fieldnorms": true,
|
||||
"tokenizer": "default"
|
||||
},
|
||||
"stored": false,
|
||||
"fast": false
|
||||
}
|
||||
}
|
||||
],
|
||||
"opstamp": 0
|
||||
})
|
||||
json,
|
||||
r#"{"index_settings":{"docstore_compression":"none","docstore_blocksize":16384},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
|
||||
);
|
||||
let deser_meta: UntrackedIndexMeta = serde_json::from_value(json_value).unwrap();
|
||||
|
||||
let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap();
|
||||
assert_eq!(index_metas.index_settings, deser_meta.index_settings);
|
||||
assert_eq!(index_metas.schema, deser_meta.schema);
|
||||
assert_eq!(index_metas.opstamp, deser_meta.opstamp);
|
||||
@@ -442,39 +411,14 @@ mod tests {
|
||||
schema,
|
||||
opstamp: 0u64,
|
||||
payload: None,
|
||||
codec: Default::default(),
|
||||
};
|
||||
let json_value = serde_json::to_value(&index_metas).expect("serialization failed");
|
||||
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
|
||||
assert_eq!(
|
||||
&json_value,
|
||||
&serde_json::json!(
|
||||
{
|
||||
"index_settings": {
|
||||
"docstore_compression": "zstd(compression_level=4)",
|
||||
"docstore_blocksize": 1000000
|
||||
},
|
||||
"segments": [],
|
||||
"schema": [
|
||||
{
|
||||
"name": "text",
|
||||
"type": "text",
|
||||
"options": {
|
||||
"indexing": {
|
||||
"record": "position",
|
||||
"fieldnorms": true,
|
||||
"tokenizer": "default"
|
||||
},
|
||||
"stored": false,
|
||||
"fast": false
|
||||
}
|
||||
}
|
||||
],
|
||||
"opstamp": 0
|
||||
}
|
||||
)
|
||||
json,
|
||||
r#"{"index_settings":{"docstore_compression":"zstd(compression_level=4)","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
|
||||
);
|
||||
|
||||
let deser_meta: UntrackedIndexMeta = serde_json::from_value(json_value).unwrap();
|
||||
let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap();
|
||||
assert_eq!(index_metas.index_settings, deser_meta.index_settings);
|
||||
assert_eq!(index_metas.schema, deser_meta.schema);
|
||||
assert_eq!(index_metas.opstamp, deser_meta.opstamp);
|
||||
|
||||
@@ -1,11 +1,7 @@
|
||||
#[cfg(feature = "quickwit")]
|
||||
use std::future::Future;
|
||||
use std::io;
|
||||
#[cfg(feature = "quickwit")]
|
||||
use std::pin::Pin;
|
||||
|
||||
use common::json_path_writer::JSON_END_OF_PATH;
|
||||
use common::{BinarySerializable, BitSet, ByteCount, OwnedBytes};
|
||||
use common::{BinarySerializable, ByteCount};
|
||||
#[cfg(feature = "quickwit")]
|
||||
use futures_util::{FutureExt, StreamExt, TryStreamExt};
|
||||
#[cfg(feature = "quickwit")]
|
||||
@@ -13,213 +9,38 @@ use itertools::Itertools;
|
||||
#[cfg(feature = "quickwit")]
|
||||
use tantivy_fst::automaton::{AlwaysMatch, Automaton};
|
||||
|
||||
use crate::codec::postings::RawPostingsData;
|
||||
use crate::codec::standard::postings::{
|
||||
fill_bitset_from_raw_data, load_postings_from_raw_data, SegmentPostings,
|
||||
};
|
||||
use crate::directory::FileSlice;
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::postings::{Postings, TermInfo};
|
||||
use crate::query::term_query::TermScorer;
|
||||
use crate::query::{box_scorer, Bm25Weight, PhraseScorer, Scorer};
|
||||
use crate::positions::PositionReader;
|
||||
use crate::postings::{BlockSegmentPostings, SegmentPostings, TermInfo};
|
||||
use crate::schema::{IndexRecordOption, Term, Type};
|
||||
use crate::termdict::TermDictionary;
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
pub type TermRangeBounds = (std::ops::Bound<Term>, std::ops::Bound<Term>);
|
||||
|
||||
/// Type-erased term scorer guaranteed to wrap a Tantivy [`TermScorer`].
|
||||
pub struct BoxedTermScorer(Box<dyn Scorer>);
|
||||
|
||||
impl BoxedTermScorer {
|
||||
/// Creates a boxed term scorer from a concrete Tantivy [`TermScorer`].
|
||||
pub fn new<TPostings: Postings>(term_scorer: TermScorer<TPostings>) -> BoxedTermScorer {
|
||||
BoxedTermScorer(box_scorer(term_scorer))
|
||||
}
|
||||
|
||||
/// Converts this boxed term scorer into a generic boxed scorer.
|
||||
pub fn into_boxed_scorer(self) -> Box<dyn Scorer> {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
/// Trait defining the contract for inverted index readers.
|
||||
pub trait InvertedIndexReader: Send + Sync {
|
||||
/// Returns the term info associated with the term.
|
||||
fn get_term_info(&self, term: &Term) -> io::Result<Option<TermInfo>> {
|
||||
self.terms().get(term.serialized_value_bytes())
|
||||
}
|
||||
|
||||
/// Return the term dictionary datastructure.
|
||||
fn terms(&self) -> &TermDictionary;
|
||||
|
||||
/// Return the fields and types encoded in the dictionary in lexicographic order.
|
||||
/// Only valid on JSON fields.
|
||||
///
|
||||
/// Notice: This requires a full scan and is therefore **very expensive**.
|
||||
fn list_encoded_json_fields(&self) -> io::Result<Vec<InvertedIndexFieldSpace>>;
|
||||
|
||||
/// Build a new term scorer.
|
||||
fn new_term_scorer(
|
||||
&self,
|
||||
term_info: &TermInfo,
|
||||
option: IndexRecordOption,
|
||||
fieldnorm_reader: FieldNormReader,
|
||||
similarity_weight: Bm25Weight,
|
||||
) -> io::Result<BoxedTermScorer>;
|
||||
|
||||
/// Returns a posting object given a `term_info`.
|
||||
/// This method is for an advanced usage only.
|
||||
///
|
||||
/// Most users should prefer using [`Self::read_postings()`] instead.
|
||||
fn read_postings_from_terminfo(
|
||||
&self,
|
||||
term_info: &TermInfo,
|
||||
option: IndexRecordOption,
|
||||
) -> io::Result<Box<dyn Postings>>;
|
||||
|
||||
/// Returns the raw postings bytes and metadata for a term.
|
||||
fn read_raw_postings_data(
|
||||
&self,
|
||||
term_info: &TermInfo,
|
||||
option: IndexRecordOption,
|
||||
) -> io::Result<RawPostingsData>;
|
||||
|
||||
/// Fills a bitset with documents containing the term.
|
||||
///
|
||||
/// Implementers can override this to avoid boxing postings.
|
||||
fn fill_bitset_for_term(
|
||||
&self,
|
||||
term_info: &TermInfo,
|
||||
option: IndexRecordOption,
|
||||
doc_bitset: &mut BitSet,
|
||||
) -> io::Result<()> {
|
||||
let mut postings = self.read_postings_from_terminfo(term_info, option)?;
|
||||
postings.fill_bitset(doc_bitset);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Builds a phrase scorer for the given term infos.
|
||||
fn new_phrase_scorer(
|
||||
&self,
|
||||
term_infos: &[(usize, TermInfo)],
|
||||
similarity_weight: Option<Bm25Weight>,
|
||||
fieldnorm_reader: FieldNormReader,
|
||||
slop: u32,
|
||||
) -> io::Result<Box<dyn Scorer>>;
|
||||
|
||||
/// Returns the total number of tokens recorded for all documents
|
||||
/// (including deleted documents).
|
||||
fn total_num_tokens(&self) -> u64;
|
||||
|
||||
/// Returns the segment postings associated with the term, and with the given option,
|
||||
/// or `None` if the term has never been encountered and indexed.
|
||||
fn read_postings(
|
||||
&self,
|
||||
term: &Term,
|
||||
option: IndexRecordOption,
|
||||
) -> io::Result<Option<Box<dyn Postings>>> {
|
||||
self.get_term_info(term)?
|
||||
.map(move |term_info| self.read_postings_from_terminfo(&term_info, option))
|
||||
.transpose()
|
||||
}
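// A minimal, hypothetical sketch (not part of this diff) of the `Option::map` +
// `transpose` pattern used by the `read_postings` default above: an optional lookup
// followed by a fallible load becomes one `io::Result<Option<_>>` without nested matches.
use std::io;

fn load_doubled(key: &str) -> io::Result<Option<u64>> {
    // Hypothetical lookup: the key may be absent.
    let maybe_offset: Option<u64> = if key == "hit" { Some(42) } else { None };
    maybe_offset
        .map(|offset| -> io::Result<u64> {
            // Fallible "load" step, analogous to read_postings_from_terminfo.
            Ok(offset * 2)
        })
        .transpose() // Option<io::Result<u64>> -> io::Result<Option<u64>>
}

fn main() -> io::Result<()> {
    assert_eq!(load_doubled("hit")?, Some(84));
    assert_eq!(load_doubled("miss")?, None);
    Ok(())
}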
|
||||
|
||||
/// Returns the number of documents containing the term.
|
||||
fn doc_freq(&self, term: &Term) -> io::Result<u32>;
|
||||
|
||||
/// Returns the number of documents containing the term asynchronously.
|
||||
#[cfg(feature = "quickwit")]
|
||||
fn doc_freq_async<'a>(
|
||||
&'a self,
|
||||
term: &'a Term,
|
||||
) -> Pin<Box<dyn Future<Output = io::Result<u32>> + Send + 'a>>;
|
||||
|
||||
/// Warmup fieldnorm readers for this inverted index field.
|
||||
#[cfg(feature = "quickwit")]
|
||||
fn warm_fieldnorms_readers<'a>(
|
||||
&'a self,
|
||||
) -> Pin<Box<dyn Future<Output = io::Result<()>> + Send + 'a>>;
|
||||
|
||||
/// Warmup the block postings for all terms.
|
||||
///
|
||||
/// Default implementation is a no-op.
|
||||
#[cfg(feature = "quickwit")]
|
||||
fn warm_postings_full<'a>(
|
||||
&'a self,
|
||||
_with_positions: bool,
|
||||
) -> Pin<Box<dyn Future<Output = io::Result<()>> + Send + 'a>> {
|
||||
Box::pin(async { Ok(()) })
|
||||
}
|
||||
|
||||
/// Warmup a block postings given a `Term`.
|
||||
///
|
||||
/// Returns whether the term was found in the dictionary.
|
||||
#[cfg(feature = "quickwit")]
|
||||
fn warm_postings<'a>(
|
||||
&'a self,
|
||||
term: &'a Term,
|
||||
with_positions: bool,
|
||||
) -> Pin<Box<dyn Future<Output = io::Result<bool>> + Send + 'a>>;
|
||||
|
||||
/// Warmup block postings for terms in a range.
|
||||
///
|
||||
/// Returns whether at least one matching term was found.
|
||||
#[cfg(feature = "quickwit")]
|
||||
fn warm_postings_range<'a>(
|
||||
&'a self,
|
||||
terms: TermRangeBounds,
|
||||
limit: Option<u64>,
|
||||
with_positions: bool,
|
||||
) -> Pin<Box<dyn Future<Output = io::Result<bool>> + Send + 'a>>;
|
||||
|
||||
/// Warmup block postings for terms matching an automaton.
|
||||
///
|
||||
/// Returns whether at least one matching term was found.
|
||||
#[cfg(feature = "quickwit")]
|
||||
fn warm_postings_automaton<'a, A: Automaton + Clone + Send + Sync + 'static>(
|
||||
&'a self,
|
||||
automaton: A,
|
||||
) -> Pin<Box<dyn Future<Output = io::Result<bool>> + Send + 'a>>
|
||||
where
|
||||
A::State: Clone + Send,
|
||||
Self: Sized;
|
||||
}
|
||||
|
||||
/// Tantivy's default inverted index reader implementation.
|
||||
///
|
||||
/// The inverted index reader is in charge of accessing
|
||||
/// the inverted index associated with a specific field.
|
||||
///
|
||||
/// # Note
|
||||
///
|
||||
/// It is safe to delete the segment associated with
|
||||
/// an `InvertedIndexReader` implementation. As long as it is open,
|
||||
/// an `InvertedIndexReader`. As long as it is open,
|
||||
/// the [`FileSlice`] it is relying on should
|
||||
/// stay available.
|
||||
///
|
||||
/// `TantivyInvertedIndexReader` instances are created by calling
|
||||
/// `InvertedIndexReader` are created by calling
|
||||
/// [`SegmentReader::inverted_index()`](crate::SegmentReader::inverted_index).
|
||||
pub struct TantivyInvertedIndexReader {
|
||||
pub struct InvertedIndexReader {
|
||||
termdict: TermDictionary,
|
||||
postings_file_slice: FileSlice,
|
||||
positions_file_slice: FileSlice,
|
||||
#[cfg_attr(not(feature = "quickwit"), allow(dead_code))]
|
||||
fieldnorms_file_slice: FileSlice,
|
||||
record_option: IndexRecordOption,
|
||||
total_num_tokens: u64,
|
||||
}
|
||||
|
||||
/// Object that records the amount of space used by a field in an inverted index.
|
||||
pub struct InvertedIndexFieldSpace {
|
||||
/// Field name as encoded in the term dictionary.
|
||||
pub(crate) struct InvertedIndexFieldSpace {
|
||||
pub field_name: String,
|
||||
/// Value type for the encoded field.
|
||||
pub field_type: Type,
|
||||
/// Total bytes used by postings for this field.
|
||||
pub postings_size: ByteCount,
|
||||
/// Total bytes used by positions for this field.
|
||||
pub positions_size: ByteCount,
|
||||
/// Number of terms in the field.
|
||||
pub num_terms: u64,
|
||||
}
|
||||
|
||||
@@ -241,86 +62,52 @@ impl InvertedIndexFieldSpace {
|
||||
}
|
||||
}
|
||||
|
||||
impl TantivyInvertedIndexReader {
|
||||
pub(crate) fn read_raw_postings_data_inner(
|
||||
&self,
|
||||
term_info: &TermInfo,
|
||||
option: IndexRecordOption,
|
||||
) -> io::Result<RawPostingsData> {
|
||||
let effective_option = option.downgrade(self.record_option);
|
||||
let postings_data = self
|
||||
.postings_file_slice
|
||||
.slice(term_info.postings_range.clone())
|
||||
.read_bytes()?;
|
||||
let positions_data: Option<OwnedBytes> = if effective_option.has_positions() {
|
||||
let positions_data = self
|
||||
.positions_file_slice
|
||||
.slice(term_info.positions_range.clone())
|
||||
.read_bytes()?;
|
||||
Some(positions_data)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
Ok(RawPostingsData {
|
||||
postings_data,
|
||||
positions_data,
|
||||
record_option: self.record_option,
|
||||
effective_option,
|
||||
})
|
||||
}
|
||||
|
||||
/// Opens an inverted index reader from already-loaded term/postings/positions slices.
|
||||
///
|
||||
/// The first 8 bytes of `postings_file_slice` are expected to contain
|
||||
/// the serialized total token count.
|
||||
pub fn new(
|
||||
impl InvertedIndexReader {
|
||||
pub(crate) fn new(
|
||||
termdict: TermDictionary,
|
||||
postings_file_slice: FileSlice,
|
||||
positions_file_slice: FileSlice,
|
||||
fieldnorms_file_slice: FileSlice,
|
||||
record_option: IndexRecordOption,
|
||||
) -> io::Result<TantivyInvertedIndexReader> {
|
||||
) -> io::Result<InvertedIndexReader> {
|
||||
let (total_num_tokens_slice, postings_body) = postings_file_slice.split(8);
|
||||
let total_num_tokens = u64::deserialize(&mut total_num_tokens_slice.read_bytes()?)?;
|
||||
Ok(TantivyInvertedIndexReader {
|
||||
Ok(InvertedIndexReader {
|
||||
termdict,
|
||||
postings_file_slice: postings_body,
|
||||
positions_file_slice,
|
||||
fieldnorms_file_slice,
|
||||
record_option,
|
||||
total_num_tokens,
|
||||
})
|
||||
}
|
||||
|
||||
/// Creates an empty `TantivyInvertedIndexReader` object, which
|
||||
/// Creates an empty `InvertedIndexReader` object, which
|
||||
/// contains no terms at all.
|
||||
pub fn empty(record_option: IndexRecordOption) -> TantivyInvertedIndexReader {
|
||||
TantivyInvertedIndexReader {
|
||||
pub fn empty(record_option: IndexRecordOption) -> InvertedIndexReader {
|
||||
InvertedIndexReader {
|
||||
termdict: TermDictionary::empty(),
|
||||
postings_file_slice: FileSlice::empty(),
|
||||
positions_file_slice: FileSlice::empty(),
|
||||
fieldnorms_file_slice: FileSlice::empty(),
|
||||
record_option,
|
||||
total_num_tokens: 0u64,
|
||||
}
|
||||
}
|
||||
|
||||
fn load_segment_postings(
|
||||
&self,
|
||||
term_info: &TermInfo,
|
||||
option: IndexRecordOption,
|
||||
) -> io::Result<SegmentPostings> {
|
||||
let postings_data = self.read_raw_postings_data_inner(term_info, option)?;
|
||||
load_postings_from_raw_data(term_info.doc_freq, postings_data)
|
||||
/// Returns the term info associated with the term.
|
||||
pub fn get_term_info(&self, term: &Term) -> io::Result<Option<TermInfo>> {
|
||||
self.termdict.get(term.serialized_value_bytes())
|
||||
}
|
||||
}
|
||||
|
||||
impl InvertedIndexReader for TantivyInvertedIndexReader {
|
||||
fn terms(&self) -> &TermDictionary {
|
||||
/// Return the term dictionary datastructure.
|
||||
pub fn terms(&self) -> &TermDictionary {
|
||||
&self.termdict
|
||||
}
|
||||
|
||||
fn list_encoded_json_fields(&self) -> io::Result<Vec<InvertedIndexFieldSpace>> {
|
||||
/// Return the fields and types encoded in the dictionary in lexicographic order.
|
||||
/// Only valid on JSON fields.
|
||||
///
|
||||
/// Notice: This requires a full scan and is therefore **very expensive**.
|
||||
/// TODO: Move to sstable to use the index.
|
||||
pub(crate) fn list_encoded_json_fields(&self) -> io::Result<Vec<InvertedIndexFieldSpace>> {
|
||||
let mut stream = self.termdict.stream()?;
|
||||
let mut fields: Vec<InvertedIndexFieldSpace> = Vec::new();
|
||||
|
||||
@@ -373,260 +160,136 @@ impl InvertedIndexReader for TantivyInvertedIndexReader {
|
||||
Ok(fields)
|
||||
}
|
||||
|
||||
fn new_term_scorer(
|
||||
/// Resets the block segment to another position of the postings
|
||||
/// file.
|
||||
///
|
||||
/// This is useful for enumerating through a list of terms,
|
||||
/// and consuming the associated posting lists while avoiding
|
||||
/// reallocating a [`BlockSegmentPostings`].
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// This does not reset the positions list.
|
||||
pub fn reset_block_postings_from_terminfo(
|
||||
&self,
|
||||
term_info: &TermInfo,
|
||||
option: IndexRecordOption,
|
||||
fieldnorm_reader: FieldNormReader,
|
||||
similarity_weight: Bm25Weight,
|
||||
) -> io::Result<BoxedTermScorer> {
|
||||
let postings = self.load_segment_postings(term_info, option)?;
|
||||
let term_scorer = TermScorer::new(postings, fieldnorm_reader, similarity_weight);
|
||||
Ok(BoxedTermScorer::new(term_scorer))
|
||||
}
|
||||
|
||||
fn read_postings_from_terminfo(
|
||||
&self,
|
||||
term_info: &TermInfo,
|
||||
option: IndexRecordOption,
|
||||
) -> io::Result<Box<dyn Postings>> {
|
||||
let postings = self.load_segment_postings(term_info, option)?;
|
||||
Ok(Box::new(postings))
|
||||
}
|
||||
|
||||
fn read_raw_postings_data(
|
||||
&self,
|
||||
term_info: &TermInfo,
|
||||
option: IndexRecordOption,
|
||||
) -> io::Result<RawPostingsData> {
|
||||
self.read_raw_postings_data_inner(term_info, option)
|
||||
}
|
||||
|
||||
fn fill_bitset_for_term(
|
||||
&self,
|
||||
term_info: &TermInfo,
|
||||
option: IndexRecordOption,
|
||||
doc_bitset: &mut BitSet,
|
||||
block_postings: &mut BlockSegmentPostings,
|
||||
) -> io::Result<()> {
|
||||
let postings_data = self.read_raw_postings_data_inner(term_info, option)?;
|
||||
fill_bitset_from_raw_data(term_info.doc_freq, postings_data, doc_bitset)
|
||||
let postings_slice = self
|
||||
.postings_file_slice
|
||||
.slice(term_info.postings_range.clone());
|
||||
let postings_bytes = postings_slice.read_bytes()?;
|
||||
block_postings.reset(term_info.doc_freq, postings_bytes)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn new_phrase_scorer(
|
||||
&self,
|
||||
term_infos: &[(usize, TermInfo)],
|
||||
similarity_weight: Option<Bm25Weight>,
|
||||
fieldnorm_reader: FieldNormReader,
|
||||
slop: u32,
|
||||
) -> io::Result<Box<dyn Scorer>> {
|
||||
let mut offset_and_term_postings: Vec<(usize, SegmentPostings)> =
|
||||
Vec::with_capacity(term_infos.len());
|
||||
for (offset, term_info) in term_infos {
|
||||
let postings =
|
||||
self.load_segment_postings(term_info, IndexRecordOption::WithFreqsAndPositions)?;
|
||||
offset_and_term_postings.push((*offset, postings));
|
||||
}
|
||||
let scorer = PhraseScorer::new(
|
||||
offset_and_term_postings,
|
||||
similarity_weight,
|
||||
fieldnorm_reader,
|
||||
slop,
|
||||
);
|
||||
Ok(box_scorer(scorer))
|
||||
}
|
||||
|
||||
fn total_num_tokens(&self) -> u64 {
|
||||
self.total_num_tokens
|
||||
}
|
||||
|
||||
fn read_postings(
|
||||
/// Returns a block postings given a `Term`.
|
||||
/// This method is for an advanced usage only.
|
||||
///
|
||||
/// Most users should prefer using [`Self::read_postings()`] instead.
|
||||
pub fn read_block_postings(
|
||||
&self,
|
||||
term: &Term,
|
||||
option: IndexRecordOption,
|
||||
) -> io::Result<Option<Box<dyn Postings>>> {
|
||||
) -> io::Result<Option<BlockSegmentPostings>> {
|
||||
self.get_term_info(term)?
|
||||
.map(move |term_info| self.read_block_postings_from_terminfo(&term_info, option))
|
||||
.transpose()
|
||||
}
|
||||
|
||||
/// Returns a block postings given a `term_info`.
|
||||
/// This method is for an advanced usage only.
|
||||
///
|
||||
/// Most users should prefer using [`Self::read_postings()`] instead.
|
||||
pub fn read_block_postings_from_terminfo(
|
||||
&self,
|
||||
term_info: &TermInfo,
|
||||
requested_option: IndexRecordOption,
|
||||
) -> io::Result<BlockSegmentPostings> {
|
||||
let postings_data = self
|
||||
.postings_file_slice
|
||||
.slice(term_info.postings_range.clone());
|
||||
BlockSegmentPostings::open(
|
||||
term_info.doc_freq,
|
||||
postings_data,
|
||||
self.record_option,
|
||||
requested_option,
|
||||
)
|
||||
}
|
||||
|
||||
/// Returns a posting object given a `term_info`.
|
||||
/// This method is for an advanced usage only.
|
||||
///
|
||||
/// Most users should prefer using [`Self::read_postings()`] instead.
|
||||
pub fn read_postings_from_terminfo(
|
||||
&self,
|
||||
term_info: &TermInfo,
|
||||
option: IndexRecordOption,
|
||||
) -> io::Result<SegmentPostings> {
|
||||
let option = option.downgrade(self.record_option);
|
||||
|
||||
let block_postings = self.read_block_postings_from_terminfo(term_info, option)?;
|
||||
let position_reader = {
|
||||
if option.has_positions() {
|
||||
let positions_data = self
|
||||
.positions_file_slice
|
||||
.read_bytes_slice(term_info.positions_range.clone())?;
|
||||
let position_reader = PositionReader::open(positions_data)?;
|
||||
Some(position_reader)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
};
|
||||
Ok(SegmentPostings::from_block_postings(
|
||||
block_postings,
|
||||
position_reader,
|
||||
))
|
||||
}
|
||||
|
||||
/// Returns the total number of tokens recorded for all documents
|
||||
/// (including deleted documents).
|
||||
pub fn total_num_tokens(&self) -> u64 {
|
||||
self.total_num_tokens
|
||||
}
|
||||
|
||||
/// Returns the segment postings associated with the term, and with the given option,
|
||||
/// or `None` if the term has never been encountered and indexed.
|
||||
///
|
||||
/// If the field was not indexed with the indexing options that cover
/// the requested options, the method does not fail: it returns a
/// [`SegmentPostings`] with as much information as possible.
|
||||
///
|
||||
/// For instance, requesting [`IndexRecordOption::WithFreqs`] for a
|
||||
/// [`TextOptions`](crate::schema::TextOptions) that does not index position
|
||||
/// will return a [`SegmentPostings`] with `DocId`s and frequencies.
|
||||
pub fn read_postings(
|
||||
&self,
|
||||
term: &Term,
|
||||
option: IndexRecordOption,
|
||||
) -> io::Result<Option<SegmentPostings>> {
|
||||
self.get_term_info(term)?
|
||||
.map(move |term_info| self.read_postings_from_terminfo(&term_info, option))
|
||||
.transpose()
|
||||
}
|
||||
|
||||
fn doc_freq(&self, term: &Term) -> io::Result<u32> {
|
||||
/// Returns the number of documents containing the term.
|
||||
pub fn doc_freq(&self, term: &Term) -> io::Result<u32> {
|
||||
Ok(self
|
||||
.get_term_info(term)?
|
||||
.map(|term_info| term_info.doc_freq)
|
||||
.unwrap_or(0u32))
|
||||
}
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
fn doc_freq_async<'a>(
|
||||
&'a self,
|
||||
term: &'a Term,
|
||||
) -> Pin<Box<dyn Future<Output = io::Result<u32>> + Send + 'a>> {
|
||||
Box::pin(async move {
|
||||
Ok(self
|
||||
.get_term_info_async(term)
|
||||
.await?
|
||||
.map(|term_info| term_info.doc_freq)
|
||||
.unwrap_or(0u32))
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
fn warm_fieldnorms_readers<'a>(
|
||||
&'a self,
|
||||
) -> Pin<Box<dyn Future<Output = io::Result<()>> + Send + 'a>> {
|
||||
Box::pin(async move {
|
||||
self.fieldnorms_file_slice.read_bytes_async().await?;
|
||||
Ok(())
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
fn warm_postings_full<'a>(
|
||||
&'a self,
|
||||
with_positions: bool,
|
||||
) -> Pin<Box<dyn Future<Output = io::Result<()>> + Send + 'a>> {
|
||||
Box::pin(async move {
|
||||
self.postings_file_slice.read_bytes_async().await?;
|
||||
if with_positions {
|
||||
self.positions_file_slice.read_bytes_async().await?;
|
||||
}
|
||||
Ok(())
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
fn warm_postings<'a>(
|
||||
&'a self,
|
||||
term: &'a Term,
|
||||
with_positions: bool,
|
||||
) -> Pin<Box<dyn Future<Output = io::Result<bool>> + Send + 'a>> {
|
||||
Box::pin(async move {
|
||||
let term_info_opt: Option<TermInfo> = self.get_term_info_async(term).await?;
|
||||
if let Some(term_info) = term_info_opt {
|
||||
let postings = self
|
||||
.postings_file_slice
|
||||
.read_bytes_slice_async(term_info.postings_range.clone());
|
||||
if with_positions {
|
||||
let positions = self
|
||||
.positions_file_slice
|
||||
.read_bytes_slice_async(term_info.positions_range.clone());
|
||||
futures_util::future::try_join(postings, positions).await?;
|
||||
} else {
|
||||
postings.await?;
|
||||
}
|
||||
Ok(true)
|
||||
} else {
|
||||
Ok(false)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
fn warm_postings_range<'a>(
|
||||
&'a self,
|
||||
terms: TermRangeBounds,
|
||||
limit: Option<u64>,
|
||||
with_positions: bool,
|
||||
) -> Pin<Box<dyn Future<Output = io::Result<bool>> + Send + 'a>> {
|
||||
Box::pin(async move {
|
||||
let mut term_info = self
|
||||
.get_term_range_async(terms, AlwaysMatch, limit, 0)
|
||||
.await?;
|
||||
|
||||
let Some(first_terminfo) = term_info.next() else {
|
||||
// no key matches, nothing more to load
|
||||
return Ok(false);
|
||||
};
|
||||
|
||||
let last_terminfo = term_info.last().unwrap_or_else(|| first_terminfo.clone());
|
||||
|
||||
let postings_range =
|
||||
first_terminfo.postings_range.start..last_terminfo.postings_range.end;
|
||||
let positions_range =
|
||||
first_terminfo.positions_range.start..last_terminfo.positions_range.end;
|
||||
|
||||
let postings = self
|
||||
.postings_file_slice
|
||||
.read_bytes_slice_async(postings_range);
|
||||
if with_positions {
|
||||
let positions = self
|
||||
.positions_file_slice
|
||||
.read_bytes_slice_async(positions_range);
|
||||
futures_util::future::try_join(postings, positions).await?;
|
||||
} else {
|
||||
postings.await?;
|
||||
}
|
||||
Ok(true)
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
fn warm_postings_automaton<'a, A: Automaton + Clone + Send + Sync + 'static>(
|
||||
&'a self,
|
||||
automaton: A,
|
||||
) -> Pin<Box<dyn Future<Output = io::Result<bool>> + Send + 'a>>
|
||||
where
|
||||
A::State: Clone + Send,
|
||||
Self: Sized,
|
||||
{
|
||||
Box::pin(async move {
|
||||
// merge holes under 4MiB, that's how many bytes we can hope to receive during a TTFB
|
||||
// from S3 (~80MiB/s, and 50ms latency)
|
||||
const MERGE_HOLES_UNDER_BYTES: usize = (80 * 1024 * 1024 * 50) / 1000;
|
||||
// Trigger async prefetch of relevant termdict blocks.
|
||||
let _term_info_iter = self
|
||||
.get_term_range_async(
|
||||
(std::ops::Bound::Unbounded, std::ops::Bound::Unbounded),
|
||||
automaton.clone(),
|
||||
None,
|
||||
MERGE_HOLES_UNDER_BYTES,
|
||||
)
|
||||
.await?;
|
||||
drop(_term_info_iter);
|
||||
|
||||
// Build a 2nd stream without merged holes so we only scan matching blocks.
|
||||
// This assumes the storage layer caches data fetched by the first pass.
|
||||
let mut stream = self.termdict.search(automaton).into_stream()?;
|
||||
let posting_ranges_iter =
|
||||
std::iter::from_fn(move || stream.next().map(|(_k, v)| v.postings_range.clone()));
|
||||
let merged_posting_ranges: Vec<std::ops::Range<usize>> = posting_ranges_iter
|
||||
.coalesce(|range1, range2| {
|
||||
if range1.end + MERGE_HOLES_UNDER_BYTES >= range2.start {
|
||||
Ok(range1.start..range2.end)
|
||||
} else {
|
||||
Err((range1, range2))
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
if merged_posting_ranges.is_empty() {
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
let slices_downloaded = futures_util::stream::iter(merged_posting_ranges.into_iter())
|
||||
.map(|posting_slice| {
|
||||
self.postings_file_slice
|
||||
.read_bytes_slice_async(posting_slice)
|
||||
.map(|result| result.map(|_slice| ()))
|
||||
})
|
||||
.buffer_unordered(5)
|
||||
.try_collect::<Vec<()>>()
|
||||
.await?;
|
||||
|
||||
Ok(!slices_downloaded.is_empty())
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
impl TantivyInvertedIndexReader {
|
||||
impl InvertedIndexReader {
|
||||
pub(crate) async fn get_term_info_async(&self, term: &Term) -> io::Result<Option<TermInfo>> {
|
||||
self.termdict.get_async(term.serialized_value_bytes()).await
|
||||
}
|
||||
|
||||
async fn get_term_range_async<'a, A: Automaton + 'a>(
|
||||
&'a self,
|
||||
terms: TermRangeBounds,
|
||||
terms: impl std::ops::RangeBounds<Term>,
|
||||
automaton: A,
|
||||
limit: Option<u64>,
|
||||
merge_holes_under_bytes: usize,
|
||||
@@ -634,17 +297,17 @@ impl TantivyInvertedIndexReader {
|
||||
where
|
||||
A::State: Clone,
|
||||
{
|
||||
use std::ops::Bound;
|
||||
let range_builder = self.termdict.search(automaton);
|
||||
let (start_bound, end_bound) = terms;
|
||||
let range_builder = match start_bound {
|
||||
std::ops::Bound::Included(bound) => range_builder.ge(bound.serialized_value_bytes()),
|
||||
std::ops::Bound::Excluded(bound) => range_builder.gt(bound.serialized_value_bytes()),
|
||||
std::ops::Bound::Unbounded => range_builder,
|
||||
let range_builder = match terms.start_bound() {
|
||||
Bound::Included(bound) => range_builder.ge(bound.serialized_value_bytes()),
|
||||
Bound::Excluded(bound) => range_builder.gt(bound.serialized_value_bytes()),
|
||||
Bound::Unbounded => range_builder,
|
||||
};
|
||||
let range_builder = match end_bound {
|
||||
std::ops::Bound::Included(bound) => range_builder.le(bound.serialized_value_bytes()),
|
||||
std::ops::Bound::Excluded(bound) => range_builder.lt(bound.serialized_value_bytes()),
|
||||
std::ops::Bound::Unbounded => range_builder,
|
||||
let range_builder = match terms.end_bound() {
|
||||
Bound::Included(bound) => range_builder.le(bound.serialized_value_bytes()),
|
||||
Bound::Excluded(bound) => range_builder.lt(bound.serialized_value_bytes()),
|
||||
Bound::Unbounded => range_builder,
|
||||
};
|
||||
let range_builder = if let Some(limit) = limit {
|
||||
range_builder.limit(limit)
|
||||
@@ -665,4 +328,167 @@ impl TantivyInvertedIndexReader {
|
||||
|
||||
Ok(iter)
|
||||
}
|
||||
|
||||
/// Warmup a block postings given a `Term`.
|
||||
/// This method is for an advanced usage only.
|
||||
///
|
||||
/// Returns a boolean indicating whether the term was found in the dictionary.
|
||||
pub async fn warm_postings(&self, term: &Term, with_positions: bool) -> io::Result<bool> {
|
||||
let term_info_opt: Option<TermInfo> = self.get_term_info_async(term).await?;
|
||||
if let Some(term_info) = term_info_opt {
|
||||
let postings = self
|
||||
.postings_file_slice
|
||||
.read_bytes_slice_async(term_info.postings_range.clone());
|
||||
if with_positions {
|
||||
let positions = self
|
||||
.positions_file_slice
|
||||
.read_bytes_slice_async(term_info.positions_range.clone());
|
||||
futures_util::future::try_join(postings, positions).await?;
|
||||
} else {
|
||||
postings.await?;
|
||||
}
|
||||
Ok(true)
|
||||
} else {
|
||||
Ok(false)
|
||||
}
|
||||
}
|
||||
|
||||
/// Warmup a block postings given a range of `Term`s.
|
||||
/// This method is for an advanced usage only.
|
||||
///
|
||||
/// Returns a boolean indicating whether a term matching the range was found in the dictionary.
|
||||
pub async fn warm_postings_range(
|
||||
&self,
|
||||
terms: impl std::ops::RangeBounds<Term>,
|
||||
limit: Option<u64>,
|
||||
with_positions: bool,
|
||||
) -> io::Result<bool> {
|
||||
let mut term_info = self
|
||||
.get_term_range_async(terms, AlwaysMatch, limit, 0)
|
||||
.await?;
|
||||
|
||||
let Some(first_terminfo) = term_info.next() else {
|
||||
// no key matches, nothing more to load
|
||||
return Ok(false);
|
||||
};
|
||||
|
||||
let last_terminfo = term_info.last().unwrap_or_else(|| first_terminfo.clone());
|
||||
|
||||
let postings_range = first_terminfo.postings_range.start..last_terminfo.postings_range.end;
|
||||
let positions_range =
|
||||
first_terminfo.positions_range.start..last_terminfo.positions_range.end;
|
||||
|
||||
let postings = self
|
||||
.postings_file_slice
|
||||
.read_bytes_slice_async(postings_range);
|
||||
if with_positions {
|
||||
let positions = self
|
||||
.positions_file_slice
|
||||
.read_bytes_slice_async(positions_range);
|
||||
futures_util::future::try_join(postings, positions).await?;
|
||||
} else {
|
||||
postings.await?;
|
||||
}
|
||||
Ok(true)
|
||||
}
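// A small standalone sketch (not part of this diff) of the byte-range computation used by
// `warm_postings_range` above: matching term infos come back in dictionary order, so one
// contiguous read from the first posting range's start to the last one's end warms them all.
// The helper name and sample values below are illustrative only.
use std::ops::Range;

fn combined_range(mut ranges: impl Iterator<Item = Range<usize>>) -> Option<Range<usize>> {
    let first = ranges.next()?;
    let last = ranges.last().unwrap_or_else(|| first.clone());
    Some(first.start..last.end)
}

fn main() {
    assert_eq!(combined_range([10..20, 30..40, 50..64].into_iter()), Some(10..64));
    assert_eq!(combined_range(std::iter::empty()), None);
}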
|
||||
|
||||
/// Warmup block postings for terms matching an automaton.
/// This method is for an advanced usage only.
///
/// Returns a boolean indicating whether at least one matching term was found in the dictionary.
|
||||
pub async fn warm_postings_automaton<
|
||||
A: Automaton + Clone + Send + 'static,
|
||||
E: FnOnce(Box<dyn FnOnce() -> io::Result<()> + Send>) -> F,
|
||||
F: std::future::Future<Output = io::Result<()>>,
|
||||
>(
|
||||
&self,
|
||||
automaton: A,
|
||||
// with_positions: bool, at the moment we have no use for it, and supporting it would add
|
||||
// complexity to the coalesce
|
||||
executor: E,
|
||||
) -> io::Result<bool>
|
||||
where
|
||||
A::State: Clone,
|
||||
{
|
||||
// merge holes under 4MiB, that's how many bytes we can hope to receive during a TTFB from
|
||||
// S3 (~80MiB/s, and 50ms latency)
|
||||
const MERGE_HOLES_UNDER_BYTES: usize = (80 * 1024 * 1024 * 50) / 1000;
|
||||
// We build a first iterator to download everything. Simply calling the function already
// downloads everything we need from the sstable, but doesn't start iterating over it.
|
||||
let _term_info_iter = self
|
||||
.get_term_range_async(.., automaton.clone(), None, MERGE_HOLES_UNDER_BYTES)
|
||||
.await?;
|
||||
|
||||
let (sender, posting_ranges_to_load_stream) = futures_channel::mpsc::unbounded();
|
||||
let termdict = self.termdict.clone();
|
||||
let cpu_bound_task = move || {
|
||||
// then we build a 2nd iterator, this one with no holes, so we don't go through blocks
|
||||
// we can't match.
|
||||
// This makes the assumption there is a caching layer below us, which gives sync read
|
||||
// for free after the initial async access. This might not always be true, but is in
|
||||
// Quickwit.
|
||||
// We build things from this closure, otherwise we get into lifetime issues that can only
// be solved with self-referential structs. Returning an io::Result from here is a bit
// more leaky abstraction-wise, but a lot better than the alternative.
|
||||
let mut stream = termdict.search(automaton).into_stream()?;
|
||||
|
||||
// We could do without an iterator, but this gives us access to coalesce, which simplifies
// things.
|
||||
let posting_ranges_iter =
|
||||
std::iter::from_fn(move || stream.next().map(|(_k, v)| v.postings_range.clone()));
|
||||
|
||||
let merged_posting_ranges_iter = posting_ranges_iter.coalesce(|range1, range2| {
|
||||
if range1.end + MERGE_HOLES_UNDER_BYTES >= range2.start {
|
||||
Ok(range1.start..range2.end)
|
||||
} else {
|
||||
Err((range1, range2))
|
||||
}
|
||||
});
|
||||
|
||||
for posting_range in merged_posting_ranges_iter {
|
||||
if let Err(_) = sender.unbounded_send(posting_range) {
|
||||
// this should happen only when search is cancelled
|
||||
return Err(io::Error::other("failed to send posting range back"));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
};
|
||||
let task_handle = executor(Box::new(cpu_bound_task));
|
||||
|
||||
let posting_downloader = posting_ranges_to_load_stream
|
||||
.map(|posting_slice| {
|
||||
self.postings_file_slice
|
||||
.read_bytes_slice_async(posting_slice)
|
||||
.map(|result| result.map(|_slice| ()))
|
||||
})
|
||||
.buffer_unordered(5)
|
||||
.try_collect::<Vec<()>>();
|
||||
|
||||
let (_, slices_downloaded) =
|
||||
futures_util::future::try_join(task_handle, posting_downloader).await?;
|
||||
|
||||
Ok(!slices_downloaded.is_empty())
|
||||
}
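// A minimal, self-contained sketch (not part of this diff) of the hole-merging idea used by
// `warm_postings_automaton` above, built on itertools' `coalesce`: posting byte ranges whose
// gap is below a threshold are fused so remote storage sees fewer, larger reads. The helper
// name and sample values are illustrative only; it assumes the itertools crate is available.
use std::ops::Range;

use itertools::Itertools;

fn merge_close_ranges(
    ranges: impl Iterator<Item = Range<usize>>,
    merge_holes_under_bytes: usize,
) -> Vec<Range<usize>> {
    ranges
        .coalesce(|left, right| {
            if left.end + merge_holes_under_bytes >= right.start {
                Ok(left.start..right.end) // gap is small enough: fuse into one range
            } else {
                Err((left, right)) // gap is large: keep both ranges
            }
        })
        .collect()
}

fn main() {
    // With a 16-byte tolerance, the first two ranges fuse; the distant one stays separate.
    let merged = merge_close_ranges([0..10, 12..20, 1_000..1_010].into_iter(), 16);
    assert_eq!(merged, vec![0..20, 1_000..1_010]);
}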
|
||||
|
||||
/// Warmup the block postings for all terms.
|
||||
/// This method is for an advanced usage only.
|
||||
///
|
||||
/// If you know which terms to pre-load, prefer using [`Self::warm_postings`] or
/// [`Self::warm_postings_range`] instead.
|
||||
pub async fn warm_postings_full(&self, with_positions: bool) -> io::Result<()> {
|
||||
self.postings_file_slice.read_bytes_async().await?;
|
||||
if with_positions {
|
||||
self.positions_file_slice.read_bytes_async().await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns the number of documents containing the term asynchronously.
|
||||
pub async fn doc_freq_async(&self, term: &Term) -> io::Result<u32> {
|
||||
Ok(self
|
||||
.get_term_info_async(term)
|
||||
.await?
|
||||
.map(|term_info| term_info.doc_freq)
|
||||
.unwrap_or(0u32))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,6 @@
//!
//! It contains `Index` and `Segment`, where an `Index` consists of one or more `Segment`s.

mod codec_configuration;
mod index;
mod index_meta;
mod inverted_index_reader;
@@ -11,14 +10,11 @@ mod segment_component;
mod segment_id;
mod segment_reader;

pub use self::codec_configuration::CodecConfiguration;
pub use self::index::{Index, IndexBuilder};
pub(crate) use self::index_meta::SegmentMetaInventory;
pub use self::index_meta::{IndexMeta, IndexSettings, Order, SegmentMeta};
pub use self::inverted_index_reader::{
    BoxedTermScorer, InvertedIndexFieldSpace, InvertedIndexReader, TantivyInvertedIndexReader,
};
pub use self::inverted_index_reader::InvertedIndexReader;
pub use self::segment::Segment;
pub use self::segment_component::SegmentComponent;
pub use self::segment_id::SegmentId;
pub use self::segment_reader::{FieldMetadata, SegmentReader, TantivySegmentReader};
pub use self::segment_reader::{FieldMetadata, SegmentReader};

@@ -2,7 +2,6 @@ use std::fmt;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use super::SegmentComponent;
|
||||
use crate::codec::StandardCodec;
|
||||
use crate::directory::error::{OpenReadError, OpenWriteError};
|
||||
use crate::directory::{Directory, FileSlice, WritePtr};
|
||||
use crate::index::{Index, SegmentId, SegmentMeta};
|
||||
@@ -11,25 +10,25 @@ use crate::Opstamp;
|
||||
|
||||
/// A segment is a piece of the index.
|
||||
#[derive(Clone)]
|
||||
pub struct Segment<C: crate::codec::Codec = StandardCodec> {
|
||||
index: Index<C>,
|
||||
pub struct Segment {
|
||||
index: Index,
|
||||
meta: SegmentMeta,
|
||||
}
|
||||
|
||||
impl<C: crate::codec::Codec> fmt::Debug for Segment<C> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
impl fmt::Debug for Segment {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "Segment({:?})", self.id().uuid_string())
|
||||
}
|
||||
}
|
||||
|
||||
impl<C: crate::codec::Codec> Segment<C> {
|
||||
impl Segment {
|
||||
/// Creates a new segment given an `Index` and a `SegmentId`
|
||||
pub(crate) fn for_index(index: Index<C>, meta: SegmentMeta) -> Segment<C> {
|
||||
pub(crate) fn for_index(index: Index, meta: SegmentMeta) -> Segment {
|
||||
Segment { index, meta }
|
||||
}
|
||||
|
||||
/// Returns the index the segment belongs to.
|
||||
pub fn index(&self) -> &Index<C> {
|
||||
pub fn index(&self) -> &Index {
|
||||
&self.index
|
||||
}
|
||||
|
||||
@@ -47,7 +46,7 @@ impl<C: crate::codec::Codec> Segment<C> {
|
||||
///
|
||||
/// This method is only used when updating `max_doc` from 0
|
||||
/// as we finalize a fresh new segment.
|
||||
pub fn with_max_doc(self, max_doc: u32) -> Segment<C> {
|
||||
pub fn with_max_doc(self, max_doc: u32) -> Segment {
|
||||
Segment {
|
||||
index: self.index,
|
||||
meta: self.meta.with_max_doc(max_doc),
|
||||
@@ -56,7 +55,7 @@ impl<C: crate::codec::Codec> Segment<C> {
|
||||
|
||||
#[doc(hidden)]
|
||||
#[must_use]
|
||||
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> Segment<C> {
|
||||
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> Segment {
|
||||
Segment {
|
||||
index: self.index,
|
||||
meta: self.meta.with_delete_meta(num_deleted_docs, opstamp),
|
||||
|
||||
@@ -44,7 +44,7 @@ fn create_uuid() -> Uuid {
}

impl SegmentId {
    /// Generates a new random `SegmentId`.
    #[doc(hidden)]
    pub fn generate_random() -> SegmentId {
        SegmentId(create_uuid())
    }

@@ -6,107 +6,17 @@ use common::{ByteCount, HasLen};
|
||||
use fnv::FnvHashMap;
|
||||
use itertools::Itertools;
|
||||
|
||||
use crate::codec::{ObjectSafeCodec, SumOrDoNothingCombiner};
|
||||
use crate::directory::{CompositeFile, Directory, FileSlice};
|
||||
use crate::directory::{CompositeFile, FileSlice};
|
||||
use crate::error::DataCorruption;
|
||||
use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FacetReader, FastFieldReaders};
|
||||
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
|
||||
use crate::index::{
|
||||
InvertedIndexReader, Segment, SegmentComponent, SegmentId, SegmentMeta,
|
||||
TantivyInvertedIndexReader,
|
||||
};
|
||||
use crate::index::{InvertedIndexReader, Segment, SegmentComponent, SegmentId};
|
||||
use crate::json_utils::json_path_sep_to_dot;
|
||||
use crate::query::Scorer;
|
||||
use crate::schema::{Field, IndexRecordOption, Schema, Type};
|
||||
use crate::space_usage::SegmentSpaceUsage;
|
||||
use crate::store::{StoreReader, TantivyStoreReader};
|
||||
use crate::store::StoreReader;
|
||||
use crate::termdict::TermDictionary;
|
||||
use crate::{DocId, Opstamp, Score};
|
||||
|
||||
/// Trait defining the contract for a segment reader.
|
||||
pub trait SegmentReader: Send + Sync {
|
||||
/// Returns the highest document id ever attributed in this segment + 1.
|
||||
fn max_doc(&self) -> DocId;
|
||||
|
||||
/// Returns the number of alive documents. Deleted documents are not counted.
|
||||
fn num_docs(&self) -> DocId;
|
||||
|
||||
/// Returns the schema of the index this segment belongs to.
|
||||
fn schema(&self) -> &Schema;
|
||||
|
||||
/// Performs a for_each_pruning operation on the given scorer.
|
||||
fn for_each_pruning(
|
||||
&self,
|
||||
threshold: Score,
|
||||
scorer: Box<dyn Scorer>,
|
||||
callback: &mut dyn FnMut(DocId, Score) -> Score,
|
||||
);
|
||||
|
||||
/// Builds a union scorer possibly specialized if all scorers are term scorers.
|
||||
fn build_union_scorer_with_sum_combiner(
|
||||
&self,
|
||||
scorers: Vec<Box<dyn Scorer>>,
|
||||
num_docs: DocId,
|
||||
score_combiner_type: SumOrDoNothingCombiner,
|
||||
) -> Box<dyn Scorer>;
|
||||
|
||||
/// Return the number of documents that have been deleted in the segment.
|
||||
fn num_deleted_docs(&self) -> DocId;
|
||||
|
||||
/// Returns true if some of the documents of the segment have been deleted.
|
||||
fn has_deletes(&self) -> bool;
|
||||
|
||||
/// Accessor to a segment's fast field reader given a field.
|
||||
fn fast_fields(&self) -> &FastFieldReaders;
|
||||
|
||||
/// Accessor to the `FacetReader` associated with a given `Field`.
|
||||
fn facet_reader(&self, field_name: &str) -> crate::Result<FacetReader> {
|
||||
let field = self.schema().get_field(field_name)?;
|
||||
let field_entry = self.schema().get_field_entry(field);
|
||||
if field_entry.field_type().value_type() != Type::Facet {
|
||||
return Err(crate::TantivyError::SchemaError(format!(
|
||||
"`{field_name}` is not a facet field.`"
|
||||
)));
|
||||
}
|
||||
let Some(facet_column) = self.fast_fields().str(field_name)? else {
|
||||
panic!("Facet Field `{field_name}` is missing. This should not happen");
|
||||
};
|
||||
Ok(FacetReader::new(facet_column))
|
||||
}
|
||||
|
||||
/// Accessor to the segment's `Field norms`'s reader.
|
||||
fn get_fieldnorms_reader(&self, field: Field) -> crate::Result<FieldNormReader>;
|
||||
|
||||
/// Accessor to the segment's [`StoreReader`](crate::store::StoreReader).
|
||||
fn get_store_reader(&self, cache_num_blocks: usize) -> io::Result<Box<dyn StoreReader>>;
|
||||
|
||||
/// Returns a field reader associated with the field given in argument.
|
||||
fn inverted_index(&self, field: Field) -> crate::Result<Arc<dyn InvertedIndexReader>>;
|
||||
|
||||
/// Returns the list of fields that have been indexed in the segment.
|
||||
fn fields_metadata(&self) -> crate::Result<Vec<FieldMetadata>>;
|
||||
|
||||
/// Returns the segment id.
|
||||
fn segment_id(&self) -> SegmentId;
|
||||
|
||||
/// Returns the delete opstamp.
|
||||
fn delete_opstamp(&self) -> Option<Opstamp>;
|
||||
|
||||
/// Returns the bitset representing the alive `DocId`s.
|
||||
fn alive_bitset(&self) -> Option<&AliveBitSet>;
|
||||
|
||||
/// Returns true if the `doc` is marked as deleted.
|
||||
fn is_deleted(&self, doc: DocId) -> bool;
|
||||
|
||||
/// Returns an iterator that will iterate over the alive document ids.
|
||||
fn doc_ids_alive(&self) -> Box<dyn Iterator<Item = DocId> + Send + '_>;
|
||||
|
||||
/// Summarize total space usage of this segment.
|
||||
fn space_usage(&self) -> io::Result<SegmentSpaceUsage>;
|
||||
|
||||
/// Clones this reader into a shared trait object.
|
||||
fn clone_arc(&self) -> Arc<dyn SegmentReader>;
|
||||
}
|
||||
use crate::{DocId, Opstamp};
|
||||
|
||||
/// Entry point to access all of the datastructures of the `Segment`
|
||||
///
|
||||
@@ -119,8 +29,8 @@ pub trait SegmentReader: Send + Sync {
|
||||
/// The segment reader has a very low memory footprint,
|
||||
/// as close to all of the memory data is mmapped.
|
||||
#[derive(Clone)]
|
||||
pub struct TantivySegmentReader {
|
||||
inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<dyn InvertedIndexReader>>>>,
|
||||
pub struct SegmentReader {
|
||||
inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>,
|
||||
|
||||
segment_id: SegmentId,
|
||||
delete_opstamp: Option<Opstamp>,
|
||||
@@ -137,148 +47,75 @@ pub struct TantivySegmentReader {
|
||||
store_file: FileSlice,
|
||||
alive_bitset_opt: Option<AliveBitSet>,
|
||||
schema: Schema,
|
||||
codec: Arc<dyn ObjectSafeCodec>,
|
||||
}
|
||||
|
||||
impl TantivySegmentReader {
|
||||
/// Open a new segment for reading.
|
||||
pub fn open<C: crate::codec::Codec>(
|
||||
segment: &Segment<C>,
|
||||
) -> crate::Result<Arc<dyn SegmentReader>> {
|
||||
Self::open_with_custom_alive_set(segment, None)
|
||||
}
|
||||
|
||||
/// Open a new segment for reading.
|
||||
pub fn open_with_custom_alive_set<C: crate::codec::Codec>(
|
||||
segment: &Segment<C>,
|
||||
custom_bitset: Option<AliveBitSet>,
|
||||
) -> crate::Result<Arc<dyn SegmentReader>> {
|
||||
segment.index().codec().open_segment_reader(
|
||||
segment.index().directory(),
|
||||
segment.meta(),
|
||||
segment.schema(),
|
||||
custom_bitset,
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn open_with_custom_alive_set_from_directory(
|
||||
directory: &dyn Directory,
|
||||
segment_meta: &SegmentMeta,
|
||||
schema: Schema,
|
||||
codec: Arc<dyn ObjectSafeCodec>,
|
||||
custom_bitset: Option<AliveBitSet>,
|
||||
) -> crate::Result<TantivySegmentReader> {
|
||||
let termdict_file =
|
||||
directory.open_read(&segment_meta.relative_path(SegmentComponent::Terms))?;
|
||||
let termdict_composite = CompositeFile::open(&termdict_file)?;
|
||||
|
||||
let store_file =
|
||||
directory.open_read(&segment_meta.relative_path(SegmentComponent::Store))?;
|
||||
|
||||
crate::fail_point!("SegmentReader::open#middle");
|
||||
|
||||
let postings_file =
|
||||
directory.open_read(&segment_meta.relative_path(SegmentComponent::Postings))?;
|
||||
let postings_composite = CompositeFile::open(&postings_file)?;
|
||||
|
||||
let positions_composite = {
|
||||
if let Ok(positions_file) =
|
||||
directory.open_read(&segment_meta.relative_path(SegmentComponent::Positions))
|
||||
{
|
||||
CompositeFile::open(&positions_file)?
|
||||
} else {
|
||||
CompositeFile::empty()
|
||||
}
|
||||
};
|
||||
|
||||
let fast_fields_data =
|
||||
directory.open_read(&segment_meta.relative_path(SegmentComponent::FastFields))?;
|
||||
let fast_fields_readers = FastFieldReaders::open(fast_fields_data, schema.clone())?;
|
||||
let fieldnorm_data =
|
||||
directory.open_read(&segment_meta.relative_path(SegmentComponent::FieldNorms))?;
|
||||
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
|
||||
|
||||
let original_bitset = if segment_meta.has_deletes() {
|
||||
let alive_doc_file_slice =
|
||||
directory.open_read(&segment_meta.relative_path(SegmentComponent::Delete))?;
|
||||
let alive_doc_data = alive_doc_file_slice.read_bytes()?;
|
||||
Some(AliveBitSet::open(alive_doc_data))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let alive_bitset_opt = intersect_alive_bitset(original_bitset, custom_bitset);
|
||||
|
||||
let max_doc = segment_meta.max_doc();
|
||||
let num_docs = alive_bitset_opt
|
||||
.as_ref()
|
||||
.map(|alive_bitset| alive_bitset.num_alive_docs() as u32)
|
||||
.unwrap_or(max_doc);
|
||||
|
||||
Ok(TantivySegmentReader {
|
||||
inv_idx_reader_cache: Default::default(),
|
||||
num_docs,
|
||||
max_doc,
|
||||
termdict_composite,
|
||||
postings_composite,
|
||||
fast_fields_readers,
|
||||
fieldnorm_readers,
|
||||
segment_id: segment_meta.id(),
|
||||
delete_opstamp: segment_meta.delete_opstamp(),
|
||||
store_file,
|
||||
alive_bitset_opt,
|
||||
positions_composite,
|
||||
schema,
|
||||
codec,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentReader for TantivySegmentReader {
|
||||
fn max_doc(&self) -> DocId {
|
||||
impl SegmentReader {
|
||||
/// Returns the highest document id ever attributed in
|
||||
/// this segment + 1.
|
||||
pub fn max_doc(&self) -> DocId {
|
||||
self.max_doc
|
||||
}
|
||||
|
||||
fn num_docs(&self) -> DocId {
|
||||
/// Returns the number of alive documents.
|
||||
/// Deleted documents are not counted.
|
||||
pub fn num_docs(&self) -> DocId {
|
||||
self.num_docs
|
||||
}
|
||||
|
||||
fn schema(&self) -> &Schema {
|
||||
/// Returns the schema of the index this segment belongs to.
|
||||
pub fn schema(&self) -> &Schema {
|
||||
&self.schema
|
||||
}
|
||||
|
||||
fn for_each_pruning(
|
||||
&self,
|
||||
threshold: Score,
|
||||
scorer: Box<dyn Scorer>,
|
||||
callback: &mut dyn FnMut(DocId, Score) -> Score,
|
||||
) {
|
||||
self.codec.for_each_pruning(threshold, scorer, callback);
|
||||
}
|
||||
|
||||
fn build_union_scorer_with_sum_combiner(
|
||||
&self,
|
||||
scorers: Vec<Box<dyn Scorer>>,
|
||||
num_docs: DocId,
|
||||
score_combiner_type: SumOrDoNothingCombiner,
|
||||
) -> Box<dyn Scorer> {
|
||||
self.codec
|
||||
.build_union_scorer_with_sum_combiner(scorers, num_docs, score_combiner_type)
|
||||
}
|
||||
|
||||
fn num_deleted_docs(&self) -> DocId {
|
||||
/// Return the number of documents that have been
|
||||
/// deleted in the segment.
|
||||
pub fn num_deleted_docs(&self) -> DocId {
|
||||
self.max_doc - self.num_docs
|
||||
}
|
||||
|
||||
fn has_deletes(&self) -> bool {
|
||||
self.num_docs != self.max_doc
|
||||
/// Returns true if some of the documents of the segment have been deleted.
|
||||
pub fn has_deletes(&self) -> bool {
|
||||
self.num_deleted_docs() > 0
|
||||
}
|
||||
|
||||
fn fast_fields(&self) -> &FastFieldReaders {
|
||||
/// Accessor to a segment's fast field reader given a field.
|
||||
///
|
||||
/// Returns the u64 fast value reader if the field
|
||||
/// is a u64 field indexed as "fast".
|
||||
///
|
||||
/// Return a FastFieldNotAvailableError if the field is not
|
||||
/// declared as a fast field in the schema.
|
||||
///
|
||||
/// # Panics
|
||||
/// May panic if the index is corrupted.
|
||||
pub fn fast_fields(&self) -> &FastFieldReaders {
|
||||
&self.fast_fields_readers
|
||||
}
|
||||
|
||||
fn get_fieldnorms_reader(&self, field: Field) -> crate::Result<FieldNormReader> {
|
||||
/// Accessor to the `FacetReader` associated with a given `Field`.
|
||||
pub fn facet_reader(&self, field_name: &str) -> crate::Result<FacetReader> {
|
||||
let schema = self.schema();
|
||||
let field = schema.get_field(field_name)?;
|
||||
let field_entry = schema.get_field_entry(field);
|
||||
if field_entry.field_type().value_type() != Type::Facet {
|
||||
return Err(crate::TantivyError::SchemaError(format!(
|
||||
"`{field_name}` is not a facet field.`"
|
||||
)));
|
||||
}
|
||||
let Some(facet_column) = self.fast_fields().str(field_name)? else {
|
||||
panic!("Facet Field `{field_name}` is missing. This should not happen");
|
||||
};
|
||||
Ok(FacetReader::new(facet_column))
|
||||
}
|
||||
|
||||
/// Accessor to the segment's `Field norms`'s reader.
|
||||
///
|
||||
/// Field norms are the length (in tokens) of the fields.
|
||||
/// It is used in the computation of the [TfIdf](https://fulmicoton.gitbooks.io/tantivy-doc/content/tfidf.html).
|
||||
///
|
||||
/// They are simply stored as a fast field, serialized in
|
||||
/// the `.fieldnorm` file of the segment.
|
||||
pub fn get_fieldnorms_reader(&self, field: Field) -> crate::Result<FieldNormReader> {
|
||||
self.fieldnorm_readers.get_field(field)?.ok_or_else(|| {
|
||||
let field_name = self.schema.get_field_name(field);
|
||||
let err_msg = format!(
|
||||
@@ -289,14 +126,100 @@ impl SegmentReader for TantivySegmentReader {
|
||||
})
|
||||
}
|
||||
|
||||
fn get_store_reader(&self, cache_num_blocks: usize) -> io::Result<Box<dyn StoreReader>> {
|
||||
Ok(Box::new(TantivyStoreReader::open(
|
||||
self.store_file.clone(),
|
||||
cache_num_blocks,
|
||||
)?))
|
||||
#[doc(hidden)]
|
||||
pub fn fieldnorms_readers(&self) -> &FieldNormReaders {
|
||||
&self.fieldnorm_readers
|
||||
}
|
||||
|
||||
fn inverted_index(&self, field: Field) -> crate::Result<Arc<dyn InvertedIndexReader>> {
|
||||
/// Accessor to the segment's [`StoreReader`](crate::store::StoreReader).
|
||||
///
|
||||
/// `cache_num_blocks` sets the number of decompressed blocks to be cached in an LRU.
|
||||
/// The size of blocks is configurable, this should be reflected in the
|
||||
pub fn get_store_reader(&self, cache_num_blocks: usize) -> io::Result<StoreReader> {
|
||||
StoreReader::open(self.store_file.clone(), cache_num_blocks)
|
||||
}
|
||||
|
||||
/// Open a new segment for reading.
|
||||
pub fn open(segment: &Segment) -> crate::Result<SegmentReader> {
|
||||
Self::open_with_custom_alive_set(segment, None)
|
||||
}
|
||||
|
||||
/// Open a new segment for reading.
|
||||
pub fn open_with_custom_alive_set(
|
||||
segment: &Segment,
|
||||
custom_bitset: Option<AliveBitSet>,
|
||||
) -> crate::Result<SegmentReader> {
|
||||
let termdict_file = segment.open_read(SegmentComponent::Terms)?;
|
||||
let termdict_composite = CompositeFile::open(&termdict_file)?;
|
||||
|
||||
let store_file = segment.open_read(SegmentComponent::Store)?;
|
||||
|
||||
crate::fail_point!("SegmentReader::open#middle");
|
||||
|
||||
let postings_file = segment.open_read(SegmentComponent::Postings)?;
|
||||
let postings_composite = CompositeFile::open(&postings_file)?;
|
||||
|
||||
let positions_composite = {
|
||||
if let Ok(positions_file) = segment.open_read(SegmentComponent::Positions) {
|
||||
CompositeFile::open(&positions_file)?
|
||||
} else {
|
||||
CompositeFile::empty()
|
||||
}
|
||||
};
|
||||
|
||||
let schema = segment.schema();
|
||||
|
||||
let fast_fields_data = segment.open_read(SegmentComponent::FastFields)?;
|
||||
let fast_fields_readers = FastFieldReaders::open(fast_fields_data, schema.clone())?;
|
||||
let fieldnorm_data = segment.open_read(SegmentComponent::FieldNorms)?;
|
||||
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
|
||||
|
||||
let original_bitset = if segment.meta().has_deletes() {
|
||||
let alive_doc_file_slice = segment.open_read(SegmentComponent::Delete)?;
|
||||
let alive_doc_data = alive_doc_file_slice.read_bytes()?;
|
||||
Some(AliveBitSet::open(alive_doc_data))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let alive_bitset_opt = intersect_alive_bitset(original_bitset, custom_bitset);
|
||||
|
||||
let max_doc = segment.meta().max_doc();
|
||||
let num_docs = alive_bitset_opt
|
||||
.as_ref()
|
||||
.map(|alive_bitset| alive_bitset.num_alive_docs() as u32)
|
||||
.unwrap_or(max_doc);
|
||||
|
||||
Ok(SegmentReader {
|
||||
inv_idx_reader_cache: Default::default(),
|
||||
num_docs,
|
||||
max_doc,
|
||||
termdict_composite,
|
||||
postings_composite,
|
||||
fast_fields_readers,
|
||||
fieldnorm_readers,
|
||||
segment_id: segment.id(),
|
||||
delete_opstamp: segment.meta().delete_opstamp(),
|
||||
store_file,
|
||||
alive_bitset_opt,
|
||||
positions_composite,
|
||||
schema,
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns a field reader associated with the field given in argument.
|
||||
/// If the field was not present in the index during indexing time,
|
||||
/// the InvertedIndexReader is empty.
|
||||
///
|
||||
/// The field reader is in charge of iterating through the
|
||||
/// term dictionary associated with a specific field,
|
||||
/// and opening the posting list associated with any term.
|
||||
///
|
||||
/// If the field is not marked as indexed, a warning is logged and an empty `InvertedIndexReader`
|
||||
/// is returned.
|
||||
/// Similarly, if the field is marked as indexed but no term has been indexed for the given
|
||||
/// index, an empty `InvertedIndexReader` is returned (but no warning is logged).
|
||||
pub fn inverted_index(&self, field: Field) -> crate::Result<Arc<InvertedIndexReader>> {
|
||||
if let Some(inv_idx_reader) = self
|
||||
.inv_idx_reader_cache
|
||||
.read()
|
||||
@@ -321,9 +244,7 @@ impl SegmentReader for TantivySegmentReader {
//
// Returns an empty inverted index.
let record_option = record_option_opt.unwrap_or(IndexRecordOption::Basic);
let inv_idx_reader: Arc<dyn InvertedIndexReader> =
Arc::new(TantivyInvertedIndexReader::empty(record_option));
return Ok(inv_idx_reader);
return Ok(Arc::new(InvertedIndexReader::empty(record_option)));
}

let record_option = record_option_opt.unwrap();
@@ -346,20 +267,13 @@ impl SegmentReader for TantivySegmentReader {
);
DataCorruption::comment_only(error_msg)
})?;
let fieldnorms_file = self
.fieldnorm_readers
.get_inner_file()
.open_read(field)
.unwrap_or_else(FileSlice::empty);

let inv_idx_reader: Arc<dyn InvertedIndexReader> =
Arc::new(TantivyInvertedIndexReader::new(
TermDictionary::open(termdict_file)?,
postings_file,
positions_file,
fieldnorms_file,
record_option,
)?);
let inv_idx_reader = Arc::new(InvertedIndexReader::new(
TermDictionary::open(termdict_file)?,
postings_file,
positions_file,
record_option,
)?);

// by releasing the lock in between, we may end up opening the inverting index
// twice, but this is fine.
@@ -371,10 +285,23 @@ impl SegmentReader for TantivySegmentReader {
Ok(inv_idx_reader)
}

fn fields_metadata(&self) -> crate::Result<Vec<FieldMetadata>> {
|
||||
/// Returns the list of fields that have been indexed in the segment.
|
||||
/// The field list includes the field defined in the schema as well as the fields
|
||||
/// that have been indexed as a part of a JSON field.
|
||||
/// The returned field name is the full field name, including the name of the JSON field.
|
||||
///
|
||||
/// The returned field names can be used in queries.
|
||||
///
|
||||
/// Notice: If your data contains JSON fields this is **very expensive**, as it requires
|
||||
/// browsing through the inverted index term dictionary and the columnar field dictionary.
|
||||
///
|
||||
/// Disclaimer: Some fields may not be listed here. For instance, if the schema contains a json
|
||||
/// field that is not indexed nor a fast field but is stored, it is possible for the field
|
||||
/// to not be listed.
|
||||
pub fn fields_metadata(&self) -> crate::Result<Vec<FieldMetadata>> {
|
||||
let mut indexed_fields: Vec<FieldMetadata> = Vec::new();
|
||||
let mut map_to_canonical = FnvHashMap::default();
|
||||
for (field, field_entry) in self.schema.fields() {
|
||||
for (field, field_entry) in self.schema().fields() {
|
||||
let field_name = field_entry.name().to_string();
|
||||
let is_indexed = field_entry.is_indexed();
|
||||
if is_indexed {
|
||||
@@ -464,7 +391,7 @@ impl SegmentReader for TantivySegmentReader {
|
||||
}
|
||||
}
|
||||
let fast_fields: Vec<FieldMetadata> = self
|
||||
.fast_fields_readers
|
||||
.fast_fields()
|
||||
.columnar()
|
||||
.iter_columns()?
|
||||
.map(|(mut field_name, handle)| {
|
||||
@@ -492,26 +419,31 @@ impl SegmentReader for TantivySegmentReader {
|
||||
Ok(merged_field_metadatas)
|
||||
}
|
||||
|
||||
fn segment_id(&self) -> SegmentId {
|
||||
/// Returns the segment id
|
||||
pub fn segment_id(&self) -> SegmentId {
|
||||
self.segment_id
|
||||
}
|
||||
|
||||
fn delete_opstamp(&self) -> Option<Opstamp> {
|
||||
/// Returns the delete opstamp
|
||||
pub fn delete_opstamp(&self) -> Option<Opstamp> {
|
||||
self.delete_opstamp
|
||||
}
|
||||
|
||||
fn alive_bitset(&self) -> Option<&AliveBitSet> {
|
||||
/// Returns the bitset representing the alive `DocId`s.
|
||||
pub fn alive_bitset(&self) -> Option<&AliveBitSet> {
|
||||
self.alive_bitset_opt.as_ref()
|
||||
}
|
||||
|
||||
fn is_deleted(&self, doc: DocId) -> bool {
|
||||
self.alive_bitset_opt
|
||||
.as_ref()
|
||||
/// Returns true if the `doc` is marked
|
||||
/// as deleted.
|
||||
pub fn is_deleted(&self, doc: DocId) -> bool {
|
||||
self.alive_bitset()
|
||||
.map(|alive_bitset| alive_bitset.is_deleted(doc))
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
fn doc_ids_alive(&self) -> Box<dyn Iterator<Item = DocId> + Send + '_> {
|
||||
/// Returns an iterator that will iterate over the alive document ids
|
||||
pub fn doc_ids_alive(&self) -> Box<dyn Iterator<Item = DocId> + Send + '_> {
|
||||
if let Some(alive_bitset) = &self.alive_bitset_opt {
|
||||
Box::new(alive_bitset.iter_alive())
|
||||
} else {
|
||||
@@ -519,25 +451,22 @@ impl SegmentReader for TantivySegmentReader {
|
||||
}
|
||||
}
|
||||
|
||||
fn space_usage(&self) -> io::Result<SegmentSpaceUsage> {
|
||||
/// Summarize total space usage of this segment.
|
||||
pub fn space_usage(&self) -> io::Result<SegmentSpaceUsage> {
|
||||
Ok(SegmentSpaceUsage::new(
|
||||
self.num_docs,
|
||||
self.termdict_composite.space_usage(&self.schema),
|
||||
self.postings_composite.space_usage(&self.schema),
|
||||
self.positions_composite.space_usage(&self.schema),
|
||||
self.num_docs(),
|
||||
self.termdict_composite.space_usage(self.schema()),
|
||||
self.postings_composite.space_usage(self.schema()),
|
||||
self.positions_composite.space_usage(self.schema()),
|
||||
self.fast_fields_readers.space_usage()?,
|
||||
self.fieldnorm_readers.space_usage(&self.schema),
|
||||
TantivyStoreReader::open(self.store_file.clone(), 0)?.space_usage(),
|
||||
self.fieldnorm_readers.space_usage(self.schema()),
|
||||
self.get_store_reader(0)?.space_usage(),
|
||||
self.alive_bitset_opt
|
||||
.as_ref()
|
||||
.map(AliveBitSet::space_usage)
|
||||
.unwrap_or_default(),
|
||||
))
|
||||
}
|
||||
|
||||
fn clone_arc(&self) -> Arc<dyn SegmentReader> {
|
||||
Arc::new(self.clone())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
|
||||
@@ -647,7 +576,7 @@ fn intersect_alive_bitset(
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for TantivySegmentReader {
|
||||
impl fmt::Debug for SegmentReader {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "SegmentReader({:?})", self.segment_id)
|
||||
}
|
||||
|
||||
@@ -250,15 +250,11 @@ mod tests {

struct DummyWeight;
impl Weight for DummyWeight {
fn scorer(
&self,
_reader: &dyn SegmentReader,
_boost: Score,
) -> crate::Result<Box<dyn Scorer>> {
fn scorer(&self, _reader: &SegmentReader, _boost: Score) -> crate::Result<Box<dyn Scorer>> {
Err(crate::TantivyError::InternalError("dummy impl".to_owned()))
}

fn explain(&self, _reader: &dyn SegmentReader, _doc: DocId) -> crate::Result<Explanation> {
fn explain(&self, _reader: &SegmentReader, _doc: DocId) -> crate::Result<Explanation> {
Err(crate::TantivyError::InternalError("dummy impl".to_owned()))
}
}

@@ -9,7 +9,6 @@ use smallvec::smallvec;
|
||||
use super::operation::{AddOperation, UserOperation};
|
||||
use super::segment_updater::SegmentUpdater;
|
||||
use super::{AddBatch, AddBatchReceiver, AddBatchSender, PreparedCommit};
|
||||
use crate::codec::{Codec, StandardCodec};
|
||||
use crate::directory::{DirectoryLock, GarbageCollectionResult, TerminatingWrite};
|
||||
use crate::error::TantivyError;
|
||||
use crate::fastfield::write_alive_bitset;
|
||||
@@ -69,12 +68,12 @@ pub struct IndexWriterOptions {
|
||||
/// indexing queue.
|
||||
/// Each indexing thread builds its own independent [`Segment`], via
|
||||
/// a `SegmentWriter` object.
|
||||
pub struct IndexWriter<C: Codec = StandardCodec, D: Document = TantivyDocument> {
|
||||
pub struct IndexWriter<D: Document = TantivyDocument> {
|
||||
// the lock is just used to bind the
|
||||
// lifetime of the lock with that of the IndexWriter.
|
||||
_directory_lock: Option<DirectoryLock>,
|
||||
|
||||
index: Index<C>,
|
||||
index: Index,
|
||||
|
||||
options: IndexWriterOptions,
|
||||
|
||||
@@ -83,7 +82,7 @@ pub struct IndexWriter<C: Codec = StandardCodec, D: Document = TantivyDocument>
|
||||
index_writer_status: IndexWriterStatus<D>,
|
||||
operation_sender: AddBatchSender<D>,
|
||||
|
||||
segment_updater: SegmentUpdater<C>,
|
||||
segment_updater: SegmentUpdater,
|
||||
|
||||
worker_id: usize,
|
||||
|
||||
@@ -95,7 +94,7 @@ pub struct IndexWriter<C: Codec = StandardCodec, D: Document = TantivyDocument>
|
||||
|
||||
fn compute_deleted_bitset(
|
||||
alive_bitset: &mut BitSet,
|
||||
segment_reader: &dyn SegmentReader,
|
||||
segment_reader: &SegmentReader,
|
||||
delete_cursor: &mut DeleteCursor,
|
||||
doc_opstamps: &DocToOpstampMapping,
|
||||
target_opstamp: Opstamp,
|
||||
@@ -129,8 +128,8 @@ fn compute_deleted_bitset(
|
||||
/// is `==` target_opstamp.
|
||||
/// For instance, there was no delete operation between the state of the `segment_entry` and
|
||||
/// the `target_opstamp`, `segment_entry` is not updated.
|
||||
pub fn advance_deletes<C: Codec>(
|
||||
mut segment: Segment<C>,
|
||||
pub fn advance_deletes(
|
||||
mut segment: Segment,
|
||||
segment_entry: &mut SegmentEntry,
|
||||
target_opstamp: Opstamp,
|
||||
) -> crate::Result<()> {
|
||||
@@ -144,12 +143,7 @@ pub fn advance_deletes<C: Codec>(
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let segment_reader = segment.index().codec().open_segment_reader(
|
||||
segment.index().directory(),
|
||||
segment.meta(),
|
||||
segment.schema(),
|
||||
None,
|
||||
)?;
|
||||
let segment_reader = SegmentReader::open(&segment)?;
|
||||
|
||||
let max_doc = segment_reader.max_doc();
|
||||
let mut alive_bitset: BitSet = match segment_entry.alive_bitset() {
|
||||
@@ -161,7 +155,7 @@ pub fn advance_deletes<C: Codec>(
|
||||
|
||||
compute_deleted_bitset(
|
||||
&mut alive_bitset,
|
||||
segment_reader.as_ref(),
|
||||
&segment_reader,
|
||||
segment_entry.delete_cursor(),
|
||||
&DocToOpstampMapping::None,
|
||||
target_opstamp,
|
||||
@@ -185,11 +179,11 @@ pub fn advance_deletes<C: Codec>(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn index_documents<C: crate::codec::Codec, D: Document>(
|
||||
fn index_documents<D: Document>(
|
||||
memory_budget: usize,
|
||||
segment: Segment<C>,
|
||||
segment: Segment,
|
||||
grouped_document_iterator: &mut dyn Iterator<Item = AddBatch<D>>,
|
||||
segment_updater: &SegmentUpdater<C>,
|
||||
segment_updater: &SegmentUpdater,
|
||||
mut delete_cursor: DeleteCursor,
|
||||
) -> crate::Result<()> {
|
||||
let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone())?;
|
||||
@@ -232,8 +226,8 @@ fn index_documents<C: crate::codec::Codec, D: Document>(
|
||||
}
|
||||
|
||||
/// `doc_opstamps` is required to be non-empty.
|
||||
fn apply_deletes<C: crate::codec::Codec>(
|
||||
segment: &Segment<C>,
|
||||
fn apply_deletes(
|
||||
segment: &Segment,
|
||||
delete_cursor: &mut DeleteCursor,
|
||||
doc_opstamps: &[Opstamp],
|
||||
) -> crate::Result<Option<BitSet>> {
|
||||
@@ -249,19 +243,14 @@ fn apply_deletes<C: crate::codec::Codec>(
|
||||
.max()
|
||||
.expect("Empty DocOpstamp is forbidden");
|
||||
|
||||
let segment_reader = segment.index().codec().open_segment_reader(
|
||||
segment.index().directory(),
|
||||
segment.meta(),
|
||||
segment.schema(),
|
||||
None,
|
||||
)?;
|
||||
let segment_reader = SegmentReader::open(segment)?;
|
||||
let doc_to_opstamps = DocToOpstampMapping::WithMap(doc_opstamps);
|
||||
|
||||
let max_doc = segment.meta().max_doc();
|
||||
let mut deleted_bitset = BitSet::with_max_value_and_full(max_doc);
|
||||
let may_have_deletes = compute_deleted_bitset(
|
||||
&mut deleted_bitset,
|
||||
segment_reader.as_ref(),
|
||||
&segment_reader,
|
||||
delete_cursor,
|
||||
&doc_to_opstamps,
|
||||
max_doc_opstamp,
|
||||
@@ -273,7 +262,7 @@ fn apply_deletes<C: crate::codec::Codec>(
|
||||
})
|
||||
}
|
||||
|
||||
impl<C: Codec, D: Document> IndexWriter<C, D> {
|
||||
impl<D: Document> IndexWriter<D> {
|
||||
/// Create a new index writer. Attempts to acquire a lockfile.
|
||||
///
|
||||
/// The lockfile should be deleted on drop, but it is possible
|
||||
@@ -289,7 +278,7 @@ impl<C: Codec, D: Document> IndexWriter<C, D> {
|
||||
/// If the memory arena per thread is too small or too big, returns
|
||||
/// `TantivyError::InvalidArgument`
|
||||
pub(crate) fn new(
|
||||
index: &Index<C>,
|
||||
index: &Index,
|
||||
options: IndexWriterOptions,
|
||||
directory_lock: DirectoryLock,
|
||||
) -> crate::Result<Self> {
|
||||
@@ -356,7 +345,7 @@ impl<C: Codec, D: Document> IndexWriter<C, D> {
|
||||
}
|
||||
|
||||
/// Accessor to the index.
|
||||
pub fn index(&self) -> &Index<C> {
|
||||
pub fn index(&self) -> &Index {
|
||||
&self.index
|
||||
}
|
||||
|
||||
@@ -404,7 +393,7 @@ impl<C: Codec, D: Document> IndexWriter<C, D> {
|
||||
/// It is safe to start writing file associated with the new `Segment`.
|
||||
/// These will not be garbage collected as long as an instance object of
|
||||
/// `SegmentMeta` object associated with the new `Segment` is "alive".
|
||||
pub fn new_segment(&self) -> Segment<C> {
|
||||
pub fn new_segment(&self) -> Segment {
|
||||
self.index.new_segment()
|
||||
}
|
||||
|
||||
@@ -626,7 +615,7 @@ impl<C: Codec, D: Document> IndexWriter<C, D> {
|
||||
/// It is also possible to add a payload to the `commit`
|
||||
/// using this API.
|
||||
/// See [`PreparedCommit::set_payload()`].
|
||||
pub fn prepare_commit(&mut self) -> crate::Result<PreparedCommit<'_, C, D>> {
|
||||
pub fn prepare_commit(&mut self) -> crate::Result<PreparedCommit<'_, D>> {
|
||||
// Here, because we join all of the worker threads,
|
||||
// all of the segment update for this commit have been
|
||||
// sent.
|
||||
@@ -676,7 +665,7 @@ impl<C: Codec, D: Document> IndexWriter<C, D> {
|
||||
self.prepare_commit()?.commit()
|
||||
}
|
||||
|
||||
pub(crate) fn segment_updater(&self) -> &SegmentUpdater<C> {
|
||||
pub(crate) fn segment_updater(&self) -> &SegmentUpdater {
|
||||
&self.segment_updater
|
||||
}
|
||||
|
||||
@@ -815,7 +804,7 @@ impl<C: Codec, D: Document> IndexWriter<C, D> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<C: Codec, D: Document> Drop for IndexWriter<C, D> {
|
||||
impl<D: Document> Drop for IndexWriter<D> {
|
||||
fn drop(&mut self) {
|
||||
self.segment_updater.kill();
|
||||
self.drop_sender();
|
||||
@@ -1976,9 +1965,9 @@ mod tests {
|
||||
.get_store_reader(DOCSTORE_CACHE_CAPACITY)
|
||||
.unwrap();
|
||||
// test store iterator
|
||||
for doc_id in segment_reader.doc_ids_alive() {
|
||||
let doc = store_reader.get(doc_id).unwrap();
|
||||
for doc in store_reader.iter::<TantivyDocument>(segment_reader.alive_bitset()) {
|
||||
let id = doc
|
||||
.unwrap()
|
||||
.get_first(id_field)
|
||||
.unwrap()
|
||||
.as_value()
|
||||
@@ -1989,7 +1978,7 @@ mod tests {
|
||||
// test store random access
|
||||
for doc_id in segment_reader.doc_ids_alive() {
|
||||
let id = store_reader
|
||||
.get(doc_id)
|
||||
.get::<TantivyDocument>(doc_id)
|
||||
.unwrap()
|
||||
.get_first(id_field)
|
||||
.unwrap()
|
||||
@@ -1998,7 +1987,7 @@ mod tests {
|
||||
assert!(expected_ids_and_num_occurrences.contains_key(&id));
|
||||
if id_is_full_doc(id) {
|
||||
let id2 = store_reader
|
||||
.get(doc_id)
|
||||
.get::<TantivyDocument>(doc_id)
|
||||
.unwrap()
|
||||
.get_first(multi_numbers)
|
||||
.unwrap()
|
||||
@@ -2006,13 +1995,13 @@ mod tests {
|
||||
.unwrap();
|
||||
assert_eq!(id, id2);
|
||||
let bool = store_reader
|
||||
.get(doc_id)
|
||||
.get::<TantivyDocument>(doc_id)
|
||||
.unwrap()
|
||||
.get_first(bool_field)
|
||||
.unwrap()
|
||||
.as_bool()
|
||||
.unwrap();
|
||||
let doc = store_reader.get(doc_id).unwrap();
|
||||
let doc = store_reader.get::<TantivyDocument>(doc_id).unwrap();
|
||||
let mut bool2 = doc.get_all(multi_bools);
|
||||
assert_eq!(bool, bool2.next().unwrap().as_bool().unwrap());
|
||||
assert_ne!(bool, bool2.next().unwrap().as_bool().unwrap());
|
||||
|
||||
@@ -94,7 +94,7 @@ impl MergePolicy for LogMergePolicy {
fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate> {
let size_sorted_segments = segments
.iter()
.filter(|seg| seg.num_docs() <= (self.max_docs_before_merge as u32))
.filter(|seg| (seg.num_docs() as usize) <= self.max_docs_before_merge)
.sorted_by_key(|seg| std::cmp::Reverse(seg.max_doc()))
.collect::<Vec<&SegmentMeta>>();

@@ -372,4 +372,21 @@ mod tests {
assert_eq!(merge_candidates[0].0.len(), 1);
assert_eq!(merge_candidates[0].0[0], test_input[1].id());
}

#[test]
fn test_max_docs_before_merge_large_value() {
// Regression test: (max_docs_before_merge as u32) truncates values > u32::MAX.
// Casting num_docs() to usize instead avoids the truncation.
let mut policy = LogMergePolicy::default();
policy.set_min_num_segments(2);
policy.set_max_docs_before_merge(5_000_000_000usize);
let test_input = vec![
create_random_segment_meta(100_000),
create_random_segment_meta(100_000),
];
let result = policy.compute_merge_candidates(&test_input);
// Both segments should be eligible (100_000 < 5_000_000_000)
assert_eq!(result.len(), 1);
assert_eq!(result[0].0.len(), 2);
}
}

@@ -3,7 +3,7 @@ mod tests {
|
||||
use crate::collector::TopDocs;
|
||||
use crate::fastfield::AliveBitSet;
|
||||
use crate::index::Index;
|
||||
use crate::postings::{DocFreq, Postings};
|
||||
use crate::postings::Postings;
|
||||
use crate::query::QueryParser;
|
||||
use crate::schema::{
|
||||
self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions,
|
||||
@@ -121,32 +121,21 @@ mod tests {
|
||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||
let term_a = Term::from_field_text(my_text_field, "text");
|
||||
let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
|
||||
let term_info = inverted_index.get_term_info(&term_a).unwrap().unwrap();
|
||||
let typed_postings = crate::codec::Codec::load_postings_typed(
|
||||
index.codec(),
|
||||
inverted_index.as_ref(),
|
||||
&term_info,
|
||||
IndexRecordOption::WithFreqsAndPositions,
|
||||
)
|
||||
.unwrap();
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
assert_eq!(postings.doc_freq(), 2);
|
||||
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
|
||||
assert_eq!(
|
||||
crate::indexer::merger::doc_freq_given_deletes(
|
||||
&typed_postings,
|
||||
postings.doc_freq_given_deletes(
|
||||
segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
|
||||
),
|
||||
2
|
||||
);
|
||||
let mut postings = inverted_index
|
||||
.read_postings_from_terminfo(&term_info, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap();
|
||||
assert_eq!(postings.doc_freq(), DocFreq::Exact(2));
|
||||
let mut postings = inverted_index
|
||||
.read_postings_from_terminfo(&term_info, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(postings.term_freq(), 1);
|
||||
let mut output = Vec::new();
|
||||
let mut output = vec![];
|
||||
postings.positions(&mut output);
|
||||
assert_eq!(output, vec![1]);
|
||||
postings.advance();
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
use std::io;
|
||||
use std::marker::PhantomData;
|
||||
use std::sync::Arc;
|
||||
|
||||
use columnar::{
|
||||
@@ -9,8 +7,6 @@ use common::ReadOnlyBitSet;
|
||||
use itertools::Itertools;
|
||||
use measure_time::debug_time;
|
||||
|
||||
use crate::codec::postings::PostingsCodec;
|
||||
use crate::codec::{Codec, StandardCodec};
|
||||
use crate::directory::WritePtr;
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::error::DataCorruption;
|
||||
@@ -19,8 +15,8 @@ use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer,
|
||||
use crate::index::{Segment, SegmentComponent, SegmentReader};
|
||||
use crate::indexer::doc_id_mapping::{MappingType, SegmentDocIdMapping};
|
||||
use crate::indexer::SegmentSerializer;
|
||||
use crate::postings::{InvertedIndexSerializer, Postings, TermInfo};
|
||||
use crate::schema::{value_type_to_column_type, Field, FieldType, IndexRecordOption, Schema};
|
||||
use crate::postings::{InvertedIndexSerializer, Postings, SegmentPostings};
|
||||
use crate::schema::{value_type_to_column_type, Field, FieldType, Schema};
|
||||
use crate::store::StoreWriter;
|
||||
use crate::termdict::{TermMerger, TermOrdinal};
|
||||
use crate::{DocAddress, DocId, InvertedIndexReader};
|
||||
@@ -31,7 +27,7 @@ use crate::{DocAddress, DocId, InvertedIndexReader};
|
||||
pub const MAX_DOC_LIMIT: u32 = 1 << 31;
|
||||
|
||||
fn estimate_total_num_tokens_in_single_segment(
|
||||
reader: &dyn SegmentReader,
|
||||
reader: &SegmentReader,
|
||||
field: Field,
|
||||
) -> crate::Result<u64> {
|
||||
// There are no deletes. We can simply use the exact value saved into the posting list.
|
||||
@@ -43,7 +39,7 @@ fn estimate_total_num_tokens_in_single_segment(
|
||||
|
||||
// When there are deletes, we use an approximation either
|
||||
// by using the fieldnorm.
|
||||
if let Ok(fieldnorm_reader) = reader.get_fieldnorms_reader(field) {
|
||||
if let Some(fieldnorm_reader) = reader.fieldnorms_readers().get_field(field)? {
|
||||
let mut count: [usize; 256] = [0; 256];
|
||||
for doc in reader.doc_ids_alive() {
|
||||
let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc);
|
||||
@@ -72,23 +68,18 @@ fn estimate_total_num_tokens_in_single_segment(
|
||||
Ok((segment_num_tokens as f64 * ratio) as u64)
|
||||
}
|
||||
|
||||
fn estimate_total_num_tokens(
|
||||
readers: &[Arc<dyn SegmentReader>],
|
||||
field: Field,
|
||||
) -> crate::Result<u64> {
|
||||
fn estimate_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::Result<u64> {
|
||||
let mut total_num_tokens: u64 = 0;
|
||||
for reader in readers {
|
||||
total_num_tokens += estimate_total_num_tokens_in_single_segment(reader.as_ref(), field)?;
|
||||
total_num_tokens += estimate_total_num_tokens_in_single_segment(reader, field)?;
|
||||
}
|
||||
Ok(total_num_tokens)
|
||||
}
|
||||
|
||||
pub struct IndexMerger<C: Codec = StandardCodec> {
|
||||
pub struct IndexMerger {
|
||||
schema: Schema,
|
||||
pub(crate) readers: Vec<Arc<dyn SegmentReader>>,
|
||||
pub(crate) readers: Vec<SegmentReader>,
|
||||
max_doc: u32,
|
||||
codec: C,
|
||||
phantom: PhantomData<C>,
|
||||
}
|
||||
|
||||
struct DeltaComputer {
|
||||
@@ -153,8 +144,8 @@ fn extract_fast_field_required_columns(schema: &Schema) -> Vec<(String, ColumnTy
|
||||
.collect()
|
||||
}
|
||||
|
||||
impl<C: Codec> IndexMerger<C> {
|
||||
pub fn open(schema: Schema, segments: &[Segment<C>]) -> crate::Result<IndexMerger<C>> {
|
||||
impl IndexMerger {
|
||||
pub fn open(schema: Schema, segments: &[Segment]) -> crate::Result<IndexMerger> {
|
||||
let alive_bitset = segments.iter().map(|_| None).collect_vec();
|
||||
Self::open_with_custom_alive_set(schema, segments, alive_bitset)
|
||||
}
|
||||
@@ -171,24 +162,16 @@ impl<C: Codec> IndexMerger<C> {
|
||||
// This can be used to merge but also apply an additional filter.
|
||||
// One use case is demux, which is basically taking a list of
|
||||
// segments and partitions them e.g. by a value in a field.
|
||||
//
|
||||
// # Panics if segments is empty.
|
||||
pub fn open_with_custom_alive_set(
|
||||
schema: Schema,
|
||||
segments: &[Segment<C>],
|
||||
segments: &[Segment],
|
||||
alive_bitset_opt: Vec<Option<AliveBitSet>>,
|
||||
) -> crate::Result<IndexMerger<C>> {
|
||||
assert!(!segments.is_empty());
|
||||
let codec = segments[0].index().codec().clone();
|
||||
) -> crate::Result<IndexMerger> {
|
||||
let mut readers = vec![];
|
||||
for (segment, new_alive_bitset_opt) in segments.iter().zip(alive_bitset_opt) {
|
||||
if segment.meta().num_docs() > 0 {
|
||||
let reader = segment.index().codec().open_segment_reader(
|
||||
segment.index().directory(),
|
||||
segment.meta(),
|
||||
segment.schema(),
|
||||
new_alive_bitset_opt,
|
||||
)?;
|
||||
let reader =
|
||||
SegmentReader::open_with_custom_alive_set(segment, new_alive_bitset_opt)?;
|
||||
readers.push(reader);
|
||||
}
|
||||
}
|
||||
@@ -206,8 +189,6 @@ impl<C: Codec> IndexMerger<C> {
|
||||
schema,
|
||||
readers,
|
||||
max_doc,
|
||||
codec,
|
||||
phantom: PhantomData,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -281,7 +262,7 @@ impl<C: Codec> IndexMerger<C> {
|
||||
}),
|
||||
);
|
||||
|
||||
let has_deletes: bool = self.readers.iter().any(|reader| reader.has_deletes());
|
||||
let has_deletes: bool = self.readers.iter().any(SegmentReader::has_deletes);
|
||||
let mapping_type = if has_deletes {
|
||||
MappingType::StackedWithDeletes
|
||||
} else {
|
||||
@@ -316,7 +297,7 @@ impl<C: Codec> IndexMerger<C> {
|
||||
|
||||
let mut max_term_ords: Vec<TermOrdinal> = Vec::new();
|
||||
|
||||
let field_readers: Vec<Arc<dyn InvertedIndexReader>> = self
|
||||
let field_readers: Vec<Arc<InvertedIndexReader>> = self
|
||||
.readers
|
||||
.iter()
|
||||
.map(|reader| reader.inverted_index(indexed_field))
|
||||
@@ -374,10 +355,7 @@ impl<C: Codec> IndexMerger<C> {
|
||||
indexed. Have you modified the schema?",
|
||||
);
|
||||
|
||||
let mut segment_postings_containing_the_term: Vec<(
|
||||
usize,
|
||||
<C::PostingsCodec as PostingsCodec>::Postings,
|
||||
)> = Vec::with_capacity(self.readers.len());
|
||||
let mut segment_postings_containing_the_term: Vec<(usize, SegmentPostings)> = vec![];
|
||||
|
||||
while merged_terms.advance() {
|
||||
segment_postings_containing_the_term.clear();
|
||||
@@ -388,16 +366,18 @@ impl<C: Codec> IndexMerger<C> {
|
||||
// Let's compute the list of non-empty posting lists
|
||||
for (segment_ord, term_info) in merged_terms.current_segment_ords_and_term_infos() {
|
||||
let segment_reader = &self.readers[segment_ord];
|
||||
let inverted_index = &field_readers[segment_ord];
|
||||
if let Some((doc_freq, postings)) = postings_for_merge::<C>(
|
||||
inverted_index.as_ref(),
|
||||
&self.codec,
|
||||
&term_info,
|
||||
segment_postings_option,
|
||||
segment_reader.alive_bitset(),
|
||||
)? {
|
||||
let inverted_index: &InvertedIndexReader = &field_readers[segment_ord];
|
||||
let segment_postings = inverted_index
|
||||
.read_postings_from_terminfo(&term_info, segment_postings_option)?;
|
||||
let alive_bitset_opt = segment_reader.alive_bitset();
|
||||
let doc_freq = if let Some(alive_bitset) = alive_bitset_opt {
|
||||
segment_postings.doc_freq_given_deletes(alive_bitset)
|
||||
} else {
|
||||
segment_postings.doc_freq()
|
||||
};
|
||||
if doc_freq > 0u32 {
|
||||
total_doc_freq += doc_freq;
|
||||
segment_postings_containing_the_term.push((segment_ord, postings));
|
||||
segment_postings_containing_the_term.push((segment_ord, segment_postings));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -415,7 +395,11 @@ impl<C: Codec> IndexMerger<C> {
|
||||
assert!(!segment_postings_containing_the_term.is_empty());
|
||||
|
||||
let has_term_freq = {
|
||||
let has_term_freq = segment_postings_containing_the_term[0].1.has_freq();
|
||||
let has_term_freq = !segment_postings_containing_the_term[0]
|
||||
.1
|
||||
.block_cursor
|
||||
.freqs()
|
||||
.is_empty();
|
||||
for (_, postings) in &segment_postings_containing_the_term[1..] {
|
||||
// This may look at a strange way to test whether we have term freq or not.
|
||||
// With JSON object, the schema is not sufficient to know whether a term
|
||||
@@ -431,7 +415,7 @@ impl<C: Codec> IndexMerger<C> {
|
||||
//
|
||||
// Overall the reliable way to know if we have actual frequencies loaded or not
|
||||
// is to check whether the actual decoded array is empty or not.
|
||||
if postings.has_freq() != has_term_freq {
|
||||
if has_term_freq == postings.block_cursor.freqs().is_empty() {
|
||||
return Err(DataCorruption::comment_only(
|
||||
"Term freqs are inconsistent across segments",
|
||||
)
|
||||
@@ -506,7 +490,33 @@ impl<C: Codec> IndexMerger<C> {
|
||||
debug_time!("write-storable-fields");
|
||||
debug!("write-storable-field");
|
||||
|
||||
store_writer.merge_segment_readers(&self.readers)?;
|
||||
for reader in &self.readers {
|
||||
let store_reader = reader.get_store_reader(1)?;
|
||||
if reader.has_deletes()
|
||||
// If there is not enough data in the store, we avoid stacking in order to
|
||||
// avoid creating many small blocks in the doc store. Once we have 5 full blocks,
|
||||
// we start stacking. In the worst case 2/7 of the blocks would be very small.
|
||||
// [segment 1 - {1 doc}][segment 2 - {fullblock * 5}{1doc}]
|
||||
// => 5 * full blocks, 2 * 1 document blocks
|
||||
//
|
||||
// In a more realistic scenario the segments are of the same size, so 1/6 of
|
||||
// the doc stores would be on average half full, given total randomness (which
|
||||
// is not the case here, but not sure how it behaves exactly).
|
||||
//
|
||||
// https://github.com/quickwit-oss/tantivy/issues/1053
|
||||
//
|
||||
// take 7 in order to not walk over all checkpoints.
|
||||
|| store_reader.block_checkpoints().take(7).count() < 6
|
||||
|| store_reader.decompressor() != store_writer.compressor().into()
|
||||
{
|
||||
for doc_bytes_res in store_reader.iter_raw(reader.alive_bitset()) {
|
||||
let doc_bytes = doc_bytes_res?;
|
||||
store_writer.store_bytes(&doc_bytes)?;
|
||||
}
|
||||
} else {
|
||||
store_writer.stack(store_reader)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -515,7 +525,7 @@ impl<C: Codec> IndexMerger<C> {
|
||||
///
|
||||
/// # Returns
|
||||
/// The number of documents in the resulting segment.
|
||||
pub fn write(&self, mut serializer: SegmentSerializer<C>) -> crate::Result<u32> {
|
||||
pub fn write(&self, mut serializer: SegmentSerializer) -> crate::Result<u32> {
|
||||
let doc_id_mapping = self.get_doc_id_from_concatenated_data()?;
|
||||
debug!("write-fieldnorms");
|
||||
if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
|
||||
@@ -543,77 +553,6 @@ impl<C: Codec> IndexMerger<C> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute the number of non-deleted documents.
|
||||
///
|
||||
/// This method will clone and scan through the posting lists.
|
||||
/// (this is a rather expensive operation).
|
||||
pub(crate) fn doc_freq_given_deletes<P: Postings + Clone>(
|
||||
postings: &P,
|
||||
alive_bitset: &AliveBitSet,
|
||||
) -> u32 {
|
||||
let mut postings = postings.clone();
|
||||
let mut doc_freq = 0;
|
||||
loop {
|
||||
let doc = postings.doc();
|
||||
if doc == TERMINATED {
|
||||
return doc_freq;
|
||||
}
|
||||
if alive_bitset.is_alive(doc) {
|
||||
doc_freq += 1u32;
|
||||
}
|
||||
postings.advance();
|
||||
}
|
||||
}
|
||||
|
||||
fn read_postings_for_merge<C: Codec>(
|
||||
inverted_index: &dyn InvertedIndexReader,
|
||||
codec: &C,
|
||||
term_info: &TermInfo,
|
||||
option: IndexRecordOption,
|
||||
) -> io::Result<<C::PostingsCodec as PostingsCodec>::Postings> {
|
||||
codec.load_postings_typed(inverted_index, term_info, option)
|
||||
}
|
||||
|
||||
fn postings_for_merge<C: Codec>(
|
||||
inverted_index: &dyn InvertedIndexReader,
|
||||
codec: &C,
|
||||
term_info: &TermInfo,
|
||||
option: IndexRecordOption,
|
||||
alive_bitset_opt: Option<&AliveBitSet>,
|
||||
) -> io::Result<Option<(u32, <C::PostingsCodec as PostingsCodec>::Postings)>> {
|
||||
let postings = read_postings_for_merge(inverted_index, codec, term_info, option)?;
|
||||
let doc_freq = if let Some(alive_bitset) = alive_bitset_opt {
|
||||
doc_freq_given_deletes(&postings, alive_bitset)
|
||||
} else {
|
||||
// We do not need an exact document frequency here.
|
||||
match postings.doc_freq() {
|
||||
crate::postings::DocFreq::Exact(doc_freq) => doc_freq,
|
||||
crate::postings::DocFreq::Approximate(_) => exact_doc_freq(&postings),
|
||||
}
|
||||
};
|
||||
|
||||
if doc_freq == 0u32 {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
Ok(Some((doc_freq, postings)))
|
||||
}
|
||||
|
||||
/// If the postings is not able to inform us of the document frequency,
|
||||
/// we just scan through it.
|
||||
pub(crate) fn exact_doc_freq<P: Postings + Clone>(postings: &P) -> u32 {
|
||||
let mut postings = postings.clone();
|
||||
let mut doc_freq = 0;
|
||||
loop {
|
||||
let doc = postings.doc();
|
||||
if doc == TERMINATED {
|
||||
return doc_freq;
|
||||
}
|
||||
doc_freq += 1u32;
|
||||
postings.advance();
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
@@ -622,16 +561,12 @@ mod tests {
|
||||
use proptest::strategy::Strategy;
|
||||
use schema::FAST;
|
||||
|
||||
use crate::codec::postings::PostingsCodec;
|
||||
use crate::codec::standard::postings::StandardPostingsCodec;
|
||||
use crate::collector::tests::{
|
||||
BytesFastFieldTestCollector, FastFieldTestCollector, TEST_COLLECTOR_WITH_SCORE,
|
||||
};
|
||||
use crate::collector::{Count, FacetCollector};
|
||||
use crate::fastfield::AliveBitSet;
|
||||
use crate::index::{Index, SegmentId};
|
||||
use crate::indexer::NoMergePolicy;
|
||||
use crate::postings::{DocFreq, Postings as _};
|
||||
use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery};
|
||||
use crate::schema::{
|
||||
Facet, FacetOptions, IndexRecordOption, NumericOptions, TantivyDocument, Term,
|
||||
@@ -746,32 +681,32 @@ mod tests {
|
||||
);
|
||||
}
|
||||
{
|
||||
let doc = searcher.doc(DocAddress::new(0, 0))?;
|
||||
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 0))?;
|
||||
assert_eq!(
|
||||
doc.get_first(text_field).unwrap().as_value().as_str(),
|
||||
Some("af b")
|
||||
);
|
||||
}
|
||||
{
|
||||
let doc = searcher.doc(DocAddress::new(0, 1))?;
|
||||
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 1))?;
|
||||
assert_eq!(
|
||||
doc.get_first(text_field).unwrap().as_value().as_str(),
|
||||
Some("a b c")
|
||||
);
|
||||
}
|
||||
{
|
||||
let doc = searcher.doc(DocAddress::new(0, 2))?;
|
||||
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 2))?;
|
||||
assert_eq!(
|
||||
doc.get_first(text_field).unwrap().as_value().as_str(),
|
||||
Some("a b c d")
|
||||
);
|
||||
}
|
||||
{
|
||||
let doc = searcher.doc(DocAddress::new(0, 3))?;
|
||||
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 3))?;
|
||||
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("af b"));
|
||||
}
|
||||
{
|
||||
let doc = searcher.doc(DocAddress::new(0, 4))?;
|
||||
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 4))?;
|
||||
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c g"));
|
||||
}
|
||||
|
||||
@@ -1583,10 +1518,10 @@ mod tests {
|
||||
let searcher = reader.searcher();
|
||||
let mut term_scorer = term_query
|
||||
.specialized_weight(EnableScoring::enabled_from_searcher(&searcher))?
|
||||
.term_scorer_for_test(searcher.segment_reader(0u32), 1.0)
|
||||
.term_scorer_for_test(searcher.segment_reader(0u32), 1.0)?
|
||||
.unwrap();
|
||||
assert_eq!(term_scorer.doc(), 0);
|
||||
assert_nearly_equals!(term_scorer.seek_block_max(0), 0.0079681855);
|
||||
assert_nearly_equals!(term_scorer.block_max_score(), 0.0079681855);
|
||||
assert_nearly_equals!(term_scorer.score(), 0.0079681855);
|
||||
for _ in 0..81 {
|
||||
writer.add_document(doc!(text=>"hello happy tax payer"))?;
|
||||
@@ -1599,13 +1534,13 @@ mod tests {
|
||||
for segment_reader in searcher.segment_readers() {
|
||||
let mut term_scorer = term_query
|
||||
.specialized_weight(EnableScoring::enabled_from_searcher(&searcher))?
|
||||
.term_scorer_for_test(segment_reader.as_ref(), 1.0)
|
||||
.term_scorer_for_test(segment_reader, 1.0)?
|
||||
.unwrap();
|
||||
// the difference compared to before is intrinsic to the bm25 formula. no worries
|
||||
// there.
|
||||
for doc in segment_reader.doc_ids_alive() {
|
||||
assert_eq!(term_scorer.doc(), doc);
|
||||
assert_nearly_equals!(term_scorer.seek_block_max(doc), 0.003478312);
|
||||
assert_nearly_equals!(term_scorer.block_max_score(), 0.003478312);
|
||||
assert_nearly_equals!(term_scorer.score(), 0.003478312);
|
||||
term_scorer.advance();
|
||||
}
|
||||
@@ -1625,12 +1560,12 @@ mod tests {
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
let mut term_scorer = term_query
|
||||
.specialized_weight(EnableScoring::enabled_from_searcher(&searcher))?
|
||||
.term_scorer_for_test(segment_reader, 1.0)
|
||||
.term_scorer_for_test(segment_reader, 1.0)?
|
||||
.unwrap();
|
||||
// the difference compared to before is intrinsic to the bm25 formula. no worries there.
|
||||
for doc in segment_reader.doc_ids_alive() {
|
||||
assert_eq!(term_scorer.doc(), doc);
|
||||
assert_nearly_equals!(term_scorer.seek_block_max(doc), 0.003478312);
|
||||
assert_nearly_equals!(term_scorer.block_max_score(), 0.003478312);
|
||||
assert_nearly_equals!(term_scorer.score(), 0.003478312);
|
||||
term_scorer.advance();
|
||||
}
|
||||
@@ -1644,18 +1579,4 @@ mod tests {
|
||||
assert!(((super::MAX_DOC_LIMIT - 1) as i32) >= 0);
|
||||
assert!((super::MAX_DOC_LIMIT as i32) < 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_doc_freq_given_delete() {
|
||||
let docs =
|
||||
<StandardPostingsCodec as PostingsCodec>::Postings::create_from_docs(&[0, 2, 10]);
|
||||
assert_eq!(docs.doc_freq(), DocFreq::Exact(3));
|
||||
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[2], 12);
|
||||
assert_eq!(super::doc_freq_given_deletes(&docs, &alive_bitset), 2);
|
||||
let all_deleted =
|
||||
AliveBitSet::for_test_from_deleted_docs(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 12);
|
||||
let docs =
|
||||
<StandardPostingsCodec as PostingsCodec>::Postings::create_from_docs(&[0, 2, 10]);
|
||||
assert_eq!(super::doc_freq_given_deletes(&docs, &all_deleted), 0);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,17 +1,16 @@
use super::IndexWriter;
use crate::codec::Codec;
use crate::schema::document::Document;
use crate::{FutureResult, Opstamp, TantivyDocument};

/// A prepared commit
pub struct PreparedCommit<'a, C: Codec, D: Document = TantivyDocument> {
index_writer: &'a mut IndexWriter<C, D>,
pub struct PreparedCommit<'a, D: Document = TantivyDocument> {
index_writer: &'a mut IndexWriter<D>,
payload: Option<String>,
opstamp: Opstamp,
}

impl<'a, C: Codec, D: Document> PreparedCommit<'a, C, D> {
pub(crate) fn new(index_writer: &'a mut IndexWriter<C, D>, opstamp: Opstamp) -> Self {
impl<'a, D: Document> PreparedCommit<'a, D> {
pub(crate) fn new(index_writer: &'a mut IndexWriter<D>, opstamp: Opstamp) -> Self {
Self {
index_writer,
payload: None,

@@ -8,17 +8,17 @@ use crate::store::StoreWriter

/// Segment serializer is in charge of laying out on disk
/// the data accumulated and sorted by the `SegmentWriter`.
pub struct SegmentSerializer<C: crate::codec::Codec> {
segment: Segment<C>,
pub struct SegmentSerializer {
segment: Segment,
pub(crate) store_writer: StoreWriter,
fast_field_write: WritePtr,
fieldnorms_serializer: Option<FieldNormsSerializer>,
postings_serializer: InvertedIndexSerializer,
}

impl<C: crate::codec::Codec> SegmentSerializer<C> {
impl SegmentSerializer {
/// Creates a new `SegmentSerializer`.
pub fn for_segment(mut segment: Segment<C>) -> crate::Result<SegmentSerializer<C>> {
pub fn for_segment(mut segment: Segment) -> crate::Result<SegmentSerializer> {
let settings = segment.index().settings().clone();
let store_writer = {
let store_write = segment.open_write(SegmentComponent::Store)?;
@@ -50,7 +50,7 @@ impl<C: crate::codec::Codec> SegmentSerializer<C> {
self.store_writer.mem_usage()
}

pub fn segment(&self) -> &Segment<C> {
pub fn segment(&self) -> &Segment {
&self.segment
}

@@ -10,13 +10,10 @@ use std::sync::{Arc, RwLock};
|
||||
use rayon::{ThreadPool, ThreadPoolBuilder};
|
||||
|
||||
use super::segment_manager::SegmentManager;
|
||||
use crate::codec::Codec;
|
||||
use crate::core::META_FILEPATH;
|
||||
use crate::directory::{Directory, DirectoryClone, GarbageCollectionResult};
|
||||
use crate::fastfield::AliveBitSet;
|
||||
use crate::index::{
|
||||
CodecConfiguration, Index, IndexMeta, IndexSettings, Segment, SegmentId, SegmentMeta,
|
||||
};
|
||||
use crate::index::{Index, IndexMeta, IndexSettings, Segment, SegmentId, SegmentMeta};
|
||||
use crate::indexer::delete_queue::DeleteCursor;
|
||||
use crate::indexer::index_writer::advance_deletes;
|
||||
use crate::indexer::merge_operation::MergeOperationInventory;
|
||||
@@ -64,10 +61,10 @@ pub(crate) fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate:
|
||||
// We voluntarily pass a merge_operation ref to guarantee that
|
||||
// the merge_operation is alive during the process
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct SegmentUpdater<C: Codec>(Arc<InnerSegmentUpdater<C>>);
|
||||
pub(crate) struct SegmentUpdater(Arc<InnerSegmentUpdater>);
|
||||
|
||||
impl<C: Codec> Deref for SegmentUpdater<C> {
|
||||
type Target = InnerSegmentUpdater<C>;
|
||||
impl Deref for SegmentUpdater {
|
||||
type Target = InnerSegmentUpdater;
|
||||
|
||||
#[inline]
|
||||
fn deref(&self) -> &Self::Target {
|
||||
@@ -75,8 +72,8 @@ impl<C: Codec> Deref for SegmentUpdater<C> {
|
||||
}
|
||||
}
|
||||
|
||||
fn garbage_collect_files<C: Codec>(
|
||||
segment_updater: SegmentUpdater<C>,
|
||||
fn garbage_collect_files(
|
||||
segment_updater: SegmentUpdater,
|
||||
) -> crate::Result<GarbageCollectionResult> {
|
||||
info!("Running garbage collection");
|
||||
let mut index = segment_updater.index.clone();
|
||||
@@ -87,8 +84,8 @@ fn garbage_collect_files<C: Codec>(
|
||||
|
||||
/// Merges a list of segments the list of segment givens in the `segment_entries`.
|
||||
/// This function happens in the calling thread and is computationally expensive.
|
||||
fn merge<Codec: crate::codec::Codec>(
|
||||
index: &Index<Codec>,
|
||||
fn merge(
|
||||
index: &Index,
|
||||
mut segment_entries: Vec<SegmentEntry>,
|
||||
target_opstamp: Opstamp,
|
||||
) -> crate::Result<Option<SegmentEntry>> {
|
||||
@@ -111,13 +108,13 @@ fn merge<Codec: crate::codec::Codec>(
|
||||
|
||||
let delete_cursor = segment_entries[0].delete_cursor().clone();
|
||||
|
||||
let segments: Vec<Segment<Codec>> = segment_entries
|
||||
let segments: Vec<Segment> = segment_entries
|
||||
.iter()
|
||||
.map(|segment_entry| index.segment(segment_entry.meta().clone()))
|
||||
.collect();
|
||||
|
||||
// An IndexMerger is like a "view" of our merged segments.
|
||||
let merger: IndexMerger<Codec> = IndexMerger::open(index.schema(), &segments[..])?;
|
||||
let merger: IndexMerger = IndexMerger::open(index.schema(), &segments[..])?;
|
||||
|
||||
// ... we just serialize this index merger in our new segment to merge the segments.
|
||||
let segment_serializer = SegmentSerializer::for_segment(merged_segment.clone())?;
|
||||
@@ -142,10 +139,10 @@ fn merge<Codec: crate::codec::Codec>(
|
||||
/// meant to work if you have an `IndexWriter` running for the origin indices, or
|
||||
/// the destination `Index`.
|
||||
#[doc(hidden)]
|
||||
pub fn merge_indices<Codec: crate::codec::Codec>(
|
||||
indices: &[Index<Codec>],
|
||||
output_directory: Box<dyn Directory>,
|
||||
) -> crate::Result<Index<Codec>> {
|
||||
pub fn merge_indices<T: Into<Box<dyn Directory>>>(
|
||||
indices: &[Index],
|
||||
output_directory: T,
|
||||
) -> crate::Result<Index> {
|
||||
if indices.is_empty() {
|
||||
// If there are no indices to merge, there is no need to do anything.
|
||||
return Err(crate::TantivyError::InvalidArgument(
|
||||
@@ -166,7 +163,7 @@ pub fn merge_indices<Codec: crate::codec::Codec>(
|
||||
));
|
||||
}
|
||||
|
||||
let mut segments: Vec<Segment<Codec>> = Vec::new();
|
||||
let mut segments: Vec<Segment> = Vec::new();
|
||||
for index in indices {
|
||||
segments.extend(index.searchable_segments()?);
|
||||
}
|
||||
@@ -188,12 +185,12 @@ pub fn merge_indices<Codec: crate::codec::Codec>(
|
||||
/// meant to work if you have an `IndexWriter` running for the origin indices, or
|
||||
/// the destination `Index`.
|
||||
#[doc(hidden)]
|
||||
pub fn merge_filtered_segments<C: crate::codec::Codec, T: Into<Box<dyn Directory>>>(
|
||||
segments: &[Segment<C>],
|
||||
pub fn merge_filtered_segments<T: Into<Box<dyn Directory>>>(
|
||||
segments: &[Segment],
|
||||
target_settings: IndexSettings,
|
||||
filter_doc_ids: Vec<Option<AliveBitSet>>,
|
||||
output_directory: T,
|
||||
) -> crate::Result<Index<C>> {
|
||||
) -> crate::Result<Index> {
|
||||
if segments.is_empty() {
|
||||
// If there are no indices to merge, there is no need to do anything.
|
||||
return Err(crate::TantivyError::InvalidArgument(
|
||||
@@ -214,15 +211,14 @@ pub fn merge_filtered_segments<C: crate::codec::Codec, T: Into<Box<dyn Directory
|
||||
));
|
||||
}
|
||||
|
||||
let mut merged_index: Index<C> = Index::builder()
|
||||
.schema(target_schema.clone())
|
||||
.codec(segments[0].index().codec().clone())
|
||||
.settings(target_settings.clone())
|
||||
.create(output_directory.into())?;
|
||||
|
||||
let mut merged_index = Index::create(
|
||||
output_directory,
|
||||
target_schema.clone(),
|
||||
target_settings.clone(),
|
||||
)?;
|
||||
let merged_segment = merged_index.new_segment();
|
||||
let merged_segment_id = merged_segment.id();
|
||||
let merger: IndexMerger<C> =
|
||||
let merger: IndexMerger =
|
||||
IndexMerger::open_with_custom_alive_set(merged_index.schema(), segments, filter_doc_ids)?;
|
||||
let segment_serializer = SegmentSerializer::for_segment(merged_segment)?;
|
||||
let num_docs = merger.write(segment_serializer)?;
|
||||
@@ -239,7 +235,6 @@ pub fn merge_filtered_segments<C: crate::codec::Codec, T: Into<Box<dyn Directory
|
||||
))
|
||||
.trim_end()
|
||||
);
|
||||
let codec_configuration = CodecConfiguration::from(segments[0].index().codec());
|
||||
|
||||
let index_meta = IndexMeta {
|
||||
index_settings: target_settings, // index_settings of all segments should be the same
|
||||
@@ -247,7 +242,6 @@ pub fn merge_filtered_segments<C: crate::codec::Codec, T: Into<Box<dyn Directory
|
||||
schema: target_schema,
|
||||
opstamp: 0u64,
|
||||
payload: Some(stats),
|
||||
codec: codec_configuration,
|
||||
};
|
||||
|
||||
// save the meta.json
|
||||
@@ -256,7 +250,7 @@ pub fn merge_filtered_segments<C: crate::codec::Codec, T: Into<Box<dyn Directory
|
||||
Ok(merged_index)
|
||||
}
|
||||
|
||||
pub(crate) struct InnerSegmentUpdater<C: Codec> {
|
||||
pub(crate) struct InnerSegmentUpdater {
|
||||
// we keep a copy of the current active IndexMeta to
|
||||
// avoid loading the file every time we need it in the
|
||||
// `SegmentUpdater`.
|
||||
@@ -267,7 +261,7 @@ pub(crate) struct InnerSegmentUpdater<C: Codec> {
|
||||
pool: ThreadPool,
|
||||
merge_thread_pool: ThreadPool,
|
||||
|
||||
index: Index<C>,
|
||||
index: Index,
|
||||
segment_manager: SegmentManager,
|
||||
merge_policy: RwLock<Arc<dyn MergePolicy>>,
|
||||
killed: AtomicBool,
|
||||
@@ -275,13 +269,13 @@ pub(crate) struct InnerSegmentUpdater<C: Codec> {
|
||||
merge_operations: MergeOperationInventory,
|
||||
}
|
||||
|
||||
impl<Codec: crate::codec::Codec> SegmentUpdater<Codec> {
|
||||
impl SegmentUpdater {
|
||||
pub fn create(
|
||||
index: Index<Codec>,
|
||||
index: Index,
|
||||
stamper: Stamper,
|
||||
delete_cursor: &DeleteCursor,
|
||||
num_merge_threads: usize,
|
||||
) -> crate::Result<Self> {
|
||||
) -> crate::Result<SegmentUpdater> {
|
||||
let segments = index.searchable_segment_metas()?;
|
||||
let segment_manager = SegmentManager::from_segments(segments, delete_cursor);
|
||||
let pool = ThreadPoolBuilder::new()
|
||||
@@ -409,15 +403,14 @@ impl<Codec: crate::codec::Codec> SegmentUpdater<Codec> {
|
||||
// from the different drives.
|
||||
//
|
||||
// Segment 1 from disk 1, Segment 1 from disk 2, etc.
|
||||
committed_segment_metas.sort_by_key(|segment_meta| -(segment_meta.max_doc() as i32));
|
||||
let codec = CodecConfiguration::from(index.codec());
|
||||
committed_segment_metas
|
||||
.sort_by_key(|segment_meta| std::cmp::Reverse(segment_meta.max_doc()));
|
||||
let index_meta = IndexMeta {
|
||||
index_settings: index.settings().clone(),
|
||||
segments: committed_segment_metas,
|
||||
schema: index.schema(),
|
||||
opstamp,
|
||||
payload: commit_message,
|
||||
codec,
|
||||
};
|
||||
// TODO add context to the error.
|
||||
save_metas(&index_meta, directory.box_clone().borrow_mut())?;
|
||||
@@ -451,7 +444,7 @@ impl<Codec: crate::codec::Codec> SegmentUpdater<Codec> {
|
||||
opstamp: Opstamp,
|
||||
payload: Option<String>,
|
||||
) -> FutureResult<Opstamp> {
|
||||
let segment_updater: SegmentUpdater<Codec> = self.clone();
|
||||
let segment_updater: SegmentUpdater = self.clone();
|
||||
self.schedule_task(move || {
|
||||
let segment_entries = segment_updater.purge_deletes(opstamp)?;
|
||||
segment_updater.segment_manager.commit(segment_entries);
|
||||
@@ -710,10 +703,10 @@ impl<Codec: crate::codec::Codec> SegmentUpdater<Codec> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::merge_indices;
|
||||
use crate::codec::StandardCodec;
|
||||
use crate::collector::TopDocs;
|
||||
use crate::directory::RamDirectory;
|
||||
use crate::fastfield::AliveBitSet;
|
||||
use crate::index::{SegmentId, SegmentMetaInventory};
|
||||
use crate::indexer::merge_policy::tests::MergeWheneverPossible;
|
||||
use crate::indexer::merger::IndexMerger;
|
||||
use crate::indexer::segment_updater::merge_filtered_segments;
|
||||
@@ -721,6 +714,22 @@ mod tests {
use crate::schema::*;
use crate::{Directory, DocAddress, Index, Segment};

#[test]
fn test_segment_sort_large_max_doc() {
// Regression test: -(max_doc as i32) overflows for max_doc >= 2^31.
// Using std::cmp::Reverse avoids this.
let inventory = SegmentMetaInventory::default();
let mut metas = vec![
inventory.new_segment_meta(SegmentId::generate_random(), 100),
inventory.new_segment_meta(SegmentId::generate_random(), (1u32 << 31) - 1),
inventory.new_segment_meta(SegmentId::generate_random(), 50_000),
];
metas.sort_by_key(|m| std::cmp::Reverse(m.max_doc()));
assert_eq!(metas[0].max_doc(), (1u32 << 31) - 1);
assert_eq!(metas[1].max_doc(), 50_000);
assert_eq!(metas[2].max_doc(), 100);
}

#[test]
fn test_delete_during_merge() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
@@ -924,7 +933,7 @@ mod tests {

#[test]
fn test_merge_empty_indices_array() {
let merge_result = merge_indices::<StandardCodec>(&[], Box::new(RamDirectory::default()));
let merge_result = merge_indices(&[], RamDirectory::default());
assert!(merge_result.is_err());
}

@@ -951,10 +960,7 @@ mod tests {
};

// mismatched schema index list
let result = merge_indices(
&[first_index, second_index],
Box::new(RamDirectory::default()),
);
let result = merge_indices(&[first_index, second_index], RamDirectory::default());
assert!(result.is_err());

Ok(())

@@ -4,7 +4,6 @@ use itertools::Itertools;
|
||||
use tokenizer_api::BoxTokenStream;
|
||||
|
||||
use super::operation::AddOperation;
|
||||
use crate::codec::Codec;
|
||||
use crate::fastfield::FastFieldsWriter;
|
||||
use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter};
|
||||
use crate::index::{Segment, SegmentComponent};
|
||||
@@ -13,7 +12,7 @@ use crate::indexer::segment_serializer::SegmentSerializer;
|
||||
use crate::json_utils::{index_json_value, IndexingPositionsPerPath};
|
||||
use crate::postings::{
|
||||
compute_table_memory_size, serialize_postings, IndexingContext, IndexingPosition,
|
||||
PerFieldPostingsWriter, PostingsWriter, PostingsWriterEnum,
|
||||
PerFieldPostingsWriter, PostingsWriter,
|
||||
};
|
||||
use crate::schema::document::{Document, Value};
|
||||
use crate::schema::{FieldEntry, FieldType, Schema, DATE_TIME_PRECISION_INDEXED};
|
||||
@@ -46,11 +45,11 @@ fn compute_initial_table_size(per_thread_memory_budget: usize) -> crate::Result<
|
||||
///
|
||||
/// They creates the postings list in anonymous memory.
|
||||
/// The segment is laid on disk when the segment gets `finalized`.
|
||||
pub struct SegmentWriter<Codec: crate::codec::Codec> {
|
||||
pub struct SegmentWriter {
|
||||
pub(crate) max_doc: DocId,
|
||||
pub(crate) ctx: IndexingContext,
|
||||
pub(crate) per_field_postings_writers: PerFieldPostingsWriter,
|
||||
pub(crate) segment_serializer: SegmentSerializer<Codec>,
|
||||
pub(crate) segment_serializer: SegmentSerializer,
|
||||
pub(crate) fast_field_writers: FastFieldsWriter,
|
||||
pub(crate) fieldnorms_writer: FieldNormsWriter,
|
||||
pub(crate) json_path_writer: JsonPathWriter,
|
||||
@@ -61,7 +60,7 @@ pub struct SegmentWriter<Codec: crate::codec::Codec> {
|
||||
schema: Schema,
|
||||
}
|
||||
|
||||
impl<Codec: crate::codec::Codec> SegmentWriter<Codec> {
|
||||
impl SegmentWriter {
|
||||
/// Creates a new `SegmentWriter`
|
||||
///
|
||||
/// The arguments are defined as follows
|
||||
@@ -71,10 +70,7 @@ impl<Codec: crate::codec::Codec> SegmentWriter<Codec> {
|
||||
/// behavior as a memory limit.
|
||||
/// - segment: The segment being written
|
||||
/// - schema
|
||||
pub fn for_segment(
|
||||
memory_budget_in_bytes: usize,
|
||||
segment: Segment<Codec>,
|
||||
) -> crate::Result<Self> {
|
||||
pub fn for_segment(memory_budget_in_bytes: usize, segment: Segment) -> crate::Result<Self> {
|
||||
let schema = segment.schema();
|
||||
let tokenizer_manager = segment.index().tokenizers().clone();
|
||||
let tokenizer_manager_fast_field = segment.index().fast_field_tokenizer().clone();
|
||||
@@ -173,7 +169,7 @@ impl<Codec: crate::codec::Codec> SegmentWriter<Codec> {
|
||||
}
|
||||
|
||||
let (term_buffer, ctx) = (&mut self.term_buffer, &mut self.ctx);
|
||||
let postings_writer: &mut PostingsWriterEnum =
|
||||
let postings_writer: &mut dyn PostingsWriter =
|
||||
self.per_field_postings_writers.get_for_field_mut(field);
|
||||
term_buffer.clear_with_field(field);
|
||||
|
||||
@@ -390,13 +386,13 @@ impl<Codec: crate::codec::Codec> SegmentWriter<Codec> {
|
||||
/// to the `SegmentSerializer`.
|
||||
///
|
||||
/// `doc_id_map` is used to map to the new doc_id order.
|
||||
fn remap_and_write<C: Codec>(
|
||||
fn remap_and_write(
|
||||
schema: Schema,
|
||||
per_field_postings_writers: &PerFieldPostingsWriter,
|
||||
ctx: IndexingContext,
|
||||
fast_field_writers: FastFieldsWriter,
|
||||
fieldnorms_writer: &FieldNormsWriter,
|
||||
mut serializer: SegmentSerializer<C>,
|
||||
mut serializer: SegmentSerializer,
|
||||
) -> crate::Result<()> {
|
||||
debug!("remap-and-write");
|
||||
if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
|
||||
@@ -438,7 +434,7 @@ mod tests {
|
||||
Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, Value,
|
||||
DATE_TIME_PRECISION_INDEXED, FAST, STORED, STRING, TEXT,
|
||||
};
|
||||
use crate::store::{Compressor, StoreWriter, TantivyStoreReader};
|
||||
use crate::store::{Compressor, StoreReader, StoreWriter};
|
||||
use crate::time::format_description::well_known::Rfc3339;
|
||||
use crate::time::OffsetDateTime;
|
||||
use crate::tokenizer::{PreTokenizedString, Token};
|
||||
@@ -486,8 +482,8 @@ mod tests {
|
||||
store_writer.store(&doc, &schema).unwrap();
|
||||
store_writer.close().unwrap();
|
||||
|
||||
let reader = TantivyStoreReader::open(directory.open_read(path).unwrap(), 0).unwrap();
|
||||
let doc = reader.get(0).unwrap();
|
||||
let reader = StoreReader::open(directory.open_read(path).unwrap(), 0).unwrap();
|
||||
let doc = reader.get::<TantivyDocument>(0).unwrap();
|
||||
|
||||
assert_eq!(doc.field_values().count(), 2);
|
||||
assert_eq!(
|
||||
@@ -604,12 +600,16 @@ mod tests {
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let doc = searcher
|
||||
.doc(DocAddress {
|
||||
.doc::<TantivyDocument>(DocAddress {
|
||||
segment_ord: 0u32,
|
||||
doc_id: 0u32,
|
||||
})
|
||||
.unwrap();
|
||||
let serdeser_json_val = doc.to_json(&schema).get("json").unwrap().clone();
|
||||
let serdeser_json_val = serde_json::from_str::<serde_json::Value>(&doc.to_json(&schema))
|
||||
.unwrap()
|
||||
.get("json")
|
||||
.unwrap()[0]
|
||||
.clone();
|
||||
assert_eq!(json_val, serdeser_json_val);
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
let inv_idx = segment_reader.inverted_index(json_field).unwrap();
|
||||
@@ -871,7 +871,7 @@ mod tests {
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
|
||||
fn assert_type(reader: &dyn SegmentReader, field: &str, typ: ColumnType) {
|
||||
fn assert_type(reader: &SegmentReader, field: &str, typ: ColumnType) {
|
||||
let cols = reader.fast_fields().dynamic_column_handles(field).unwrap();
|
||||
assert_eq!(cols.len(), 1, "{field}");
|
||||
assert_eq!(cols[0].column_type(), typ, "{field}");
|
||||
@@ -890,7 +890,7 @@ mod tests {
|
||||
assert_type(segment_reader, "json.my_arr", ColumnType::I64);
|
||||
assert_type(segment_reader, "json.my_arr.my_key", ColumnType::Str);
|
||||
|
||||
fn assert_empty(reader: &dyn SegmentReader, field: &str) {
|
||||
fn assert_empty(reader: &SegmentReader, field: &str) {
|
||||
let cols = reader.fast_fields().dynamic_column_handles(field).unwrap();
|
||||
assert_eq!(cols.len(), 0);
|
||||
}
|
||||
|
||||
@@ -1,7 +1,5 @@
use std::marker::PhantomData;

use crate::codec::StandardCodec;
use crate::index::CodecConfiguration;
use crate::indexer::operation::AddOperation;
use crate::indexer::segment_updater::save_metas;
use crate::indexer::SegmentWriter;
@@ -9,25 +7,22 @@ use crate::schema::document::Document;
use crate::{Directory, Index, IndexMeta, Opstamp, Segment, TantivyDocument};

#[doc(hidden)]
pub struct SingleSegmentIndexWriter<
Codec: crate::codec::Codec = StandardCodec,
D: Document = TantivyDocument,
> {
segment_writer: SegmentWriter<Codec>,
segment: Segment<Codec>,
pub struct SingleSegmentIndexWriter<D: Document = TantivyDocument> {
segment_writer: SegmentWriter,
segment: Segment,
opstamp: Opstamp,
_doc: PhantomData<D>,
_phantom: PhantomData<D>,
}

impl<Codec: crate::codec::Codec, D: Document> SingleSegmentIndexWriter<Codec, D> {
pub fn new(index: Index<Codec>, mem_budget: usize) -> crate::Result<Self> {
impl<D: Document> SingleSegmentIndexWriter<D> {
pub fn new(index: Index, mem_budget: usize) -> crate::Result<Self> {
let segment = index.new_segment();
let segment_writer = SegmentWriter::for_segment(mem_budget, segment.clone())?;
Ok(Self {
segment_writer,
segment,
opstamp: 0,
_doc: PhantomData,
_phantom: PhantomData,
})
}

@@ -42,10 +37,10 @@ impl<Codec: crate::codec::Codec, D: Document> SingleSegmentIndexWriter<Codec, D>
.add_document(AddOperation { opstamp, document })
}

pub fn finalize(self) -> crate::Result<Index<Codec>> {
pub fn finalize(self) -> crate::Result<Index> {
let max_doc = self.segment_writer.max_doc();
self.segment_writer.finalize()?;
let segment: Segment<Codec> = self.segment.with_max_doc(max_doc);
let segment: Segment = self.segment.with_max_doc(max_doc);
let index = segment.index();
let index_meta = IndexMeta {
index_settings: index.settings().clone(),
@@ -53,7 +48,6 @@ impl<Codec: crate::codec::Codec, D: Document> SingleSegmentIndexWriter<Codec, D>
schema: index.schema(),
opstamp: 0,
payload: None,
codec: CodecConfiguration::from(index.codec()),
};
save_metas(&index_meta, index.directory())?;
index.directory().sync_directory()?;

19
src/lib.rs
@@ -93,7 +93,7 @@
|
||||
//!
|
||||
//! for (_score, doc_address) in top_docs {
|
||||
//! // Retrieve the actual content of documents given its `doc_address`.
|
||||
//! let retrieved_doc = searcher.doc(doc_address)?;
|
||||
//! let retrieved_doc = searcher.doc::<TantivyDocument>(doc_address)?;
|
||||
//! println!("{}", retrieved_doc.to_json(&schema));
|
||||
//! }
|
||||
//!
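The hunk above makes the retrieved document type explicit. A minimal sketch of the updated call, assuming an index whose schema contains a stored text field named "title" and at least one document in segment 0 (the field name and address are illustrative, not taken from this diff), and assuming the usual `get_first`/`as_str` accessors:

use tantivy::schema::Value;
use tantivy::{DocAddress, Index, TantivyDocument};

// Fetches the first document of the first segment and reads its "title" field.
// The document type is now passed as an explicit type parameter to `doc`.
fn first_title(index: &Index) -> tantivy::Result<Option<String>> {
    let reader = index.reader()?;
    let searcher = reader.searcher();
    let title = index.schema().get_field("title")?;
    let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0u32, 0u32))?;
    Ok(doc
        .get_first(title)
        .and_then(|value| value.as_str())
        .map(ToOwned::to_owned))
}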
|
||||
@@ -166,14 +166,13 @@ mod functional_test;
|
||||
|
||||
#[macro_use]
|
||||
mod macros;
|
||||
|
||||
/// Tantivy codecs describe how data is laid out on disk.
|
||||
pub mod codec;
|
||||
mod future_result;
|
||||
|
||||
// Re-exports
|
||||
pub use columnar;
|
||||
pub use common::{ByteCount, DateTime};
|
||||
pub use {columnar, query_grammar, time};
|
||||
pub use query_grammar;
|
||||
pub use time;
|
||||
|
||||
pub use crate::error::TantivyError;
|
||||
pub use crate::future_result::FutureResult;
|
||||
@@ -224,11 +223,11 @@ use once_cell::sync::Lazy;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
pub use self::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN, TERMINATED};
|
||||
pub use crate::core::{json_utils, Executor, Searcher, SearcherContext, SearcherGeneration};
|
||||
pub use crate::core::{json_utils, Executor, Searcher, SearcherGeneration};
|
||||
pub use crate::directory::Directory;
|
||||
pub use crate::index::{
|
||||
Index, IndexBuilder, IndexMeta, IndexSettings, InvertedIndexReader, Order, Segment,
|
||||
SegmentMeta, SegmentReader, TantivyInvertedIndexReader, TantivySegmentReader,
|
||||
SegmentMeta, SegmentReader,
|
||||
};
|
||||
pub use crate::indexer::{IndexWriter, SingleSegmentIndexWriter};
|
||||
pub use crate::schema::{Document, TantivyDocument, Term};
|
||||
@@ -548,7 +547,7 @@ pub mod tests {
|
||||
index_writer.commit()?;
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader: &dyn SegmentReader = searcher.segment_reader(0);
|
||||
let segment_reader: &SegmentReader = searcher.segment_reader(0);
|
||||
let fieldnorms_reader = segment_reader.get_fieldnorms_reader(text_field)?;
|
||||
assert_eq!(fieldnorms_reader.fieldnorm(0), 3);
|
||||
assert_eq!(fieldnorms_reader.fieldnorm(1), 0);
|
||||
@@ -556,7 +555,7 @@ pub mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn advance_undeleted(docset: &mut dyn DocSet, reader: &dyn SegmentReader) -> bool {
|
||||
fn advance_undeleted(docset: &mut dyn DocSet, reader: &SegmentReader) -> bool {
|
||||
let mut doc = docset.advance();
|
||||
while doc != TERMINATED {
|
||||
if !reader.is_deleted(doc) {
|
||||
@@ -1073,7 +1072,7 @@ pub mod tests {
|
||||
}
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader: &dyn SegmentReader = searcher.segment_reader(0);
|
||||
let segment_reader: &SegmentReader = searcher.segment_reader(0);
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.fast_fields().u64("text");
|
||||
assert!(fast_field_reader_res.is_err());
|
||||
|
||||
@@ -1,19 +1,28 @@
|
||||
use std::io;
|
||||
|
||||
use common::{OwnedBytes, VInt};
|
||||
use common::VInt;
|
||||
|
||||
use crate::codec::standard::postings::FreqReadingOption;
|
||||
use crate::directory::{FileSlice, OwnedBytes};
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::postings::compression::{BlockDecoder, VIntDecoder as _, COMPRESSION_BLOCK_SIZE};
|
||||
use crate::postings::skip::{BlockInfo, SkipReader};
|
||||
use crate::postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
|
||||
use crate::postings::{BlockInfo, FreqReadingOption, SkipReader};
|
||||
use crate::query::Bm25Weight;
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::{DocId, Score, TERMINATED};
|
||||
|
||||
fn max_score<I: Iterator<Item = Score>>(mut it: I) -> Option<Score> {
|
||||
it.next().map(|first| it.fold(first, Score::max))
|
||||
}
|
||||
|
||||
/// `BlockSegmentPostings` is a cursor iterating over blocks
|
||||
/// of documents.
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// While it is useful for some very specific high-performance
|
||||
/// use cases, you should prefer using `SegmentPostings` for most usage.
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct BlockSegmentPostings {
|
||||
pub struct BlockSegmentPostings {
|
||||
pub(crate) doc_decoder: BlockDecoder,
|
||||
block_loaded: bool,
|
||||
freq_decoder: BlockDecoder,
|
||||
@@ -79,7 +88,7 @@ fn split_into_skips_and_postings(
|
||||
}
|
||||
|
||||
impl BlockSegmentPostings {
|
||||
/// Opens a `StandardPostingsReader`.
|
||||
/// Opens a `BlockSegmentPostings`.
|
||||
/// `doc_freq` is the number of documents in the posting list.
|
||||
/// `record_option` represents the amount of data available according to the schema.
|
||||
/// `requested_option` is the amount of data requested by the user.
|
||||
@@ -87,10 +96,11 @@ impl BlockSegmentPostings {
|
||||
/// term frequency blocks.
|
||||
pub(crate) fn open(
|
||||
doc_freq: u32,
|
||||
bytes: OwnedBytes,
|
||||
data: FileSlice,
|
||||
mut record_option: IndexRecordOption,
|
||||
requested_option: IndexRecordOption,
|
||||
) -> io::Result<BlockSegmentPostings> {
|
||||
let bytes = data.read_bytes()?;
|
||||
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, bytes)?;
|
||||
let skip_reader = match skip_data_opt {
|
||||
Some(skip_data) => {
|
||||
@@ -128,86 +138,6 @@ impl BlockSegmentPostings {
|
||||
block_segment_postings.load_block();
|
||||
Ok(block_segment_postings)
|
||||
}
|
||||
}
|
||||
|
||||
fn max_score<I: Iterator<Item = Score>>(mut it: I) -> Option<Score> {
|
||||
it.next().map(|first| it.fold(first, Score::max))
|
||||
}
|
||||
|
||||
impl BlockSegmentPostings {
|
||||
/// Returns the overall number of documents in the block postings.
/// It does not take into account whether documents are deleted or not.
///
/// This `doc_freq` is simply the sum of the lengths of all the blocks,
/// and it does not take deleted documents into account.
|
||||
pub fn doc_freq(&self) -> u32 {
|
||||
self.doc_freq
|
||||
}
|
||||
|
||||
/// Returns the array of docs in the current block.
|
||||
///
|
||||
/// Before the first call to `.advance()`, the block
|
||||
/// returned by `.docs()` is empty.
|
||||
#[inline]
|
||||
pub fn docs(&self) -> &[DocId] {
|
||||
debug_assert!(self.block_loaded);
|
||||
self.doc_decoder.output_array()
|
||||
}
|
||||
|
||||
/// Return the document at index `idx` of the block.
|
||||
#[inline]
|
||||
pub fn doc(&self, idx: usize) -> u32 {
|
||||
self.doc_decoder.output(idx)
|
||||
}
|
||||
|
||||
/// Return the array of `term freq` in the block.
|
||||
#[inline]
|
||||
pub fn freqs(&self) -> &[u32] {
|
||||
debug_assert!(self.block_loaded);
|
||||
self.freq_decoder.output_array()
|
||||
}
|
||||
|
||||
/// Return the frequency at index `idx` of the block.
|
||||
#[inline]
|
||||
pub fn freq(&self, idx: usize) -> u32 {
|
||||
debug_assert!(self.block_loaded);
|
||||
self.freq_decoder.output(idx)
|
||||
}
|
||||
|
||||
/// Positions the cursor on a block that may contain `target_doc`.
///
/// If all docs are smaller than target, the block loaded may be empty,
/// or be the last, incomplete VInt block.
|
||||
pub fn seek(&mut self, target_doc: DocId) -> usize {
|
||||
// Move to the block that might contain our document.
|
||||
self.seek_block_without_loading(target_doc);
|
||||
self.load_block();
|
||||
|
||||
// At this point we are on the block that might contain our document.
|
||||
let doc = self.doc_decoder.seek_within_block(target_doc);
|
||||
|
||||
// The last block is not full and padded with TERMINATED,
|
||||
// so we are guaranteed to have at least one value (real or padding)
|
||||
// that is >= target_doc.
|
||||
debug_assert!(doc < COMPRESSION_BLOCK_SIZE);
|
||||
|
||||
// `doc` is now the first element >= `target_doc`.
|
||||
// If all docs are smaller than target, the current block is incomplete and padded
|
||||
// with TERMINATED. After the search, the cursor points to the first TERMINATED.
|
||||
doc
|
||||
}
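Taken together, `docs()`, `advance()` and `seek()` above define a block-wise cursor. A minimal sketch of draining one, assuming a cursor obtained via `InvertedIndexReader::read_block_postings_from_terminfo` (as in the tests further down) and that `BlockSegmentPostings` stays reachable through `tantivy::postings`, as the module re-export below suggests:

use tantivy::postings::BlockSegmentPostings;
use tantivy::DocId;

// Walks the cursor block by block and collects every doc id.
// The loop stops on the empty block that follows the last real one.
fn collect_all_docs(block_postings: &mut BlockSegmentPostings) -> Vec<DocId> {
    let mut all_docs = Vec::with_capacity(block_postings.doc_freq() as usize);
    loop {
        let docs = block_postings.docs();
        if docs.is_empty() {
            break;
        }
        all_docs.extend_from_slice(docs);
        block_postings.advance();
    }
    all_docs
}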
|
||||
|
||||
pub fn position_offset(&self) -> u64 {
|
||||
self.skip_reader.position_offset()
|
||||
}
|
||||
|
||||
/// Advance to the next block.
|
||||
pub fn advance(&mut self) {
|
||||
self.skip_reader.advance();
|
||||
self.block_loaded = false;
|
||||
self.block_max_score_cache = None;
|
||||
self.load_block();
|
||||
}
|
||||
|
||||
/// Returns the block_max_score for the current block.
|
||||
/// It does not require the block to be loaded. For instance, it is ok to call this method
|
||||
@@ -230,7 +160,7 @@ impl BlockSegmentPostings {
|
||||
}
|
||||
// this is the last block of the segment posting list.
|
||||
// If it is actually loaded, we can compute block max manually.
|
||||
if self.block_loaded {
|
||||
if self.block_is_loaded() {
|
||||
let docs = self.doc_decoder.output_array().iter().cloned();
|
||||
let freqs = self.freq_decoder.output_array().iter().cloned();
|
||||
let bm25_scores = docs.zip(freqs).map(|(doc, term_freq)| {
|
||||
@@ -247,25 +177,112 @@ impl BlockSegmentPostings {
|
||||
// We do not cache it however, so that it gets computed when once block is loaded.
|
||||
bm25_weight.max_score()
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockSegmentPostings {
|
||||
/// Returns an empty segment postings object
|
||||
pub fn empty() -> BlockSegmentPostings {
|
||||
BlockSegmentPostings {
|
||||
doc_decoder: BlockDecoder::with_val(TERMINATED),
|
||||
block_loaded: true,
|
||||
freq_decoder: BlockDecoder::with_val(1),
|
||||
freq_reading_option: FreqReadingOption::NoFreq,
|
||||
block_max_score_cache: None,
|
||||
doc_freq: 0,
|
||||
data: OwnedBytes::empty(),
|
||||
skip_reader: SkipReader::new(OwnedBytes::empty(), 0, IndexRecordOption::Basic),
|
||||
}
|
||||
pub(crate) fn freq_reading_option(&self) -> FreqReadingOption {
|
||||
self.freq_reading_option
|
||||
}
|
||||
|
||||
pub(crate) fn skip_reader(&self) -> &SkipReader {
|
||||
&self.skip_reader
|
||||
// Resets the block segment postings at another position
|
||||
// in the postings file.
|
||||
//
|
||||
// This is useful for enumerating through a list of terms,
|
||||
// and consuming the associated posting lists while avoiding
|
||||
// reallocating a `BlockSegmentPostings`.
|
||||
//
|
||||
// # Warning
|
||||
//
|
||||
// This does not reset the positions list.
|
||||
pub(crate) fn reset(&mut self, doc_freq: u32, postings_data: OwnedBytes) -> io::Result<()> {
|
||||
let (skip_data_opt, postings_data) =
|
||||
split_into_skips_and_postings(doc_freq, postings_data)?;
|
||||
self.data = postings_data;
|
||||
self.block_max_score_cache = None;
|
||||
self.block_loaded = false;
|
||||
if let Some(skip_data) = skip_data_opt {
|
||||
self.skip_reader.reset(skip_data, doc_freq);
|
||||
} else {
|
||||
self.skip_reader.reset(OwnedBytes::empty(), doc_freq);
|
||||
}
|
||||
self.doc_freq = doc_freq;
|
||||
self.load_block();
|
||||
Ok(())
|
||||
}
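The `reset` method above exists so that one cursor can be reused while enumerating many terms. A sketch of that pattern through the inverted-index helpers exercised by `test_reset_block_segment_postings` at the end of this file; it assumes `read_block_postings_from_terminfo` and `reset_block_postings_from_terminfo` keep the signatures used in those tests:

use tantivy::schema::IndexRecordOption;
use tantivy::{InvertedIndexReader, Term};

// Sums the doc frequencies of several terms while reusing a single cursor.
fn sum_doc_freqs(inv_idx: &InvertedIndexReader, terms: &[Term]) -> tantivy::Result<u64> {
    let mut total = 0u64;
    let mut cursor = None;
    for term in terms {
        let Some(term_info) = inv_idx.get_term_info(term)? else {
            continue;
        };
        match cursor.as_mut() {
            None => {
                cursor = Some(inv_idx.read_block_postings_from_terminfo(
                    &term_info,
                    IndexRecordOption::Basic,
                )?);
            }
            Some(block_postings) => {
                // Rewind the existing cursor onto the new term instead of reallocating.
                inv_idx.reset_block_postings_from_terminfo(&term_info, block_postings)?;
            }
        }
        total += u64::from(cursor.as_ref().unwrap().doc_freq());
    }
    Ok(total)
}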
|
||||
|
||||
/// Returns the overall number of documents in the block postings.
/// It does not take into account whether documents are deleted or not.
///
/// This `doc_freq` is simply the sum of the lengths of all the blocks,
/// and it does not take deleted documents into account.
|
||||
pub fn doc_freq(&self) -> u32 {
|
||||
self.doc_freq
|
||||
}
|
||||
|
||||
/// Returns the array of docs in the current block.
|
||||
///
|
||||
/// Before the first call to `.advance()`, the block
|
||||
/// returned by `.docs()` is empty.
|
||||
#[inline]
|
||||
pub fn docs(&self) -> &[DocId] {
|
||||
debug_assert!(self.block_is_loaded());
|
||||
self.doc_decoder.output_array()
|
||||
}
|
||||
|
||||
/// Return the document at index `idx` of the block.
|
||||
#[inline]
|
||||
pub fn doc(&self, idx: usize) -> u32 {
|
||||
self.doc_decoder.output(idx)
|
||||
}
|
||||
|
||||
/// Return the array of `term freq` in the block.
|
||||
#[inline]
|
||||
pub fn freqs(&self) -> &[u32] {
|
||||
debug_assert!(self.block_is_loaded());
|
||||
self.freq_decoder.output_array()
|
||||
}
|
||||
|
||||
/// Return the frequency at index `idx` of the block.
|
||||
#[inline]
|
||||
pub fn freq(&self, idx: usize) -> u32 {
|
||||
debug_assert!(self.block_is_loaded());
|
||||
self.freq_decoder.output(idx)
|
||||
}
|
||||
|
||||
/// Returns the length of the current block.
|
||||
///
|
||||
/// All blocks have a length of `NUM_DOCS_PER_BLOCK`,
|
||||
/// except the last block that may have a length
|
||||
/// of any number between 1 and `NUM_DOCS_PER_BLOCK - 1`
|
||||
#[inline]
|
||||
pub fn block_len(&self) -> usize {
|
||||
debug_assert!(self.block_is_loaded());
|
||||
self.doc_decoder.output_len
|
||||
}
|
||||
|
||||
/// Positions the cursor on a block that may contain `target_doc`.
///
/// If all docs are smaller than target, the block loaded may be empty,
/// or be the last, incomplete VInt block.
|
||||
pub fn seek(&mut self, target_doc: DocId) -> usize {
|
||||
// Move to the block that might contain our document.
|
||||
self.seek_block(target_doc);
|
||||
self.load_block();
|
||||
|
||||
// At this point we are on the block that might contain our document.
|
||||
let doc = self.doc_decoder.seek_within_block(target_doc);
|
||||
|
||||
// The last block is not full and padded with TERMINATED,
|
||||
// so we are guaranteed to have at least one value (real or padding)
|
||||
// that is >= target_doc.
|
||||
debug_assert!(doc < COMPRESSION_BLOCK_SIZE);
|
||||
|
||||
// `doc` is now the first element >= `target_doc`.
|
||||
// If all docs are smaller than target, the current block is incomplete and padded
|
||||
// with TERMINATED. After the search, the cursor points to the first TERMINATED.
|
||||
doc
|
||||
}
|
||||
|
||||
pub(crate) fn position_offset(&self) -> u64 {
|
||||
self.skip_reader.position_offset()
|
||||
}
|
||||
|
||||
/// Dangerous API! This call seeks the next block on the skip list,
|
||||
@@ -274,15 +291,19 @@ impl BlockSegmentPostings {
|
||||
/// `.load_block()` needs to be called manually afterwards.
|
||||
/// If all docs are smaller than target, the block loaded may be empty,
|
||||
/// or be the last, incomplete VInt block.
|
||||
pub(crate) fn seek_block_without_loading(&mut self, target_doc: DocId) {
|
||||
pub(crate) fn seek_block(&mut self, target_doc: DocId) {
|
||||
if self.skip_reader.seek(target_doc) {
|
||||
self.block_max_score_cache = None;
|
||||
self.block_loaded = false;
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn block_is_loaded(&self) -> bool {
|
||||
self.block_loaded
|
||||
}
|
||||
|
||||
pub(crate) fn load_block(&mut self) {
|
||||
if self.block_loaded {
|
||||
if self.block_is_loaded() {
|
||||
return;
|
||||
}
|
||||
let offset = self.skip_reader.byte_offset();
|
||||
@@ -330,39 +351,68 @@ impl BlockSegmentPostings {
|
||||
}
|
||||
self.block_loaded = true;
|
||||
}
|
||||
|
||||
/// Advance to the next block.
|
||||
pub fn advance(&mut self) {
|
||||
self.skip_reader.advance();
|
||||
self.block_loaded = false;
|
||||
self.block_max_score_cache = None;
|
||||
self.load_block();
|
||||
}
|
||||
|
||||
/// Returns an empty segment postings object
|
||||
pub fn empty() -> BlockSegmentPostings {
|
||||
BlockSegmentPostings {
|
||||
doc_decoder: BlockDecoder::with_val(TERMINATED),
|
||||
block_loaded: true,
|
||||
freq_decoder: BlockDecoder::with_val(1),
|
||||
freq_reading_option: FreqReadingOption::NoFreq,
|
||||
block_max_score_cache: None,
|
||||
doc_freq: 0,
|
||||
data: OwnedBytes::empty(),
|
||||
skip_reader: SkipReader::new(OwnedBytes::empty(), 0, IndexRecordOption::Basic),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn skip_reader(&self) -> &SkipReader {
|
||||
&self.skip_reader
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use common::OwnedBytes;
|
||||
use common::HasLen;
|
||||
|
||||
use super::BlockSegmentPostings;
|
||||
use crate::codec::standard::postings::segment_postings::SegmentPostings;
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::index::Index;
|
||||
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
|
||||
use crate::postings::serializer::PostingsSerializer;
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::postings::postings::Postings;
|
||||
use crate::postings::SegmentPostings;
|
||||
use crate::schema::{IndexRecordOption, Schema, Term, INDEXED};
|
||||
use crate::DocId;
|
||||
|
||||
#[cfg(test)]
|
||||
fn build_block_postings(docs: &[u32]) -> BlockSegmentPostings {
|
||||
let doc_freq = docs.len() as u32;
|
||||
let mut postings_serializer =
|
||||
PostingsSerializer::new(1.0f32, IndexRecordOption::Basic, None);
|
||||
postings_serializer.new_term(docs.len() as u32, false);
|
||||
for doc in docs {
|
||||
postings_serializer.write_doc(*doc, 1u32);
|
||||
}
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
postings_serializer
|
||||
.close_term(doc_freq, &mut buffer)
|
||||
.unwrap();
|
||||
BlockSegmentPostings::open(
|
||||
doc_freq,
|
||||
OwnedBytes::new(buffer),
|
||||
IndexRecordOption::Basic,
|
||||
IndexRecordOption::Basic,
|
||||
)
|
||||
.unwrap()
|
||||
#[test]
|
||||
fn test_empty_segment_postings() {
|
||||
let mut postings = SegmentPostings::empty();
|
||||
assert_eq!(postings.doc(), TERMINATED);
|
||||
assert_eq!(postings.advance(), TERMINATED);
|
||||
assert_eq!(postings.advance(), TERMINATED);
|
||||
assert_eq!(postings.doc_freq(), 0);
|
||||
assert_eq!(postings.len(), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_empty_postings_doc_returns_terminated() {
|
||||
let mut postings = SegmentPostings::empty();
|
||||
assert_eq!(postings.doc(), TERMINATED);
|
||||
assert_eq!(postings.advance(), TERMINATED);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_empty_postings_doc_term_freq_returns_0() {
|
||||
let postings = SegmentPostings::empty();
|
||||
assert_eq!(postings.term_freq(), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -377,7 +427,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_block_segment_postings() -> crate::Result<()> {
|
||||
let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
|
||||
let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>())?;
|
||||
let mut offset: u32 = 0u32;
|
||||
// checking that the `doc_freq` is correct
|
||||
assert_eq!(block_segments.doc_freq(), 100_000);
|
||||
@@ -402,7 +452,7 @@ mod tests {
|
||||
doc_ids.push(129);
|
||||
doc_ids.push(130);
|
||||
{
|
||||
let block_segments = build_block_postings(&doc_ids);
|
||||
let block_segments = build_block_postings(&doc_ids)?;
|
||||
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
|
||||
assert_eq!(docset.seek(128), 129);
|
||||
assert_eq!(docset.doc(), 129);
|
||||
@@ -411,7 +461,7 @@ mod tests {
|
||||
assert_eq!(docset.advance(), TERMINATED);
|
||||
}
|
||||
{
|
||||
let block_segments = build_block_postings(&doc_ids);
|
||||
let block_segments = build_block_postings(&doc_ids).unwrap();
|
||||
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
|
||||
assert_eq!(docset.seek(129), 129);
|
||||
assert_eq!(docset.doc(), 129);
|
||||
@@ -420,7 +470,7 @@ mod tests {
|
||||
assert_eq!(docset.advance(), TERMINATED);
|
||||
}
|
||||
{
|
||||
let block_segments = build_block_postings(&doc_ids);
|
||||
let block_segments = build_block_postings(&doc_ids)?;
|
||||
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
|
||||
assert_eq!(docset.doc(), 0);
|
||||
assert_eq!(docset.seek(131), TERMINATED);
|
||||
@@ -429,13 +479,38 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn build_block_postings(docs: &[DocId]) -> crate::Result<BlockSegmentPostings> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let int_field = schema_builder.add_u64_field("id", INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
let mut last_doc = 0u32;
|
||||
for &doc in docs {
|
||||
for _ in last_doc..doc {
|
||||
index_writer.add_document(doc!(int_field=>1u64))?;
|
||||
}
|
||||
index_writer.add_document(doc!(int_field=>0u64))?;
|
||||
last_doc = doc + 1;
|
||||
}
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let inverted_index = segment_reader.inverted_index(int_field).unwrap();
|
||||
let term = Term::from_field_u64(int_field, 0u64);
|
||||
let term_info = inverted_index.get_term_info(&term)?.unwrap();
|
||||
let block_postings = inverted_index
|
||||
.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)?;
|
||||
Ok(block_postings)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_block_segment_postings_seek() -> crate::Result<()> {
|
||||
let mut docs = Vec::new();
|
||||
let mut docs = vec![0];
|
||||
for i in 0..1300 {
|
||||
docs.push((i * i / 100) + i);
|
||||
}
|
||||
let mut block_postings = build_block_postings(&docs[..]);
|
||||
let mut block_postings = build_block_postings(&docs[..])?;
|
||||
for i in &[0, 424, 10000] {
|
||||
block_postings.seek(*i);
|
||||
let docs = block_postings.docs();
|
||||
@@ -446,4 +521,40 @@ mod tests {
|
||||
assert_eq!(block_postings.doc(COMPRESSION_BLOCK_SIZE - 1), TERMINATED);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_reset_block_segment_postings() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let int_field = schema_builder.add_u64_field("id", INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
// Create two postings lists, one containing even numbers,
|
||||
// the other containing odd numbers.
|
||||
for i in 0..6 {
|
||||
let doc = doc!(int_field=> (i % 2) as u64);
|
||||
index_writer.add_document(doc)?;
|
||||
}
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
|
||||
let mut block_segments;
|
||||
{
|
||||
let term = Term::from_field_u64(int_field, 0u64);
|
||||
let inverted_index = segment_reader.inverted_index(int_field)?;
|
||||
let term_info = inverted_index.get_term_info(&term)?.unwrap();
|
||||
block_segments = inverted_index
|
||||
.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)?;
|
||||
}
|
||||
assert_eq!(block_segments.docs(), &[0, 2, 4]);
|
||||
{
|
||||
let term = Term::from_field_u64(int_field, 1u64);
|
||||
let inverted_index = segment_reader.inverted_index(int_field)?;
|
||||
let term_info = inverted_index.get_term_info(&term)?.unwrap();
|
||||
inverted_index.reset_block_postings_from_terminfo(&term_info, &mut block_segments)?;
|
||||
}
|
||||
assert_eq!(block_segments.docs(), &[1, 3, 5]);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -22,6 +22,12 @@ pub(crate) struct JsonPostingsWriter<Rec: Recorder> {
|
||||
non_str_posting_writer: SpecializedPostingsWriter<DocIdRecorder>,
|
||||
}
|
||||
|
||||
impl<Rec: Recorder> From<JsonPostingsWriter<Rec>> for Box<dyn PostingsWriter> {
|
||||
fn from(json_postings_writer: JsonPostingsWriter<Rec>) -> Box<dyn PostingsWriter> {
|
||||
Box::new(json_postings_writer)
|
||||
}
|
||||
}
|
||||
|
||||
impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
|
||||
#[inline]
|
||||
fn subscribe(
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::postings::{DocFreq, Postings};
|
||||
use crate::postings::{Postings, SegmentPostings};
|
||||
use crate::DocId;
|
||||
|
||||
/// `LoadedPostings` is a `DocSet` and `Postings` implementation.
|
||||
@@ -25,16 +25,16 @@ impl LoadedPostings {
|
||||
/// Creates a new `LoadedPostings` from a `SegmentPostings`.
|
||||
///
|
||||
/// It will also preload positions, if positions are available in the SegmentPostings.
|
||||
pub fn load(postings: &mut Box<dyn Postings>) -> LoadedPostings {
|
||||
let num_docs: usize = u32::from(postings.doc_freq()) as usize;
|
||||
pub fn load(segment_postings: &mut SegmentPostings) -> LoadedPostings {
|
||||
let num_docs = segment_postings.doc_freq() as usize;
|
||||
let mut doc_ids = Vec::with_capacity(num_docs);
|
||||
let mut positions = Vec::with_capacity(num_docs);
|
||||
let mut position_offsets = Vec::with_capacity(num_docs);
|
||||
while postings.doc() != TERMINATED {
|
||||
while segment_postings.doc() != TERMINATED {
|
||||
position_offsets.push(positions.len() as u32);
|
||||
doc_ids.push(postings.doc());
|
||||
postings.append_positions_with_offset(0, &mut positions);
|
||||
postings.advance();
|
||||
doc_ids.push(segment_postings.doc());
|
||||
segment_postings.append_positions_with_offset(0, &mut positions);
|
||||
segment_postings.advance();
|
||||
}
|
||||
position_offsets.push(positions.len() as u32);
|
||||
LoadedPostings {
|
||||
@@ -101,14 +101,6 @@ impl Postings for LoadedPostings {
|
||||
output.push(*pos + offset);
|
||||
}
|
||||
}
|
||||
|
||||
fn has_freq(&self) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
fn doc_freq(&self) -> DocFreq {
|
||||
DocFreq::Exact(self.doc_ids.len() as u32)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -4,6 +4,7 @@ mod block_search;
|
||||
|
||||
pub(crate) use self::block_search::branchless_binary_search;
|
||||
|
||||
mod block_segment_postings;
|
||||
pub(crate) mod compression;
|
||||
mod indexing_context;
|
||||
mod json_postings_writer;
|
||||
@@ -12,24 +13,33 @@ mod per_field_postings_writer;
|
||||
mod postings;
|
||||
mod postings_writer;
|
||||
mod recorder;
|
||||
mod segment_postings;
|
||||
/// Serializer module for the inverted index
|
||||
pub mod serializer;
|
||||
pub(crate) mod skip;
|
||||
mod skip;
|
||||
mod term_info;
|
||||
|
||||
pub(crate) use loaded_postings::LoadedPostings;
|
||||
pub use postings::DocFreq;
|
||||
pub(crate) use stacker::compute_table_memory_size;
|
||||
|
||||
pub use self::block_segment_postings::BlockSegmentPostings;
|
||||
pub(crate) use self::indexing_context::IndexingContext;
|
||||
pub(crate) use self::per_field_postings_writer::PerFieldPostingsWriter;
|
||||
pub use self::postings::Postings;
|
||||
pub(crate) use self::postings_writer::{
|
||||
serialize_postings, IndexingPosition, PostingsWriter, PostingsWriterEnum,
|
||||
};
|
||||
pub(crate) use self::postings_writer::{serialize_postings, IndexingPosition, PostingsWriter};
|
||||
pub use self::segment_postings::SegmentPostings;
|
||||
pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
|
||||
pub(crate) use self::skip::{BlockInfo, SkipReader};
|
||||
pub use self::term_info::TermInfo;
|
||||
|
||||
#[expect(clippy::enum_variant_names)]
|
||||
#[derive(Debug, PartialEq, Clone, Copy, Eq)]
|
||||
pub(crate) enum FreqReadingOption {
|
||||
NoFreq,
|
||||
SkipFreq,
|
||||
ReadFreq,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) mod tests {
|
||||
use std::mem;
|
||||
@@ -37,10 +47,9 @@ pub(crate) mod tests {
|
||||
use super::{InvertedIndexSerializer, Postings};
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::index::{Index, SegmentComponent};
|
||||
use crate::index::{Index, SegmentComponent, SegmentReader};
|
||||
use crate::indexer::operation::AddOperation;
|
||||
use crate::indexer::SegmentWriter;
|
||||
use crate::postings::DocFreq;
|
||||
use crate::query::Scorer;
|
||||
use crate::schema::{
|
||||
Field, IndexRecordOption, Schema, Term, TextFieldIndexing, TextOptions, INDEXED, TEXT,
|
||||
@@ -250,13 +259,7 @@ pub(crate) mod tests {
|
||||
segment_writer.finalize()?;
|
||||
}
|
||||
{
|
||||
let segment_reader = crate::codec::Codec::open_segment_reader(
|
||||
segment.index().codec(),
|
||||
segment.index().directory(),
|
||||
segment.meta(),
|
||||
segment.schema(),
|
||||
None,
|
||||
)?;
|
||||
let segment_reader = SegmentReader::open(&segment)?;
|
||||
{
|
||||
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field)?;
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(0), 8 + 5);
|
||||
@@ -277,11 +280,11 @@ pub(crate) mod tests {
|
||||
}
|
||||
{
|
||||
let term_a = Term::from_field_text(text_field, "a");
|
||||
let mut postings_a: Box<dyn Postings> = segment_reader
|
||||
let mut postings_a = segment_reader
|
||||
.inverted_index(term_a.field())?
|
||||
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)?
|
||||
.unwrap();
|
||||
assert_eq!(postings_a.doc_freq(), DocFreq::Exact(1000));
|
||||
assert_eq!(postings_a.len(), 1000);
|
||||
assert_eq!(postings_a.doc(), 0);
|
||||
assert_eq!(postings_a.term_freq(), 6);
|
||||
postings_a.positions(&mut positions);
|
||||
@@ -304,7 +307,7 @@ pub(crate) mod tests {
|
||||
.inverted_index(term_e.field())?
|
||||
.read_postings(&term_e, IndexRecordOption::WithFreqsAndPositions)?
|
||||
.unwrap();
|
||||
assert_eq!(postings_e.doc_freq(), DocFreq::Exact(1000 - 2));
|
||||
assert_eq!(postings_e.len(), 1000 - 2);
|
||||
for i in 2u32..1000u32 {
|
||||
assert_eq!(postings_e.term_freq(), i);
|
||||
postings_e.positions(&mut positions);
|
||||
|
||||
@@ -1,15 +1,16 @@
|
||||
use crate::postings::json_postings_writer::JsonPostingsWriter;
|
||||
use crate::postings::postings_writer::{PostingsWriterEnum, SpecializedPostingsWriter};
|
||||
use crate::postings::postings_writer::SpecializedPostingsWriter;
|
||||
use crate::postings::recorder::{DocIdRecorder, TermFrequencyRecorder, TfAndPositionRecorder};
|
||||
use crate::postings::PostingsWriter;
|
||||
use crate::schema::{Field, FieldEntry, FieldType, IndexRecordOption, Schema};
|
||||
|
||||
pub(crate) struct PerFieldPostingsWriter {
|
||||
per_field_postings_writers: Vec<PostingsWriterEnum>,
|
||||
per_field_postings_writers: Vec<Box<dyn PostingsWriter>>,
|
||||
}
|
||||
|
||||
impl PerFieldPostingsWriter {
|
||||
pub fn for_schema(schema: &Schema) -> Self {
|
||||
let per_field_postings_writers: Vec<PostingsWriterEnum> = schema
|
||||
let per_field_postings_writers = schema
|
||||
.fields()
|
||||
.map(|(_, field_entry)| posting_writer_from_field_entry(field_entry))
|
||||
.collect();
|
||||
@@ -18,16 +19,16 @@ impl PerFieldPostingsWriter {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_for_field(&self, field: Field) -> &PostingsWriterEnum {
|
||||
&self.per_field_postings_writers[field.field_id() as usize]
|
||||
pub(crate) fn get_for_field(&self, field: Field) -> &dyn PostingsWriter {
|
||||
self.per_field_postings_writers[field.field_id() as usize].as_ref()
|
||||
}
|
||||
|
||||
pub(crate) fn get_for_field_mut(&mut self, field: Field) -> &mut PostingsWriterEnum {
|
||||
&mut self.per_field_postings_writers[field.field_id() as usize]
|
||||
pub(crate) fn get_for_field_mut(&mut self, field: Field) -> &mut dyn PostingsWriter {
|
||||
self.per_field_postings_writers[field.field_id() as usize].as_mut()
|
||||
}
|
||||
}
|
||||
|
||||
fn posting_writer_from_field_entry(field_entry: &FieldEntry) -> PostingsWriterEnum {
|
||||
fn posting_writer_from_field_entry(field_entry: &FieldEntry) -> Box<dyn PostingsWriter> {
|
||||
match *field_entry.field_type() {
|
||||
FieldType::Str(ref text_options) => text_options
|
||||
.get_indexing_options()
|
||||
@@ -50,7 +51,7 @@ fn posting_writer_from_field_entry(field_entry: &FieldEntry) -> PostingsWriterEn
|
||||
| FieldType::Date(_)
|
||||
| FieldType::Bytes(_)
|
||||
| FieldType::IpAddr(_)
|
||||
| FieldType::Facet(_) => <SpecializedPostingsWriter<DocIdRecorder>>::default().into(),
|
||||
| FieldType::Facet(_) => Box::<SpecializedPostingsWriter<DocIdRecorder>>::default(),
|
||||
FieldType::JsonObject(ref json_object_options) => {
|
||||
if let Some(text_indexing_option) = json_object_options.get_text_indexing_options() {
|
||||
match text_indexing_option.index_option() {
|
||||
|
||||
@@ -1,25 +1,5 @@
|
||||
use crate::docset::DocSet;
|
||||
|
||||
/// Result of the doc_freq method.
|
||||
///
|
||||
/// Postings can inform us that the document frequency is approximate.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum DocFreq {
|
||||
/// The document frequency is approximate.
|
||||
Approximate(u32),
|
||||
/// The document frequency is exact.
|
||||
Exact(u32),
|
||||
}
|
||||
|
||||
impl From<DocFreq> for u32 {
|
||||
fn from(doc_freq: DocFreq) -> Self {
|
||||
match doc_freq {
|
||||
DocFreq::Approximate(approximate_doc_freq) => approximate_doc_freq,
|
||||
DocFreq::Exact(doc_freq) => doc_freq,
|
||||
}
|
||||
}
|
||||
}
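A small sketch of consuming the `DocFreq` enum above, assuming it is in scope exactly as defined in this hunk (e.g. via a crate-internal `use crate::postings::DocFreq;`):

// Formats a document frequency, flagging approximate counts explicitly.
// When the distinction does not matter, `u32::from(doc_freq)` flattens it.
fn describe_doc_freq(doc_freq: DocFreq) -> String {
    match doc_freq {
        DocFreq::Exact(count) => format!("{count} documents"),
        DocFreq::Approximate(count) => format!("~{count} documents"),
    }
}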
|
||||
|
||||
/// Postings (also called inverted list)
|
||||
///
|
||||
/// For a given term, it is the list of doc ids of the doc
|
||||
@@ -34,9 +14,6 @@ pub trait Postings: DocSet + 'static {
|
||||
/// The number of times the term appears in the document.
|
||||
fn term_freq(&self) -> u32;
|
||||
|
||||
/// Returns the number of documents containing the term in the segment.
|
||||
fn doc_freq(&self) -> DocFreq;
|
||||
|
||||
/// Returns the positions offset by a given value.
|
||||
/// It is not necessary to clear the `output` before calling this method.
|
||||
/// The output vector will be resized to the `term_freq`.
|
||||
@@ -54,16 +31,6 @@ pub trait Postings: DocSet + 'static {
|
||||
fn positions(&mut self, output: &mut Vec<u32>) {
|
||||
self.positions_with_offset(0u32, output);
|
||||
}
|
||||
|
||||
/// Returns true if the term_frequency is available.
|
||||
///
|
||||
/// This is a tricky question, because on JSON fields, it is possible
|
||||
/// for a text term to have term freq, whereas a number term in the field has none.
|
||||
///
|
||||
/// This function returns whether the actual term has term frequencies or not.
|
||||
/// In the above JSON field example, `has_freq` should return true for the
/// former and false for the latter.
|
||||
fn has_freq(&self) -> bool;
|
||||
}
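Since `Postings` extends `DocSet`, a consumer interleaves `advance()` with the accessors above. A minimal sketch relying only on methods shown in this trait, assuming the postings were read with positions (e.g. `IndexRecordOption::WithFreqsAndPositions`):

use tantivy::postings::Postings;
use tantivy::{DocId, DocSet, TERMINATED};

// Drains a postings cursor and collects (doc id, positions) pairs.
fn collect_positions<P: Postings>(postings: &mut P) -> Vec<(DocId, Vec<u32>)> {
    let mut out = Vec::new();
    let mut positions = Vec::new();
    while postings.doc() != TERMINATED {
        postings.positions(&mut positions);
        out.push((postings.doc(), positions.clone()));
        postings.advance();
    }
    out
}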
|
||||
|
||||
impl Postings for Box<dyn Postings> {
|
||||
@@ -74,12 +41,4 @@ impl Postings for Box<dyn Postings> {
|
||||
fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
|
||||
(**self).append_positions_with_offset(offset, output);
|
||||
}
|
||||
|
||||
fn has_freq(&self) -> bool {
|
||||
(**self).has_freq()
|
||||
}
|
||||
|
||||
fn doc_freq(&self) -> DocFreq {
|
||||
(**self).doc_freq()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,10 +7,7 @@ use stacker::Addr;
|
||||
use crate::fieldnorm::FieldNormReaders;
|
||||
use crate::indexer::indexing_term::IndexingTerm;
|
||||
use crate::indexer::path_to_unordered_id::OrderedPathId;
|
||||
use crate::postings::json_postings_writer::JsonPostingsWriter;
|
||||
use crate::postings::recorder::{
|
||||
BufferLender, DocIdRecorder, Recorder, TermFrequencyRecorder, TfAndPositionRecorder,
|
||||
};
|
||||
use crate::postings::recorder::{BufferLender, Recorder};
|
||||
use crate::postings::{
|
||||
FieldSerializer, IndexingContext, InvertedIndexSerializer, PerFieldPostingsWriter,
|
||||
};
|
||||
@@ -103,141 +100,6 @@ pub(crate) struct IndexingPosition {
|
||||
pub end_position: u32,
|
||||
}
|
||||
|
||||
pub enum PostingsWriterEnum {
|
||||
DocId(SpecializedPostingsWriter<DocIdRecorder>),
|
||||
DocIdTf(SpecializedPostingsWriter<TermFrequencyRecorder>),
|
||||
DocTfAndPosition(SpecializedPostingsWriter<TfAndPositionRecorder>),
|
||||
JsonDocId(JsonPostingsWriter<DocIdRecorder>),
|
||||
JsonDocIdTf(JsonPostingsWriter<TermFrequencyRecorder>),
|
||||
JsonDocTfAndPosition(JsonPostingsWriter<TfAndPositionRecorder>),
|
||||
}
|
||||
|
||||
impl From<SpecializedPostingsWriter<DocIdRecorder>> for PostingsWriterEnum {
|
||||
fn from(doc_id_recorder_writer: SpecializedPostingsWriter<DocIdRecorder>) -> Self {
|
||||
PostingsWriterEnum::DocId(doc_id_recorder_writer)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<SpecializedPostingsWriter<TermFrequencyRecorder>> for PostingsWriterEnum {
|
||||
fn from(doc_id_tf_recorder_writer: SpecializedPostingsWriter<TermFrequencyRecorder>) -> Self {
|
||||
PostingsWriterEnum::DocIdTf(doc_id_tf_recorder_writer)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<SpecializedPostingsWriter<TfAndPositionRecorder>> for PostingsWriterEnum {
|
||||
fn from(
|
||||
doc_id_tf_and_positions_recorder_writer: SpecializedPostingsWriter<TfAndPositionRecorder>,
|
||||
) -> Self {
|
||||
PostingsWriterEnum::DocTfAndPosition(doc_id_tf_and_positions_recorder_writer)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<JsonPostingsWriter<DocIdRecorder>> for PostingsWriterEnum {
|
||||
fn from(doc_id_recorder_writer: JsonPostingsWriter<DocIdRecorder>) -> Self {
|
||||
PostingsWriterEnum::JsonDocId(doc_id_recorder_writer)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<JsonPostingsWriter<TermFrequencyRecorder>> for PostingsWriterEnum {
|
||||
fn from(doc_id_tf_recorder_writer: JsonPostingsWriter<TermFrequencyRecorder>) -> Self {
|
||||
PostingsWriterEnum::JsonDocIdTf(doc_id_tf_recorder_writer)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<JsonPostingsWriter<TfAndPositionRecorder>> for PostingsWriterEnum {
|
||||
fn from(
|
||||
doc_id_tf_and_positions_recorder_writer: JsonPostingsWriter<TfAndPositionRecorder>,
|
||||
) -> Self {
|
||||
PostingsWriterEnum::JsonDocTfAndPosition(doc_id_tf_and_positions_recorder_writer)
|
||||
}
|
||||
}
|
||||
|
||||
impl PostingsWriter for PostingsWriterEnum {
|
||||
fn subscribe(&mut self, doc: DocId, pos: u32, term: &IndexingTerm, ctx: &mut IndexingContext) {
|
||||
match self {
|
||||
PostingsWriterEnum::DocId(writer) => writer.subscribe(doc, pos, term, ctx),
|
||||
PostingsWriterEnum::DocIdTf(writer) => writer.subscribe(doc, pos, term, ctx),
|
||||
PostingsWriterEnum::DocTfAndPosition(writer) => writer.subscribe(doc, pos, term, ctx),
|
||||
PostingsWriterEnum::JsonDocId(writer) => writer.subscribe(doc, pos, term, ctx),
|
||||
PostingsWriterEnum::JsonDocIdTf(writer) => writer.subscribe(doc, pos, term, ctx),
|
||||
PostingsWriterEnum::JsonDocTfAndPosition(writer) => {
|
||||
writer.subscribe(doc, pos, term, ctx)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn serialize(
|
||||
&self,
|
||||
term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
|
||||
ordered_id_to_path: &[&str],
|
||||
ctx: &IndexingContext,
|
||||
serializer: &mut FieldSerializer,
|
||||
) -> io::Result<()> {
|
||||
match self {
|
||||
PostingsWriterEnum::DocId(writer) => {
|
||||
writer.serialize(term_addrs, ordered_id_to_path, ctx, serializer)
|
||||
}
|
||||
PostingsWriterEnum::DocIdTf(writer) => {
|
||||
writer.serialize(term_addrs, ordered_id_to_path, ctx, serializer)
|
||||
}
|
||||
PostingsWriterEnum::DocTfAndPosition(writer) => {
|
||||
writer.serialize(term_addrs, ordered_id_to_path, ctx, serializer)
|
||||
}
|
||||
PostingsWriterEnum::JsonDocId(writer) => {
|
||||
writer.serialize(term_addrs, ordered_id_to_path, ctx, serializer)
|
||||
}
|
||||
PostingsWriterEnum::JsonDocIdTf(writer) => {
|
||||
writer.serialize(term_addrs, ordered_id_to_path, ctx, serializer)
|
||||
}
|
||||
PostingsWriterEnum::JsonDocTfAndPosition(writer) => {
|
||||
writer.serialize(term_addrs, ordered_id_to_path, ctx, serializer)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Tokenize a text and subscribe all of its tokens.
|
||||
fn index_text(
|
||||
&mut self,
|
||||
doc_id: DocId,
|
||||
token_stream: &mut dyn TokenStream,
|
||||
term_buffer: &mut IndexingTerm,
|
||||
ctx: &mut IndexingContext,
|
||||
indexing_position: &mut IndexingPosition,
|
||||
) {
|
||||
match self {
|
||||
PostingsWriterEnum::DocId(writer) => {
|
||||
writer.index_text(doc_id, token_stream, term_buffer, ctx, indexing_position)
|
||||
}
|
||||
PostingsWriterEnum::DocIdTf(writer) => {
|
||||
writer.index_text(doc_id, token_stream, term_buffer, ctx, indexing_position)
|
||||
}
|
||||
PostingsWriterEnum::DocTfAndPosition(writer) => {
|
||||
writer.index_text(doc_id, token_stream, term_buffer, ctx, indexing_position)
|
||||
}
|
||||
PostingsWriterEnum::JsonDocId(writer) => {
|
||||
writer.index_text(doc_id, token_stream, term_buffer, ctx, indexing_position)
|
||||
}
|
||||
PostingsWriterEnum::JsonDocIdTf(writer) => {
|
||||
writer.index_text(doc_id, token_stream, term_buffer, ctx, indexing_position)
|
||||
}
|
||||
PostingsWriterEnum::JsonDocTfAndPosition(writer) => {
|
||||
writer.index_text(doc_id, token_stream, term_buffer, ctx, indexing_position)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn total_num_tokens(&self) -> u64 {
|
||||
match self {
|
||||
PostingsWriterEnum::DocId(writer) => writer.total_num_tokens(),
|
||||
PostingsWriterEnum::DocIdTf(writer) => writer.total_num_tokens(),
|
||||
PostingsWriterEnum::DocTfAndPosition(writer) => writer.total_num_tokens(),
|
||||
PostingsWriterEnum::JsonDocId(writer) => writer.total_num_tokens(),
|
||||
PostingsWriterEnum::JsonDocIdTf(writer) => writer.total_num_tokens(),
|
||||
PostingsWriterEnum::JsonDocTfAndPosition(writer) => writer.total_num_tokens(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The `PostingsWriter` is in charge of receiving documents
|
||||
/// and building a `Segment` in anonymous memory.
|
||||
///
|
||||
@@ -309,6 +171,14 @@ pub(crate) struct SpecializedPostingsWriter<Rec: Recorder> {
|
||||
_recorder_type: PhantomData<Rec>,
|
||||
}
|
||||
|
||||
impl<Rec: Recorder> From<SpecializedPostingsWriter<Rec>> for Box<dyn PostingsWriter> {
|
||||
fn from(
|
||||
specialized_postings_writer: SpecializedPostingsWriter<Rec>,
|
||||
) -> Box<dyn PostingsWriter> {
|
||||
Box::new(specialized_postings_writer)
|
||||
}
|
||||
}
|
||||
|
||||
impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
|
||||
#[inline]
|
||||
pub(crate) fn serialize_one_term(
|
||||
|
||||
@@ -70,7 +70,7 @@ pub(crate) trait Recorder: Copy + Default + Send + Sync + 'static {
|
||||
fn serialize(
|
||||
&self,
|
||||
arena: &MemoryArena,
|
||||
serializer: &mut FieldSerializer,
|
||||
serializer: &mut FieldSerializer<'_>,
|
||||
buffer_lender: &mut BufferLender,
|
||||
);
|
||||
/// Returns the number of document containing this term.
|
||||
@@ -113,7 +113,7 @@ impl Recorder for DocIdRecorder {
|
||||
fn serialize(
|
||||
&self,
|
||||
arena: &MemoryArena,
|
||||
serializer: &mut FieldSerializer,
|
||||
serializer: &mut FieldSerializer<'_>,
|
||||
buffer_lender: &mut BufferLender,
|
||||
) {
|
||||
let buffer = buffer_lender.lend_u8();
|
||||
@@ -181,7 +181,7 @@ impl Recorder for TermFrequencyRecorder {
|
||||
fn serialize(
|
||||
&self,
|
||||
arena: &MemoryArena,
|
||||
serializer: &mut FieldSerializer,
|
||||
serializer: &mut FieldSerializer<'_>,
|
||||
buffer_lender: &mut BufferLender,
|
||||
) {
|
||||
let buffer = buffer_lender.lend_u8();
|
||||
@@ -238,7 +238,7 @@ impl Recorder for TfAndPositionRecorder {
|
||||
fn serialize(
|
||||
&self,
|
||||
arena: &MemoryArena,
|
||||
serializer: &mut FieldSerializer,
|
||||
serializer: &mut FieldSerializer<'_>,
|
||||
buffer_lender: &mut BufferLender,
|
||||
) {
|
||||
let (buffer_u8, buffer_positions) = buffer_lender.lend_all();
|
||||
|
||||
@@ -1,14 +1,11 @@
|
||||
use common::BitSet;
|
||||
use common::HasLen;
|
||||
|
||||
use super::BlockSegmentPostings;
|
||||
use crate::codec::postings::PostingsWithBlockMax;
|
||||
use crate::docset::DocSet;
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::fastfield::AliveBitSet;
|
||||
use crate::positions::PositionReader;
|
||||
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
|
||||
use crate::postings::{DocFreq, Postings};
|
||||
use crate::query::Bm25Weight;
|
||||
use crate::{DocId, Score};
|
||||
use crate::postings::{BlockSegmentPostings, Postings};
|
||||
use crate::{DocId, TERMINATED};
|
||||
|
||||
/// `SegmentPostings` represents the inverted list or postings associated with
|
||||
/// a term in a `Segment`.
|
||||
@@ -32,6 +29,31 @@ impl SegmentPostings {
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute the number of non-deleted documents.
|
||||
///
|
||||
/// This method will clone and scan through the posting lists.
|
||||
/// (this is a rather expensive operation).
|
||||
pub fn doc_freq_given_deletes(&self, alive_bitset: &AliveBitSet) -> u32 {
|
||||
let mut docset = self.clone();
|
||||
let mut doc_freq = 0;
|
||||
loop {
|
||||
let doc = docset.doc();
|
||||
if doc == TERMINATED {
|
||||
return doc_freq;
|
||||
}
|
||||
if alive_bitset.is_alive(doc) {
|
||||
doc_freq += 1u32;
|
||||
}
|
||||
docset.advance();
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the overall number of documents in the block postings.
|
||||
/// It does not take in account whether documents are deleted or not.
|
||||
pub fn doc_freq(&self) -> u32 {
|
||||
self.block_cursor.doc_freq()
|
||||
}
|
||||
|
||||
/// Creates a segment postings object with the given documents
|
||||
/// and no frequency encoded.
|
||||
///
|
||||
@@ -42,13 +64,11 @@ impl SegmentPostings {
|
||||
/// buffer with the serialized data.
|
||||
#[cfg(test)]
|
||||
pub fn create_from_docs(docs: &[u32]) -> SegmentPostings {
|
||||
use common::OwnedBytes;
|
||||
|
||||
use crate::directory::FileSlice;
|
||||
use crate::postings::serializer::PostingsSerializer;
|
||||
use crate::schema::IndexRecordOption;
|
||||
let mut buffer = Vec::new();
|
||||
{
|
||||
use crate::postings::serializer::PostingsSerializer;
|
||||
|
||||
let mut postings_serializer =
|
||||
PostingsSerializer::new(0.0, IndexRecordOption::Basic, None);
|
||||
postings_serializer.new_term(docs.len() as u32, false);
|
||||
@@ -61,7 +81,7 @@ impl SegmentPostings {
|
||||
}
|
||||
let block_segment_postings = BlockSegmentPostings::open(
|
||||
docs.len() as u32,
|
||||
OwnedBytes::new(buffer),
|
||||
FileSlice::from(buffer),
|
||||
IndexRecordOption::Basic,
|
||||
IndexRecordOption::Basic,
|
||||
)
|
||||
@@ -75,8 +95,7 @@ impl SegmentPostings {
|
||||
doc_and_tfs: &[(u32, u32)],
|
||||
fieldnorms: Option<&[u32]>,
|
||||
) -> SegmentPostings {
|
||||
use common::OwnedBytes;
|
||||
|
||||
use crate::directory::FileSlice;
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::postings::serializer::PostingsSerializer;
|
||||
use crate::schema::IndexRecordOption;
|
||||
@@ -109,7 +128,7 @@ impl SegmentPostings {
|
||||
.unwrap();
|
||||
let block_segment_postings = BlockSegmentPostings::open(
|
||||
doc_and_tfs.len() as u32,
|
||||
OwnedBytes::new(buffer),
|
||||
FileSlice::from(buffer),
|
||||
IndexRecordOption::WithFreqs,
|
||||
IndexRecordOption::WithFreqs,
|
||||
)
|
||||
@@ -139,6 +158,7 @@ impl DocSet for SegmentPostings {
|
||||
// next needs to be called a first time to point to the correct element.
|
||||
#[inline]
|
||||
fn advance(&mut self) -> DocId {
|
||||
debug_assert!(self.block_cursor.block_is_loaded());
|
||||
if self.cur == COMPRESSION_BLOCK_SIZE - 1 {
|
||||
self.cur = 0;
|
||||
self.block_cursor.advance();
|
||||
@@ -177,31 +197,13 @@ impl DocSet for SegmentPostings {
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.doc_freq().into()
|
||||
self.len() as u32
|
||||
}
|
||||
}
|
||||
|
||||
fn fill_bitset(&mut self, bitset: &mut BitSet) {
|
||||
let bitset_max_value: DocId = bitset.max_value();
|
||||
loop {
|
||||
let docs = self.block_cursor.docs();
|
||||
let Some(&last_doc) = docs.last() else {
|
||||
break;
|
||||
};
|
||||
if last_doc < bitset_max_value {
|
||||
// All docs are within the range of the bitset
|
||||
for &doc in docs {
|
||||
bitset.insert(doc);
|
||||
}
|
||||
} else {
|
||||
for &doc in docs {
|
||||
if doc < bitset_max_value {
|
||||
bitset.insert(doc);
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
self.block_cursor.advance();
|
||||
}
|
||||
impl HasLen for SegmentPostings {
|
||||
fn len(&self) -> usize {
|
||||
self.block_cursor.doc_freq() as usize
|
||||
}
|
||||
}
|
||||
|
||||
@@ -227,13 +229,6 @@ impl Postings for SegmentPostings {
|
||||
self.block_cursor.freq(self.cur)
|
||||
}
|
||||
|
||||
/// Returns the overall number of documents in the block postings.
|
||||
/// It does not take into account whether documents are deleted or not.
|
||||
#[inline(always)]
|
||||
fn doc_freq(&self) -> DocFreq {
|
||||
DocFreq::Exact(self.block_cursor.doc_freq())
|
||||
}
|
||||
|
||||
fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
|
||||
let term_freq = self.term_freq();
|
||||
let prev_len = output.len();
|
||||
@@ -257,44 +252,24 @@ impl Postings for SegmentPostings {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn has_freq(&self) -> bool {
|
||||
!self.block_cursor.freqs().is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
impl PostingsWithBlockMax for SegmentPostings {
|
||||
#[inline]
|
||||
fn seek_block_max(
|
||||
&mut self,
|
||||
target_doc: crate::DocId,
|
||||
fieldnorm_reader: &FieldNormReader,
|
||||
similarity_weight: &Bm25Weight,
|
||||
) -> Score {
|
||||
self.block_cursor.seek_block_without_loading(target_doc);
|
||||
self.block_cursor
|
||||
.block_max_score(fieldnorm_reader, similarity_weight)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn last_doc_in_block(&self) -> crate::DocId {
|
||||
self.block_cursor.skip_reader().last_doc_in_block()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use common::HasLen;
|
||||
|
||||
use super::SegmentPostings;
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::postings::Postings;
|
||||
use crate::fastfield::AliveBitSet;
|
||||
use crate::postings::postings::Postings;
|
||||
|
||||
#[test]
|
||||
fn test_empty_segment_postings() {
|
||||
let mut postings = SegmentPostings::empty();
|
||||
assert_eq!(postings.doc(), TERMINATED);
|
||||
assert_eq!(postings.advance(), TERMINATED);
|
||||
assert_eq!(postings.advance(), TERMINATED);
|
||||
assert_eq!(postings.doc_freq(), crate::postings::DocFreq::Exact(0));
|
||||
assert_eq!(postings.len(), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -309,4 +284,15 @@ mod tests {
|
||||
let postings = SegmentPostings::empty();
|
||||
assert_eq!(postings.term_freq(), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_doc_freq() {
|
||||
let docs = SegmentPostings::create_from_docs(&[0, 2, 10]);
|
||||
assert_eq!(docs.doc_freq(), 3);
|
||||
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[2], 12);
|
||||
assert_eq!(docs.doc_freq_given_deletes(&alive_bitset), 2);
|
||||
let all_deleted =
|
||||
AliveBitSet::for_test_from_deleted_docs(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 12);
|
||||
assert_eq!(docs.doc_freq_given_deletes(&all_deleted), 0);
|
||||
}
|
||||
}
|
||||
@@ -8,7 +8,7 @@ use crate::directory::{CompositeWrite, WritePtr};
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::index::Segment;
|
||||
use crate::positions::PositionSerializer;
|
||||
use crate::postings::compression::{BlockEncoder, VIntEncoder as _, COMPRESSION_BLOCK_SIZE};
|
||||
use crate::postings::compression::{BlockEncoder, VIntEncoder, COMPRESSION_BLOCK_SIZE};
|
||||
use crate::postings::skip::SkipSerializer;
|
||||
use crate::query::Bm25Weight;
|
||||
use crate::schema::{Field, FieldEntry, IndexRecordOption, Schema};
|
||||
@@ -55,9 +55,7 @@ pub struct InvertedIndexSerializer {
|
||||
|
||||
impl InvertedIndexSerializer {
|
||||
/// Open a new `InvertedIndexSerializer` for the given segment
|
||||
pub fn open<C: crate::codec::Codec>(
|
||||
segment: &mut Segment<C>,
|
||||
) -> crate::Result<InvertedIndexSerializer> {
|
||||
pub fn open(segment: &mut Segment) -> crate::Result<InvertedIndexSerializer> {
|
||||
use crate::index::SegmentComponent::{Positions, Postings, Terms};
|
||||
let inv_index_serializer = InvertedIndexSerializer {
|
||||
terms_write: CompositeWrite::wrap(segment.open_write(Terms)?),
|
||||
|
||||
@@ -146,6 +146,23 @@ impl SkipReader {
|
||||
skip_reader
|
||||
}
|
||||
|
||||
pub fn reset(&mut self, data: OwnedBytes, doc_freq: u32) {
|
||||
self.last_doc_in_block = if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 {
|
||||
0
|
||||
} else {
|
||||
TERMINATED
|
||||
};
|
||||
self.last_doc_in_previous_block = 0u32;
|
||||
self.owned_read = data;
|
||||
self.block_info = BlockInfo::VInt { num_docs: doc_freq };
|
||||
self.byte_offset = 0;
|
||||
self.remaining_docs = doc_freq;
|
||||
self.position_offset = 0u64;
|
||||
if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 {
|
||||
self.read_block_info();
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the block max score for this block if available.
|
||||
//
|
||||
// The block max score is available for all full bitpacked blocks,
|
||||
|
||||
@@ -2,7 +2,7 @@ use crate::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN, TERMINATED};
|
||||
use crate::index::SegmentReader;
|
||||
use crate::query::boost_query::BoostScorer;
|
||||
use crate::query::explanation::does_not_match;
|
||||
use crate::query::{box_scorer, EnableScoring, Explanation, Query, Scorer, Weight};
|
||||
use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight};
|
||||
use crate::{DocId, Score};
|
||||
|
||||
/// Query that matches all of the documents.
|
||||
@@ -21,16 +21,16 @@ impl Query for AllQuery {
|
||||
pub struct AllWeight;
|
||||
|
||||
impl Weight for AllWeight {
|
||||
fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
|
||||
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
|
||||
let all_scorer = AllScorer::new(reader.max_doc());
|
||||
if boost != 1.0 {
|
||||
Ok(box_scorer(BoostScorer::new(all_scorer, boost)))
|
||||
Ok(Box::new(BoostScorer::new(all_scorer, boost)))
|
||||
} else {
|
||||
Ok(box_scorer(all_scorer))
|
||||
Ok(Box::new(all_scorer))
|
||||
}
|
||||
}
|
||||
|
||||
fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result<Explanation> {
|
||||
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
|
||||
if doc >= reader.max_doc() {
|
||||
return Err(does_not_match(doc));
|
||||
}
|
||||
|
||||
@@ -10,7 +10,7 @@ use crate::postings::TermInfo;
 use crate::query::{BitSetDocSet, ConstScorer, Explanation, Scorer, Weight};
 use crate::schema::{Field, IndexRecordOption};
 use crate::termdict::{TermDictionary, TermStreamer};
-use crate::{DocId, DocSet, Score, TantivyError};
+use crate::{DocId, Score, TantivyError};
 
 /// A weight struct for Fuzzy Term and Regex Queries
 pub struct AutomatonWeight<A> {
@@ -67,7 +67,7 @@ where
     }
 
     /// Returns the term infos that match the automaton
-    pub fn get_match_term_infos(&self, reader: &dyn SegmentReader) -> crate::Result<Vec<TermInfo>> {
+    pub fn get_match_term_infos(&self, reader: &SegmentReader) -> crate::Result<Vec<TermInfo>> {
         let inverted_index = reader.inverted_index(self.field)?;
         let term_dict = inverted_index.terms();
         let mut term_stream = self.automaton_stream(term_dict)?;
@@ -84,7 +84,7 @@ where
     A: Automaton + Send + Sync + 'static,
     A::State: Clone,
 {
-    fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
+    fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
         let max_doc = reader.max_doc();
         let mut doc_bitset = BitSet::with_max_value(max_doc);
         let inverted_index = reader.inverted_index(self.field)?;
@@ -92,18 +92,25 @@ where
         let mut term_stream = self.automaton_stream(term_dict)?;
         while term_stream.advance() {
             let term_info = term_stream.value();
-            inverted_index.fill_bitset_for_term(
-                term_info,
-                IndexRecordOption::Basic,
-                &mut doc_bitset,
-            )?;
+            let mut block_segment_postings = inverted_index
+                .read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic)?;
+            loop {
+                let docs = block_segment_postings.docs();
+                if docs.is_empty() {
+                    break;
+                }
+                for &doc in docs {
+                    doc_bitset.insert(doc);
+                }
+                block_segment_postings.advance();
+            }
         }
         let doc_bitset = BitSetDocSet::from(doc_bitset);
         let const_scorer = ConstScorer::new(doc_bitset, boost);
         Ok(Box::new(const_scorer))
     }
 
-    fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result<Explanation> {
+    fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
         let mut scorer = self.scorer(reader, 1.0)?;
         if scorer.seek(doc) == doc {
             Ok(Explanation::new("AutomatonScorer", 1.0))

@@ -24,13 +24,6 @@ impl BitSetDocSet {
         self.cursor_bucket = bucket_addr;
         self.cursor_tinybitset = self.docs.tinyset(bucket_addr);
     }
-
-    /// Returns the number of documents in the bitset.
-    ///
-    /// This call is not free: it will bitcount the number of bits in the bitset.
-    pub fn doc_freq(&self) -> u32 {
-        self.docs.len() as u32
-    }
 }
 
 impl From<BitSet> for BitSetDocSet {

@@ -1,6 +1,5 @@
 use std::ops::{Deref, DerefMut};
 
-use crate::codec::postings::PostingsWithBlockMax;
 use crate::query::term_query::TermScorer;
 use crate::query::Scorer;
 use crate::{DocId, DocSet, Score, TERMINATED};
@@ -14,8 +13,8 @@ use crate::{DocId, DocSet, Score, TERMINATED};
 /// We always have `before_pivot_len` < `pivot_len`.
 ///
 /// `None` is returned if we establish that no document can exceed the threshold.
-fn find_pivot_doc<TPostings: PostingsWithBlockMax>(
-    term_scorers: &[TermScorerWithMaxScore<TPostings>],
+fn find_pivot_doc(
+    term_scorers: &[TermScorerWithMaxScore],
     threshold: Score,
 ) -> Option<(usize, usize, DocId)> {
     let mut max_score = 0.0;
@@ -47,8 +46,8 @@ fn find_pivot_doc<TPostings: PostingsWithBlockMax>(
 /// the next doc candidate defined by the min of `last_doc_in_block + 1` for
 /// scorer in scorers[..pivot_len] and `scorer.doc()` for scorer in scorers[pivot_len..].
 /// Note: before and after calling this method, scorers need to be sorted by their `.doc()`.
-fn block_max_was_too_low_advance_one_scorer<TPostings: PostingsWithBlockMax>(
-    scorers: &mut [TermScorerWithMaxScore<TPostings>],
+fn block_max_was_too_low_advance_one_scorer(
+    scorers: &mut [TermScorerWithMaxScore],
     pivot_len: usize,
 ) {
     debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
@@ -83,10 +82,7 @@ fn block_max_was_too_low_advance_one_scorer<TPostings: PostingsWithBlockMax>(
 // Given a list of term_scorers and a `ord` and assuming that `term_scorers[ord]` is sorted
 // except term_scorers[ord] that might be in advance compared to its ranks,
 // bubble up term_scorers[ord] in order to restore the ordering.
-fn restore_ordering<TPostings: PostingsWithBlockMax>(
-    term_scorers: &mut [TermScorerWithMaxScore<TPostings>],
-    ord: usize,
-) {
+fn restore_ordering(term_scorers: &mut [TermScorerWithMaxScore], ord: usize) {
     let doc = term_scorers[ord].doc();
     for i in ord + 1..term_scorers.len() {
         if term_scorers[i].doc() >= doc {
@@ -101,10 +97,9 @@ fn restore_ordering<TPostings: PostingsWithBlockMax>(
 // If this works, return true.
 // If this fails (ie: one of the term_scorer does not contain `pivot_doc` and seek goes past the
 // pivot), reorder the term_scorers to ensure the list is still sorted and returns `false`.
-// If a term_scorer reach TERMINATED in the process return false remove the term_scorer and
-// return.
-fn align_scorers<TPostings: PostingsWithBlockMax>(
-    term_scorers: &mut Vec<TermScorerWithMaxScore<TPostings>>,
+// If a term_scorer reach TERMINATED in the process return false remove the term_scorer and return.
+fn align_scorers(
+    term_scorers: &mut Vec<TermScorerWithMaxScore>,
     pivot_doc: DocId,
     before_pivot_len: usize,
 ) -> bool {
@@ -131,10 +126,7 @@ fn align_scorers<TPostings: PostingsWithBlockMax>(
 // Assumes terms_scorers[..pivot_len] are positioned on the same doc (pivot_doc).
 // Advance term_scorers[..pivot_len] and out of these removes the terminated scores.
 // Restores the ordering of term_scorers.
-fn advance_all_scorers_on_pivot<TPostings: PostingsWithBlockMax>(
-    term_scorers: &mut Vec<TermScorerWithMaxScore<TPostings>>,
-    pivot_len: usize,
-) {
+fn advance_all_scorers_on_pivot(term_scorers: &mut Vec<TermScorerWithMaxScore>, pivot_len: usize) {
     for term_scorer in &mut term_scorers[..pivot_len] {
         term_scorer.advance();
     }
@@ -153,12 +145,12 @@ fn advance_all_scorers_on_pivot<TPostings: PostingsWithBlockMax>(
 /// Implements the WAND (Weak AND) algorithm for dynamic pruning
 /// described in the paper "Faster Top-k Document Retrieval Using Block-Max Indexes".
 /// Link: <http://engineering.nyu.edu/~suel/papers/bmw.pdf>
-pub fn block_wand<TPostings: PostingsWithBlockMax>(
-    mut scorers: Vec<TermScorer<TPostings>>,
+pub fn block_wand(
+    mut scorers: Vec<TermScorer>,
     mut threshold: Score,
     callback: &mut dyn FnMut(u32, Score) -> Score,
 ) {
-    let mut scorers: Vec<TermScorerWithMaxScore<TPostings>> = scorers
+    let mut scorers: Vec<TermScorerWithMaxScore> = scorers
         .iter_mut()
         .map(TermScorerWithMaxScore::from)
         .collect();
@@ -174,7 +166,10 @@ pub fn block_wand<TPostings: PostingsWithBlockMax>(
 
         let block_max_score_upperbound: Score = scorers[..pivot_len]
             .iter_mut()
-            .map(|scorer| scorer.seek_block_max(pivot_doc))
+            .map(|scorer| {
+                scorer.seek_block(pivot_doc);
+                scorer.block_max_score()
+            })
             .sum();
 
         // Beware after shallow advance, skip readers can be in advance compared to
@@ -225,22 +220,21 @@ pub fn block_wand<TPostings: PostingsWithBlockMax>(
 /// - On a block, advance until the end and execute `callback` when the doc score is greater or
 ///   equal to the `threshold`.
 pub fn block_wand_single_scorer(
-    mut scorer: TermScorer<impl PostingsWithBlockMax>,
+    mut scorer: TermScorer,
     mut threshold: Score,
     callback: &mut dyn FnMut(u32, Score) -> Score,
 ) {
     let mut doc = scorer.doc();
-    let mut block_max_score = scorer.seek_block_max(doc);
     loop {
         // We position the scorer on a block that can reach
         // the threshold.
-        while block_max_score < threshold {
+        while scorer.block_max_score() < threshold {
             let last_doc_in_block = scorer.last_doc_in_block();
             if last_doc_in_block == TERMINATED {
                 return;
             }
             doc = last_doc_in_block + 1;
-            block_max_score = scorer.seek_block_max(doc);
+            scorer.seek_block(doc);
         }
         // Seek will effectively load that block.
         doc = scorer.seek(doc);
@@ -262,33 +256,31 @@ pub fn block_wand_single_scorer(
             }
         }
         doc += 1;
-        block_max_score = scorer.seek_block_max(doc);
+        scorer.seek_block(doc);
     }
 }
 
-struct TermScorerWithMaxScore<'a, TPostings: PostingsWithBlockMax> {
-    scorer: &'a mut TermScorer<TPostings>,
+struct TermScorerWithMaxScore<'a> {
+    scorer: &'a mut TermScorer,
     max_score: Score,
 }
 
-impl<'a, TPostings: PostingsWithBlockMax> From<&'a mut TermScorer<TPostings>>
-    for TermScorerWithMaxScore<'a, TPostings>
-{
-    fn from(scorer: &'a mut TermScorer<TPostings>) -> Self {
+impl<'a> From<&'a mut TermScorer> for TermScorerWithMaxScore<'a> {
+    fn from(scorer: &'a mut TermScorer) -> Self {
        let max_score = scorer.max_score();
        TermScorerWithMaxScore { scorer, max_score }
     }
 }
 
-impl<TPostings: PostingsWithBlockMax> Deref for TermScorerWithMaxScore<'_, TPostings> {
-    type Target = TermScorer<TPostings>;
+impl Deref for TermScorerWithMaxScore<'_> {
+    type Target = TermScorer;
 
     fn deref(&self) -> &Self::Target {
         self.scorer
     }
 }
 
-impl<TPostings: PostingsWithBlockMax> DerefMut for TermScorerWithMaxScore<'_, TPostings> {
+impl DerefMut for TermScorerWithMaxScore<'_> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        self.scorer
    }

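As an illustration of what `find_pivot_doc` computes in the block-max WAND code above, here is a minimal, self-contained Rust sketch of the pivot-selection step only. It is not part of the diff and not tantivy's API: `PostingCursor` and `find_pivot` are hypothetical stand-ins, and the tie-breaking and threshold comparison are assumptions.

#[derive(Debug)]
struct PostingCursor {
    doc: u32,       // current doc id of this posting list (hypothetical stand-in)
    max_score: f32, // global max-score contribution of this term
}

/// Given cursors sorted by their current `doc`, return
/// `(before_pivot_len, pivot_len, pivot_doc)`: the first doc whose summed
/// per-term upper bounds could exceed `threshold`, or `None` if no doc can.
fn find_pivot(cursors: &[PostingCursor], threshold: f32) -> Option<(usize, usize, u32)> {
    let mut upper_bound = 0.0f32;
    let mut before_pivot_len = 0;
    // Accumulate per-term upper bounds until they could beat the threshold.
    while before_pivot_len < cursors.len() {
        upper_bound += cursors[before_pivot_len].max_score;
        if upper_bound > threshold {
            break;
        }
        before_pivot_len += 1;
    }
    if before_pivot_len == cursors.len() {
        return None; // even all terms together cannot beat the threshold
    }
    let pivot_doc = cursors[before_pivot_len].doc;
    // Extend the pivot group to every cursor already positioned on pivot_doc.
    let mut pivot_len = before_pivot_len + 1;
    while pivot_len < cursors.len() && cursors[pivot_len].doc == pivot_doc {
        pivot_len += 1;
    }
    Some((before_pivot_len, pivot_len, pivot_doc))
}

fn main() {
    // Cursors must be sorted by their current doc.
    let cursors = vec![
        PostingCursor { doc: 3, max_score: 0.4 },
        PostingCursor { doc: 7, max_score: 0.9 },
        PostingCursor { doc: 7, max_score: 0.2 },
    ];
    // 0.4 alone cannot beat 1.0, so the pivot is doc 7, spanning cursors 1..3.
    assert_eq!(find_pivot(&cursors, 1.0), Some((1, 3, 7)));
}

The invariant noted in the doc comment above (`before_pivot_len < pivot_len`) holds by construction, since the pivot group always contains at least the cursor that pushed the upper bound past the threshold.
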
@@ -1,19 +1,24 @@
 use std::collections::HashMap;
 
-use crate::codec::SumOrDoNothingCombiner;
 use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
 use crate::index::SegmentReader;
+use crate::postings::FreqReadingOption;
 use crate::query::disjunction::Disjunction;
 use crate::query::explanation::does_not_match;
 use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
 use crate::query::term_query::TermScorer;
-use crate::query::weight::for_each_docset_buffered;
+use crate::query::weight::{for_each_docset_buffered, for_each_pruning_scorer, for_each_scorer};
 use crate::query::{
-    box_scorer, intersect_scorers, AllScorer, BufferedUnionScorer, EmptyScorer, Exclude,
-    Explanation, Occur, RequiredOptionalScorer, Scorer, SumCombiner, Weight,
+    intersect_scorers, AllScorer, BufferedUnionScorer, EmptyScorer, Exclude, Explanation, Occur,
+    RequiredOptionalScorer, Scorer, Weight,
 };
 use crate::{DocId, Score};
 
+enum SpecializedScorer {
+    TermUnion(Vec<TermScorer>),
+    Other(Box<dyn Scorer>),
+}
 
 fn scorer_disjunction<TScoreCombiner>(
     scorers: Vec<Box<dyn Scorer>>,
     score_combiner: TScoreCombiner,
@@ -27,7 +32,7 @@ where
     if scorers.len() == 1 {
         return scorers.into_iter().next().unwrap(); // Safe unwrap.
     }
-    box_scorer(Disjunction::new(
+    Box::new(Disjunction::new(
         scorers,
         score_combiner,
         minimum_match_required,
@@ -39,39 +44,57 @@ fn scorer_union<TScoreCombiner>(
     scorers: Vec<Box<dyn Scorer>>,
     score_combiner_fn: impl Fn() -> TScoreCombiner,
     num_docs: u32,
-    reader: &dyn SegmentReader,
-) -> Box<dyn Scorer>
+) -> SpecializedScorer
 where
     TScoreCombiner: ScoreCombiner,
 {
-    match scorers.len() {
-        0 => box_scorer(EmptyScorer),
-        1 => scorers.into_iter().next().unwrap(),
-        _ => {
-            let combiner_opt: Option<SumOrDoNothingCombiner> = if std::any::TypeId::of::<
-                TScoreCombiner,
-            >() == std::any::TypeId::of::<
-                SumCombiner,
-            >() {
-                Some(SumOrDoNothingCombiner::Sum)
-            } else if std::any::TypeId::of::<TScoreCombiner>()
-                == std::any::TypeId::of::<DoNothingCombiner>()
+    assert!(!scorers.is_empty());
+    if scorers.len() == 1 {
+        return SpecializedScorer::Other(scorers.into_iter().next().unwrap()); //< we checked the size beforehand
+    }
 
     {
+        let is_all_term_queries = scorers.iter().all(|scorer| scorer.is::<TermScorer>());
+        if is_all_term_queries {
+            let scorers: Vec<TermScorer> = scorers
+                .into_iter()
+                .map(|scorer| *(scorer.downcast::<TermScorer>().map_err(|_| ()).unwrap()))
+                .collect();
+            if scorers
+                .iter()
+                .all(|scorer| scorer.freq_reading_option() == FreqReadingOption::ReadFreq)
+            {
-                Some(SumOrDoNothingCombiner::DoNothing)
+                // Block wand is only available if we read frequencies.
+                return SpecializedScorer::TermUnion(scorers);
             } else {
-                None
-            };
-            if let Some(combiner) = combiner_opt {
-                reader.build_union_scorer_with_sum_combiner(scorers, num_docs, combiner)
-            } else {
-                box_scorer(BufferedUnionScorer::build(
+                return SpecializedScorer::Other(Box::new(BufferedUnionScorer::build(
                     scorers,
                     score_combiner_fn,
                     num_docs,
-                ))
+                )));
             }
         }
     }
+    SpecializedScorer::Other(Box::new(BufferedUnionScorer::build(
+        scorers,
+        score_combiner_fn,
+        num_docs,
+    )))
 }
 
+fn into_box_scorer<TScoreCombiner: ScoreCombiner>(
+    scorer: SpecializedScorer,
+    score_combiner_fn: impl Fn() -> TScoreCombiner,
+    num_docs: u32,
+) -> Box<dyn Scorer> {
+    match scorer {
+        SpecializedScorer::TermUnion(term_scorers) => {
+            let union_scorer =
+                BufferedUnionScorer::build(term_scorers, score_combiner_fn, num_docs);
+            Box::new(union_scorer)
+        }
+        SpecializedScorer::Other(scorer) => scorer,
+    }
+}
 
 /// Returns the effective MUST scorer, accounting for removed AllScorers.
@@ -87,7 +110,7 @@ fn effective_must_scorer(
     if must_scorers.is_empty() {
         if removed_all_scorer_count > 0 {
             // Had AllScorer(s) only - all docs match
-            Some(box_scorer(AllScorer::new(max_doc)))
+            Some(Box::new(AllScorer::new(max_doc)))
         } else {
             // No MUST constraint at all
             None
@@ -105,26 +128,28 @@ fn effective_must_scorer(
 /// When `scoring_enabled` is false, we can just return AllScorer alone since
 /// we don't need score contributions from the should_scorer.
 fn effective_should_scorer_for_union<TScoreCombiner: ScoreCombiner>(
-    should_scorer: Box<dyn Scorer>,
+    should_scorer: SpecializedScorer,
     removed_all_scorer_count: usize,
     max_doc: DocId,
     num_docs: u32,
     score_combiner_fn: impl Fn() -> TScoreCombiner,
     scoring_enabled: bool,
-) -> Box<dyn Scorer> {
+) -> SpecializedScorer {
     if removed_all_scorer_count > 0 {
         if scoring_enabled {
             // Need to union to get score contributions from both
-            let all_scorers: Vec<Box<dyn Scorer>> =
-                vec![should_scorer, box_scorer(AllScorer::new(max_doc))];
-            box_scorer(BufferedUnionScorer::build(
+            let all_scorers: Vec<Box<dyn Scorer>> = vec![
+                into_box_scorer(should_scorer, &score_combiner_fn, num_docs),
+                Box::new(AllScorer::new(max_doc)),
+            ];
+            SpecializedScorer::Other(Box::new(BufferedUnionScorer::build(
                all_scorers,
                score_combiner_fn,
                num_docs,
-            ))
+            )))
         } else {
             // Scoring disabled - AllScorer alone is sufficient
-            box_scorer(AllScorer::new(max_doc))
+            SpecializedScorer::Other(Box::new(AllScorer::new(max_doc)))
         }
     } else {
        should_scorer
@@ -135,9 +160,9 @@ enum ShouldScorersCombinationMethod {
     // Should scorers are irrelevant.
     Ignored,
     // Only contributes to final score.
-    Optional(Box<dyn Scorer>),
+    Optional(SpecializedScorer),
     // Regardless of score, the should scorers may impact whether a document is matching or not.
-    Required(Box<dyn Scorer>),
+    Required(SpecializedScorer),
 }
 
 /// Weight associated to the `BoolQuery`.
@@ -180,7 +205,7 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
 
     fn per_occur_scorers(
         &self,
-        reader: &dyn SegmentReader,
+        reader: &SegmentReader,
         boost: Score,
     ) -> crate::Result<HashMap<Occur, Vec<Box<dyn Scorer>>>> {
         let mut per_occur_scorers: HashMap<Occur, Vec<Box<dyn Scorer>>> = HashMap::new();
@@ -196,10 +221,10 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
 
     fn complex_scorer<TComplexScoreCombiner: ScoreCombiner>(
         &self,
-        reader: &dyn SegmentReader,
+        reader: &SegmentReader,
         boost: Score,
         score_combiner_fn: impl Fn() -> TComplexScoreCombiner,
-    ) -> crate::Result<Box<dyn Scorer>> {
+    ) -> crate::Result<SpecializedScorer> {
         let num_docs = reader.num_docs();
         let mut per_occur_scorers = self.per_occur_scorers(reader, boost)?;
 
@@ -209,7 +234,7 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
         let must_special_scorer_counts = remove_and_count_all_and_empty_scorers(&mut must_scorers);
 
         if must_special_scorer_counts.num_empty_scorers > 0 {
-            return Ok(box_scorer(EmptyScorer));
+            return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)));
         }
 
         let mut should_scorers = per_occur_scorers.remove(&Occur::Should).unwrap_or_default();
@@ -224,7 +249,7 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
 
         if exclude_special_scorer_counts.num_all_scorers > 0 {
             // We exclude all documents at one point.
-            return Ok(box_scorer(EmptyScorer));
+            return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)));
         }
 
         let effective_minimum_number_should_match = self
@@ -236,7 +261,7 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
         if effective_minimum_number_should_match > num_of_should_scorers {
             // We don't have enough scorers to satisfy the minimum number of should matches.
             // The request will match no documents.
-            return Ok(box_scorer(EmptyScorer));
+            return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)));
        }
        match effective_minimum_number_should_match {
            0 if num_of_should_scorers == 0 => ShouldScorersCombinationMethod::Ignored,
@@ -244,13 +269,11 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
                 should_scorers,
                 &score_combiner_fn,
                 num_docs,
-                reader,
             )),
             1 => ShouldScorersCombinationMethod::Required(scorer_union(
                 should_scorers,
                 &score_combiner_fn,
                 num_docs,
-                reader,
             )),
             n if num_of_should_scorers == n => {
                 // When num_of_should_scorers equals the number of should clauses,
@@ -258,10 +281,12 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
                 must_scorers.append(&mut should_scorers);
                 ShouldScorersCombinationMethod::Ignored
             }
-            _ => ShouldScorersCombinationMethod::Required(scorer_disjunction(
-                should_scorers,
-                score_combiner_fn(),
-                effective_minimum_number_should_match,
+            _ => ShouldScorersCombinationMethod::Required(SpecializedScorer::Other(
+                scorer_disjunction(
                    should_scorers,
                    score_combiner_fn(),
                    effective_minimum_number_should_match,
+                ),
            )),
        }
        };
@@ -278,8 +303,8 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
                     reader.max_doc(),
                     num_docs,
                 )
-                .unwrap_or_else(|| box_scorer(EmptyScorer));
-                boxed_scorer
+                .unwrap_or_else(|| Box::new(EmptyScorer));
+                SpecializedScorer::Other(boxed_scorer)
             }
             (ShouldScorersCombinationMethod::Optional(should_scorer), must_scorers) => {
                 // Optional SHOULD: contributes to scoring but not required for matching.
@@ -304,12 +329,16 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
                     Some(must_scorer) => {
                         // Has MUST constraint: SHOULD only affects scoring.
                         if self.scoring_enabled {
-                            box_scorer(RequiredOptionalScorer::<_, _, TScoreCombiner>::new(
+                            SpecializedScorer::Other(Box::new(RequiredOptionalScorer::<
+                                _,
+                                _,
+                                TScoreCombiner,
+                            >::new(
                                 must_scorer,
-                                should_scorer,
-                            ))
+                                into_box_scorer(should_scorer, &score_combiner_fn, num_docs),
+                            )))
                         } else {
-                            must_scorer
+                            SpecializedScorer::Other(must_scorer)
                        }
                    }
                }
@@ -329,7 +358,12 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
                     }
                     Some(must_scorer) => {
                         // Has MUST constraint: intersect MUST with SHOULD.
-                        intersect_scorers(vec![must_scorer, should_scorer], num_docs)
+                        let should_boxed =
+                            into_box_scorer(should_scorer, &score_combiner_fn, num_docs);
+                        SpecializedScorer::Other(intersect_scorers(
+                            vec![must_scorer, should_boxed],
+                            num_docs,
+                        ))
                    }
                }
            }
@@ -338,18 +372,19 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
             return Ok(include_scorer);
         }
 
+        let include_scorer_boxed = into_box_scorer(include_scorer, &score_combiner_fn, num_docs);
         let scorer: Box<dyn Scorer> = if exclude_scorers.len() == 1 {
             let exclude_scorer = exclude_scorers.pop().unwrap();
             match exclude_scorer.downcast::<TermScorer>() {
                 // Cast to TermScorer succeeded
-                Ok(exclude_scorer) => Box::new(Exclude::new(include_scorer, *exclude_scorer)),
+                Ok(exclude_scorer) => Box::new(Exclude::new(include_scorer_boxed, *exclude_scorer)),
                 // We get back the original Box<dyn Scorer>
-                Err(exclude_scorer) => Box::new(Exclude::new(include_scorer, exclude_scorer)),
+                Err(exclude_scorer) => Box::new(Exclude::new(include_scorer_boxed, exclude_scorer)),
             }
         } else {
-            Box::new(Exclude::new(include_scorer, exclude_scorers))
+            Box::new(Exclude::new(include_scorer_boxed, exclude_scorers))
         };
-        Ok(scorer)
+        Ok(SpecializedScorer::Other(scorer))
     }
 }
 
@@ -378,7 +413,8 @@ fn remove_and_count_all_and_empty_scorers(
 }
 
 impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombiner> {
-    fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
+    fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
+        let num_docs = reader.num_docs();
         if self.weights.is_empty() {
             Ok(Box::new(EmptyScorer))
         } else if self.weights.len() == 1 {
@@ -390,12 +426,18 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
             }
         } else if self.scoring_enabled {
             self.complex_scorer(reader, boost, &self.score_combiner_fn)
+                .map(|specialized_scorer| {
+                    into_box_scorer(specialized_scorer, &self.score_combiner_fn, num_docs)
+                })
         } else {
             self.complex_scorer(reader, boost, DoNothingCombiner::default)
+                .map(|specialized_scorer| {
+                    into_box_scorer(specialized_scorer, DoNothingCombiner::default, num_docs)
+                })
         }
     }
 
-    fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result<Explanation> {
+    fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
         let mut scorer = self.scorer(reader, 1.0)?;
         if scorer.seek(doc) != doc {
             return Err(does_not_match(doc));
@@ -417,22 +459,47 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
 
     fn for_each(
         &self,
-        reader: &dyn SegmentReader,
+        reader: &SegmentReader,
         callback: &mut dyn FnMut(DocId, Score),
     ) -> crate::Result<()> {
-        let mut scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn)?;
-        scorer.for_each(callback);
+        let scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn)?;
+        match scorer {
+            SpecializedScorer::TermUnion(term_scorers) => {
+                let mut union_scorer = BufferedUnionScorer::build(
+                    term_scorers,
+                    &self.score_combiner_fn,
+                    reader.num_docs(),
+                );
+                for_each_scorer(&mut union_scorer, callback);
+            }
+            SpecializedScorer::Other(mut scorer) => {
+                for_each_scorer(scorer.as_mut(), callback);
+            }
+        }
         Ok(())
     }
 
     fn for_each_no_score(
         &self,
-        reader: &dyn SegmentReader,
+        reader: &SegmentReader,
         callback: &mut dyn FnMut(&[DocId]),
     ) -> crate::Result<()> {
-        let mut scorer = self.complex_scorer(reader, 1.0, || DoNothingCombiner)?;
+        let scorer = self.complex_scorer(reader, 1.0, || DoNothingCombiner)?;
         let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN];
-        for_each_docset_buffered(scorer.as_mut(), &mut buffer, callback);
 
+        match scorer {
+            SpecializedScorer::TermUnion(term_scorers) => {
+                let mut union_scorer = BufferedUnionScorer::build(
+                    term_scorers,
+                    &self.score_combiner_fn,
+                    reader.num_docs(),
+                );
+                for_each_docset_buffered(&mut union_scorer, &mut buffer, callback);
+            }
+            SpecializedScorer::Other(mut scorer) => {
+                for_each_docset_buffered(scorer.as_mut(), &mut buffer, callback);
+            }
+        }
         Ok(())
     }
 
@@ -449,11 +516,18 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
     fn for_each_pruning(
         &self,
         threshold: Score,
-        reader: &dyn SegmentReader,
+        reader: &SegmentReader,
         callback: &mut dyn FnMut(DocId, Score) -> Score,
     ) -> crate::Result<()> {
         let scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn)?;
-        reader.for_each_pruning(threshold, scorer, callback);
+        match scorer {
+            SpecializedScorer::TermUnion(term_scorers) => {
+                super::block_wand(term_scorers, threshold, callback);
+            }
+            SpecializedScorer::Other(mut scorer) => {
+                for_each_pruning_scorer(scorer.as_mut(), threshold, callback);
+            }
+        }
         Ok(())
     }
 }

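To illustrate the dispatch idea behind the `SpecializedScorer` enum in the hunks above — keep a union made only of term scorers concrete so a pruning loop such as block WAND can run on it, and fall back to a boxed `dyn` scorer otherwise — here is a minimal, self-contained Rust sketch. It is not part of the diff and not tantivy's API: `GenericScorer`, `TermLike`, `Specialized`, and `for_each_doc` are hypothetical stand-ins.

trait GenericScorer {
    fn next_doc(&mut self) -> Option<u32>;
}

struct TermLike {
    docs: Vec<u32>,
    cursor: usize,
}

impl GenericScorer for TermLike {
    fn next_doc(&mut self) -> Option<u32> {
        let doc = self.docs.get(self.cursor).copied();
        self.cursor += 1;
        doc
    }
}

enum Specialized {
    // All clauses are term scorers: keep the concrete type for the pruning path.
    TermUnion(Vec<TermLike>),
    // Anything else: dynamic dispatch behind a Box.
    Other(Box<dyn GenericScorer>),
}

fn for_each_doc(scorer: Specialized, mut callback: impl FnMut(u32)) {
    match scorer {
        Specialized::TermUnion(terms) => {
            // A real implementation would run block-max WAND here; this sketch
            // simply drains each concrete term scorer.
            for mut term in terms {
                while let Some(doc) = term.next_doc() {
                    callback(doc);
                }
            }
        }
        Specialized::Other(mut scorer) => {
            while let Some(doc) = scorer.next_doc() {
                callback(doc);
            }
        }
    }
}

fn main() {
    let union = Specialized::TermUnion(vec![TermLike { docs: vec![1, 4, 9], cursor: 0 }]);
    let mut seen = Vec::new();
    for_each_doc(union, |doc| seen.push(doc));
    assert_eq!(seen, vec![1, 4, 9]);

    let fallback = Specialized::Other(Box::new(TermLike { docs: vec![2, 3], cursor: 0 }));
    let mut seen_other = Vec::new();
    for_each_doc(fallback, |doc| seen_other.push(doc));
    assert_eq!(seen_other, vec![2, 3]);
}

The design point is that the specialization decision is made once, when the scorer is built, so the hot per-document loop pays no extra branching or downcasting.
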
@@ -1,6 +1,8 @@
 mod block_wand;
 mod boolean_query;
 mod boolean_weight;
 
 pub(crate) use self::block_wand::{block_wand, block_wand_single_scorer};
 pub use self::boolean_query::BooleanQuery;
 pub use self::boolean_weight::BooleanWeight;
 

@@ -67,11 +67,11 @@ impl BoostWeight {
 }
 
 impl Weight for BoostWeight {
-    fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
+    fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
         self.weight.scorer(reader, boost * self.boost)
     }
 
-    fn explain(&self, reader: &dyn SegmentReader, doc: u32) -> crate::Result<Explanation> {
+    fn explain(&self, reader: &SegmentReader, doc: u32) -> crate::Result<Explanation> {
         let underlying_explanation = self.weight.explain(reader, doc)?;
         let score = underlying_explanation.value() * self.boost;
         let mut explanation =
@@ -80,7 +80,7 @@ impl Weight for BoostWeight {
         Ok(explanation)
     }
 
-    fn count(&self, reader: &dyn SegmentReader) -> crate::Result<u32> {
+    fn count(&self, reader: &SegmentReader) -> crate::Result<u32> {
         self.weight.count(reader)
     }
 }
 
@@ -1,7 +1,7 @@
 use std::fmt;
 
 use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
-use crate::query::{box_scorer, EnableScoring, Explanation, Query, Scorer, Weight};
+use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight};
 use crate::{DocId, DocSet, Score, SegmentReader, TantivyError, Term};
 
 /// `ConstScoreQuery` is a wrapper over a query to provide a constant score.
@@ -63,15 +63,12 @@ impl ConstWeight {
 }
 
 impl Weight for ConstWeight {
-    fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
+    fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
         let inner_scorer = self.weight.scorer(reader, boost)?;
-        Ok(box_scorer(ConstScorer::new(
-            inner_scorer,
-            boost * self.score,
-        )))
+        Ok(Box::new(ConstScorer::new(inner_scorer, boost * self.score)))
     }
 
-    fn explain(&self, reader: &dyn SegmentReader, doc: u32) -> crate::Result<Explanation> {
+    fn explain(&self, reader: &SegmentReader, doc: u32) -> crate::Result<Explanation> {
         let mut scorer = self.scorer(reader, 1.0)?;
         if scorer.seek(doc) != doc {
             return Err(TantivyError::InvalidArgument(format!(
@@ -84,7 +81,7 @@ impl Weight for ConstWeight {
         Ok(explanation)
     }
 
-    fn count(&self, reader: &dyn SegmentReader) -> crate::Result<u32> {
+    fn count(&self, reader: &SegmentReader) -> crate::Result<u32> {
         self.weight.count(reader)
     }
 }
 
@@ -2,7 +2,7 @@ use super::Scorer;
 use crate::docset::TERMINATED;
 use crate::index::SegmentReader;
 use crate::query::explanation::does_not_match;
-use crate::query::{box_scorer, EnableScoring, Explanation, Query, Weight};
+use crate::query::{EnableScoring, Explanation, Query, Weight};
 use crate::{DocId, DocSet, Score, Searcher};
 
 /// `EmptyQuery` is a dummy `Query` in which no document matches.
@@ -26,11 +26,11 @@ impl Query for EmptyQuery {
 /// It is useful for tests and handling edge cases.
 pub struct EmptyWeight;
 impl Weight for EmptyWeight {
-    fn scorer(&self, _reader: &dyn SegmentReader, _boost: Score) -> crate::Result<Box<dyn Scorer>> {
-        Ok(box_scorer(EmptyScorer))
+    fn scorer(&self, _reader: &SegmentReader, _boost: Score) -> crate::Result<Box<dyn Scorer>> {
+        Ok(Box::new(EmptyScorer))
     }
 
-    fn explain(&self, _reader: &dyn SegmentReader, doc: DocId) -> crate::Result<Explanation> {
+    fn explain(&self, _reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
         Err(does_not_match(doc))
     }
 }
 
@@ -3,7 +3,7 @@ use core::fmt::Debug;
 use columnar::{ColumnIndex, DynamicColumn};
 use common::BitSet;
 
-use super::{box_scorer, ConstScorer, EmptyScorer};
+use super::{ConstScorer, EmptyScorer};
 use crate::docset::{DocSet, TERMINATED};
 use crate::index::SegmentReader;
 use crate::query::all_query::AllScorer;
@@ -98,7 +98,7 @@ pub struct ExistsWeight {
 }
 
 impl Weight for ExistsWeight {
-    fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
+    fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
         let fast_field_reader = reader.fast_fields();
         let mut column_handles = fast_field_reader.dynamic_column_handles(&self.field_name)?;
         if self.field_type == Type::Json && self.json_subpaths {
@@ -117,7 +117,7 @@ impl Weight for ExistsWeight {
             }
         }
         if non_empty_columns.is_empty() {
-            return Ok(box_scorer(EmptyScorer));
+            return Ok(Box::new(EmptyScorer));
         }
 
         // If any column is full, all docs match.
@@ -128,9 +128,9 @@ impl Weight for ExistsWeight {
         {
             let all_scorer = AllScorer::new(max_doc);
             if boost != 1.0f32 {
-                return Ok(box_scorer(BoostScorer::new(all_scorer, boost)));
+                return Ok(Box::new(BoostScorer::new(all_scorer, boost)));
             } else {
-                return Ok(box_scorer(all_scorer));
+                return Ok(Box::new(all_scorer));
             }
         }
 
@@ -138,7 +138,7 @@ impl Weight for ExistsWeight {
         // NOTE: A lower number may be better for very sparse columns
         if non_empty_columns.len() < 4 {
             let docset = ExistsDocSet::new(non_empty_columns, reader.max_doc());
-            return Ok(box_scorer(ConstScorer::new(docset, boost)));
+            return Ok(Box::new(ConstScorer::new(docset, boost)));
         }
 
         // If we have many dynamic columns, precompute a bitset of matching docs
@@ -162,10 +162,10 @@ impl Weight for ExistsWeight {
             }
         }
         let docset = BitSetDocSet::from(doc_bitset);
-        Ok(box_scorer(ConstScorer::new(docset, boost)))
+        Ok(Box::new(ConstScorer::new(docset, boost)))
     }
 
-    fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result<Explanation> {
+    fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
         let mut scorer = self.scorer(reader, 1.0)?;
         if scorer.seek(doc) != doc {
             return Err(does_not_match(doc));

Some files were not shown because too many files have changed in this diff.