mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-05 00:32:55 +00:00
Compare commits
2 Commits
term_agg_b
...
trinity--p
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
116c6d3621 | ||
|
|
0f20787917 |
@@ -11,7 +11,7 @@ use crate::aggregation::agg_req_with_accessor::{
|
||||
use crate::aggregation::intermediate_agg_result::{
|
||||
IntermediateBucketResult, IntermediateTermBucketEntry, IntermediateTermBucketResult,
|
||||
};
|
||||
use crate::aggregation::segment_agg_result::SegmentAggregationResultsCollector;
|
||||
use crate::aggregation::segment_agg_result::{BucketCount, SegmentAggregationResultsCollector};
|
||||
use crate::error::DataCorruption;
|
||||
use crate::fastfield::MultiValuedFastFieldReader;
|
||||
use crate::schema::Type;
|
||||
@@ -268,18 +268,21 @@ impl TermBuckets {
|
||||
term_ids: &[u64],
|
||||
doc: DocId,
|
||||
sub_aggregation: &AggregationsWithAccessor,
|
||||
bucket_count: &BucketCount,
|
||||
blueprint: &Option<SegmentAggregationResultsCollector>,
|
||||
) -> crate::Result<()> {
|
||||
for &term_id in term_ids {
|
||||
let entry = self
|
||||
.entries
|
||||
.entry(term_id as u32)
|
||||
.or_insert_with(|| TermBucketEntry::from_blueprint(blueprint));
|
||||
let entry = self.entries.entry(term_id as u32).or_insert_with(|| {
|
||||
bucket_count.add_count(1);
|
||||
|
||||
TermBucketEntry::from_blueprint(blueprint)
|
||||
});
|
||||
entry.doc_count += 1;
|
||||
if let Some(sub_aggregations) = entry.sub_aggregations.as_mut() {
|
||||
sub_aggregations.collect(doc, sub_aggregation)?;
|
||||
}
|
||||
}
|
||||
bucket_count.validate_bucket_count()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -369,7 +372,7 @@ impl SegmentTermCollector {
|
||||
}
|
||||
OrderTarget::SubAggregation(_name) => {
|
||||
// don't sort and cut off since it's hard to make assumptions on the quality of the
|
||||
// results when cutting off due to unknown nature of the sub_aggregation (possible
|
||||
// results when cutting off du to unknown nature of the sub_aggregation (possible
|
||||
// to check).
|
||||
}
|
||||
OrderTarget::Count => {
|
||||
@@ -409,10 +412,6 @@ impl SegmentTermCollector {
|
||||
if self.req.min_doc_count == 0 {
|
||||
let mut stream = term_dict.stream()?;
|
||||
while let Some((key, _ord)) = stream.next() {
|
||||
if dict.len() >= self.req.segment_size as usize {
|
||||
break;
|
||||
}
|
||||
|
||||
let key = std::str::from_utf8(key)
|
||||
.map_err(|utf8_err| DataCorruption::comment_only(utf8_err.to_string()))?;
|
||||
if !dict.contains_key(key) {
|
||||
@@ -434,8 +433,6 @@ impl SegmentTermCollector {
|
||||
sum_other_doc_count += sum_other_docs;
|
||||
dict = dict_entries.into_iter().collect();
|
||||
}
|
||||
agg_with_accessor.bucket_count.add_count(dict.len() as u32);
|
||||
agg_with_accessor.bucket_count.validate_bucket_count()?;
|
||||
|
||||
Ok(IntermediateBucketResult::Terms(
|
||||
IntermediateTermBucketResult {
|
||||
@@ -472,24 +469,28 @@ impl SegmentTermCollector {
|
||||
&vals1,
|
||||
docs[0],
|
||||
&bucket_with_accessor.sub_aggregation,
|
||||
&bucket_with_accessor.bucket_count,
|
||||
&self.blueprint,
|
||||
)?;
|
||||
self.term_buckets.increment_bucket(
|
||||
&vals2,
|
||||
docs[1],
|
||||
&bucket_with_accessor.sub_aggregation,
|
||||
&bucket_with_accessor.bucket_count,
|
||||
&self.blueprint,
|
||||
)?;
|
||||
self.term_buckets.increment_bucket(
|
||||
&vals3,
|
||||
docs[2],
|
||||
&bucket_with_accessor.sub_aggregation,
|
||||
&bucket_with_accessor.bucket_count,
|
||||
&self.blueprint,
|
||||
)?;
|
||||
self.term_buckets.increment_bucket(
|
||||
&vals4,
|
||||
docs[3],
|
||||
&bucket_with_accessor.sub_aggregation,
|
||||
&bucket_with_accessor.bucket_count,
|
||||
&self.blueprint,
|
||||
)?;
|
||||
}
|
||||
@@ -500,6 +501,7 @@ impl SegmentTermCollector {
|
||||
&vals1,
|
||||
doc,
|
||||
&bucket_with_accessor.sub_aggregation,
|
||||
&bucket_with_accessor.bucket_count,
|
||||
&self.blueprint,
|
||||
)?;
|
||||
}
|
||||
@@ -1134,33 +1136,6 @@ mod tests {
|
||||
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
|
||||
assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);
|
||||
|
||||
let agg_req: Aggregations = vec![(
|
||||
"my_texts".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
|
||||
field: "string_id".to_string(),
|
||||
min_doc_count: Some(0),
|
||||
size: Some(1),
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
// searching for terma, but min_doc_count will return all terms
|
||||
let res = exec_request_with_query(agg_req, &index, Some(("string_id", "terma")))?;
|
||||
|
||||
assert_eq!(res["my_texts"]["buckets"][0]["key"], "terma");
|
||||
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 4);
|
||||
assert_eq!(
|
||||
res["my_texts"]["buckets"][1]["key"],
|
||||
serde_json::Value::Null
|
||||
);
|
||||
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
|
||||
assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1239,27 +1214,6 @@ mod tests {
|
||||
|
||||
let index = get_test_index_from_terms(true, &terms_per_segment)?;
|
||||
|
||||
let agg_req: Aggregations = vec![(
|
||||
"my_texts".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
|
||||
field: "string_id".to_string(),
|
||||
// min_doc_count: Some(0),
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let res = exec_request_with_query(agg_req, &index, None);
|
||||
|
||||
assert!(res.is_ok());
|
||||
|
||||
// This request has min_doc_count set to 0
|
||||
// That means we load potentially the whole dict
|
||||
// Make sure the bucket count is still fine
|
||||
let agg_req: Aggregations = vec![(
|
||||
"my_texts".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
@@ -1274,24 +1228,6 @@ mod tests {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let res = exec_request_with_query(agg_req, &index, None);
|
||||
assert!(res.is_ok());
|
||||
|
||||
let agg_req: Aggregations = vec![(
|
||||
"my_texts".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
|
||||
field: "string_id".to_string(),
|
||||
// min_doc_count: Some(0),
|
||||
size: Some(70_000),
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let res = exec_request_with_query(agg_req, &index, None);
|
||||
|
||||
assert!(res.is_err());
|
||||
@@ -1448,10 +1384,14 @@ mod bench {
|
||||
let mut collector = get_collector_with_buckets(total_terms);
|
||||
let vals = get_rand_terms(total_terms, num_terms);
|
||||
let aggregations_with_accessor: AggregationsWithAccessor = Default::default();
|
||||
let bucket_count: BucketCount = BucketCount {
|
||||
bucket_count: Default::default(),
|
||||
max_bucket_count: 1_000_001u32,
|
||||
};
|
||||
b.iter(|| {
|
||||
for &val in &vals {
|
||||
collector
|
||||
.increment_bucket(&[val], 0, &aggregations_with_accessor, &None)
|
||||
.increment_bucket(&[val], 0, &aggregations_with_accessor, &bucket_count, &None)
|
||||
.unwrap();
|
||||
}
|
||||
})
|
||||
|
||||
@@ -249,7 +249,7 @@ impl SearcherInner {
|
||||
index: Index,
|
||||
segment_readers: Vec<SegmentReader>,
|
||||
generation: TrackedObject<SearcherGeneration>,
|
||||
doc_store_cache_size: usize,
|
||||
doc_store_cache_num_blocks: usize,
|
||||
) -> io::Result<SearcherInner> {
|
||||
assert_eq!(
|
||||
&segment_readers
|
||||
@@ -261,7 +261,7 @@ impl SearcherInner {
|
||||
);
|
||||
let store_readers: Vec<StoreReader> = segment_readers
|
||||
.iter()
|
||||
.map(|segment_reader| segment_reader.get_store_reader(doc_store_cache_size))
|
||||
.map(|segment_reader| segment_reader.get_store_reader(doc_store_cache_num_blocks))
|
||||
.collect::<io::Result<Vec<_>>>()?;
|
||||
|
||||
Ok(SearcherInner {
|
||||
|
||||
@@ -134,9 +134,12 @@ impl SegmentReader {
|
||||
&self.fieldnorm_readers
|
||||
}
|
||||
|
||||
/// Accessor to the segment's `StoreReader`.
|
||||
pub fn get_store_reader(&self, cache_size: usize) -> io::Result<StoreReader> {
|
||||
StoreReader::open(self.store_file.clone(), cache_size)
|
||||
/// Accessor to the segment's [`StoreReader`](crate::store::StoreReader).
|
||||
///
|
||||
/// `cache_num_blocks` sets the number of decompressed blocks to be cached in an LRU.
|
||||
/// The size of blocks is configurable, this should be reflexted in the
|
||||
pub fn get_store_reader(&self, cache_num_blocks: usize) -> io::Result<StoreReader> {
|
||||
StoreReader::open(self.store_file.clone(), cache_num_blocks)
|
||||
}
|
||||
|
||||
/// Open a new segment for reading.
|
||||
|
||||
@@ -44,7 +44,7 @@ pub struct IndexReaderBuilder {
|
||||
index: Index,
|
||||
warmers: Vec<Weak<dyn Warmer>>,
|
||||
num_warming_threads: usize,
|
||||
doc_store_cache_size: usize,
|
||||
doc_store_cache_num_blocks: usize,
|
||||
}
|
||||
|
||||
impl IndexReaderBuilder {
|
||||
@@ -55,7 +55,7 @@ impl IndexReaderBuilder {
|
||||
index,
|
||||
warmers: Vec::new(),
|
||||
num_warming_threads: 1,
|
||||
doc_store_cache_size: DOCSTORE_CACHE_CAPACITY,
|
||||
doc_store_cache_num_blocks: DOCSTORE_CACHE_CAPACITY,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -72,7 +72,7 @@ impl IndexReaderBuilder {
|
||||
searcher_generation_inventory.clone(),
|
||||
)?;
|
||||
let inner_reader = InnerIndexReader::new(
|
||||
self.doc_store_cache_size,
|
||||
self.doc_store_cache_num_blocks,
|
||||
self.index,
|
||||
warming_state,
|
||||
searcher_generation_inventory,
|
||||
@@ -119,8 +119,11 @@ impl IndexReaderBuilder {
|
||||
///
|
||||
/// The doc store readers cache by default DOCSTORE_CACHE_CAPACITY(100) decompressed blocks.
|
||||
#[must_use]
|
||||
pub fn doc_store_cache_size(mut self, doc_store_cache_size: usize) -> IndexReaderBuilder {
|
||||
self.doc_store_cache_size = doc_store_cache_size;
|
||||
pub fn doc_store_cache_num_blocks(
|
||||
mut self,
|
||||
doc_store_cache_num_blocks: usize,
|
||||
) -> IndexReaderBuilder {
|
||||
self.doc_store_cache_num_blocks = doc_store_cache_num_blocks;
|
||||
self
|
||||
}
|
||||
|
||||
@@ -151,7 +154,7 @@ impl TryInto<IndexReader> for IndexReaderBuilder {
|
||||
}
|
||||
|
||||
struct InnerIndexReader {
|
||||
doc_store_cache_size: usize,
|
||||
doc_store_cache_num_blocks: usize,
|
||||
index: Index,
|
||||
warming_state: WarmingState,
|
||||
searcher: arc_swap::ArcSwap<SearcherInner>,
|
||||
@@ -161,7 +164,7 @@ struct InnerIndexReader {
|
||||
|
||||
impl InnerIndexReader {
|
||||
fn new(
|
||||
doc_store_cache_size: usize,
|
||||
doc_store_cache_num_blocks: usize,
|
||||
index: Index,
|
||||
warming_state: WarmingState,
|
||||
// The searcher_generation_inventory is not used as source, but as target to track the
|
||||
@@ -172,13 +175,13 @@ impl InnerIndexReader {
|
||||
|
||||
let searcher = Self::create_searcher(
|
||||
&index,
|
||||
doc_store_cache_size,
|
||||
doc_store_cache_num_blocks,
|
||||
&warming_state,
|
||||
&searcher_generation_counter,
|
||||
&searcher_generation_inventory,
|
||||
)?;
|
||||
Ok(InnerIndexReader {
|
||||
doc_store_cache_size,
|
||||
doc_store_cache_num_blocks,
|
||||
index,
|
||||
warming_state,
|
||||
searcher: ArcSwap::from(searcher),
|
||||
@@ -214,7 +217,7 @@ impl InnerIndexReader {
|
||||
|
||||
fn create_searcher(
|
||||
index: &Index,
|
||||
doc_store_cache_size: usize,
|
||||
doc_store_cache_num_blocks: usize,
|
||||
warming_state: &WarmingState,
|
||||
searcher_generation_counter: &Arc<AtomicU64>,
|
||||
searcher_generation_inventory: &Inventory<SearcherGeneration>,
|
||||
@@ -232,7 +235,7 @@ impl InnerIndexReader {
|
||||
index.clone(),
|
||||
segment_readers,
|
||||
searcher_generation,
|
||||
doc_store_cache_size,
|
||||
doc_store_cache_num_blocks,
|
||||
)?);
|
||||
|
||||
warming_state.warm_new_searcher_generation(&searcher.clone().into())?;
|
||||
@@ -242,7 +245,7 @@ impl InnerIndexReader {
|
||||
fn reload(&self) -> crate::Result<()> {
|
||||
let searcher = Self::create_searcher(
|
||||
&self.index,
|
||||
self.doc_store_cache_size,
|
||||
self.doc_store_cache_num_blocks,
|
||||
&self.warming_state,
|
||||
&self.searcher_generation_counter,
|
||||
&self.searcher_generation_inventory,
|
||||
|
||||
@@ -375,7 +375,7 @@ where B: AsRef<[u8]>
|
||||
///
|
||||
/// Do NOT rely on this byte representation in the index.
|
||||
/// This value is likely to change in the future.
|
||||
pub(crate) fn as_slice(&self) -> &[u8] {
|
||||
pub fn as_slice(&self) -> &[u8] {
|
||||
self.0.as_ref()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,8 +4,8 @@
|
||||
//! order to be handled in the `Store`.
|
||||
//!
|
||||
//! Internally, documents (or rather their stored fields) are serialized to a buffer.
|
||||
//! When the buffer exceeds 16K, the buffer is compressed using `brotli`, `LZ4` or `snappy`
|
||||
//! and the resulting block is written to disk.
|
||||
//! When the buffer exceeds `block_size` (defaults to 16K), the buffer is compressed using `brotli`,
|
||||
//! `LZ4` or `snappy` and the resulting block is written to disk.
|
||||
//!
|
||||
//! One can then request for a specific `DocId`.
|
||||
//! A skip list helps navigating to the right block,
|
||||
@@ -28,8 +28,6 @@
|
||||
//! - at the segment level, the
|
||||
//! [`SegmentReader`'s `doc` method](../struct.SegmentReader.html#method.doc)
|
||||
//! - at the index level, the [`Searcher::doc()`](crate::Searcher::doc) method
|
||||
//!
|
||||
//! !
|
||||
|
||||
mod compressors;
|
||||
mod decompressors;
|
||||
|
||||
@@ -114,7 +114,10 @@ impl Sum for CacheStats {
|
||||
|
||||
impl StoreReader {
|
||||
/// Opens a store reader
|
||||
pub fn open(store_file: FileSlice, cache_size: usize) -> io::Result<StoreReader> {
|
||||
///
|
||||
/// `cache_num_blocks` sets the number of decompressed blocks to be cached in an LRU.
|
||||
/// The size of blocks is configurable, this should be reflexted in the
|
||||
pub fn open(store_file: FileSlice, cache_num_blocks: usize) -> io::Result<StoreReader> {
|
||||
let (footer, data_and_offset) = DocStoreFooter::extract_footer(store_file)?;
|
||||
|
||||
let (data_file, offset_index_file) = data_and_offset.split(footer.offset as usize);
|
||||
@@ -125,8 +128,8 @@ impl StoreReader {
|
||||
decompressor: footer.decompressor,
|
||||
data: data_file,
|
||||
cache: BlockCache {
|
||||
cache: NonZeroUsize::new(cache_size)
|
||||
.map(|cache_size| Mutex::new(LruCache::new(cache_size))),
|
||||
cache: NonZeroUsize::new(cache_num_blocks)
|
||||
.map(|cache_num_blocks| Mutex::new(LruCache::new(cache_num_blocks))),
|
||||
cache_hits: Default::default(),
|
||||
cache_misses: Default::default(),
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user