From 3cbcb5d0aae28fb5556ddcb67930aac0a5d4faad Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Mon, 30 Mar 2026 11:03:48 +0800 Subject: [PATCH] Abstract tantivy's data storage behind traits for pluggable backends Extract trait interfaces from tantivy's core reader types so that alternative storage backends (e.g. Quickwit) can provide their own implementations while tantivy's query engine works through dynamic dispatch. Reader trait extraction: - SegmentReader is now a trait; the concrete implementation is renamed to TantivySegmentReader. - DynInvertedIndexReader trait for object-safe dynamic dispatch, plus a typed InvertedIndexReader trait with associated Postings/DocSet types for static dispatch. The concrete reader becomes TantivyInvertedIndexReader. - StoreReader is now a trait; the concrete implementation is renamed to TantivyStoreReader. get() returns TantivyDocument directly instead of requiring a generic DocumentDeserialize bound. Typed downcast for performance-critical paths: - try_downcast_and_call() + TypedInvertedIndexReaderCb allow query weights (TermWeight, PhraseWeight) to attempt a downcast to the concrete TantivyInvertedIndexReader, obtaining typed postings for zero-cost scoring, and falling back to the dynamic path otherwise. - TermScorer is now generic over its postings type. - PostingsWithBlockMax trait enables block-max WAND acceleration through the trait boundary. - block_wand() and block_wand_single_scorer() are generic over PostingsWithBlockMax, and for_each_pruning is dispatched through the SegmentReader trait so custom backends can provide their own block-max implementations. Searcher decoupled from Index: - New SearcherContext holds schema, executor, and tokenizers. - Searcher can be constructed from Vec<Arc<dyn SegmentReader>> via Searcher::from_segment_readers(), without needing an Index. - Searcher::index() is deprecated in favor of Searcher::context(). 
Postings and DocSet changes: - Postings trait gains doc_freq() -> DocFreq (Exact/Approximate) and has_freq(). - RawPostingsData struct carries raw postings bytes across the trait boundary for custom reader implementations. - BlockSegmentPostings::open() takes OwnedBytes instead of FileSlice. - DocSet gains fill_bitset() method. Scorer improvements: - Scorer trait absorbs for_each, for_each_pruning, and explain (previously free functions or on Weight). - box_scorer() helper avoids double-boxing Box<dyn Scorer>. - BoxedTermScorer wraps a type-erased term scorer. - BufferedUnionScorer initialization fixed to avoid an extra advance() on construction. Other changes: - Document::to_json() now returns serde_json::Value; the old string serialization is renamed to to_serialized_json(). - DocumentDeserialize removed from the store reader public API. --- Cargo.toml | 5 + benches/fill_bitset.rs | 106 ++ benches/str_search_and_get.rs | 3 +- common/src/bitset.rs | 11 + examples/custom_collector.rs | 2 +- examples/date_time_field.rs | 2 +- examples/faceted_search_with_tweaked_score.rs | 4 +- examples/iterating_docs_and_positions.rs | 38 +- examples/phrase_prefix_search.rs | 2 +- examples/snippet.rs | 2 +- examples/warmer.rs | 4 +- src/aggregation/accessor_helpers.rs | 6 +- src/aggregation/agg_data.rs | 14 +- src/aggregation/bucket/composite/accessors.rs | 2 +- src/aggregation/bucket/filter.rs | 12 +- src/aggregation/bucket/term_agg.rs | 2 +- src/aggregation/collector.rs | 6 +- src/collector/count_collector.rs | 2 +- src/collector/docset_collector.rs | 4 +- src/collector/facet_collector.rs | 2 +- src/collector/filter_collector_wrapper.rs | 4 +- src/collector/histogram_collector.rs | 4 +- src/collector/mod.rs | 14 +- src/collector/multi_collector.rs | 4 +- src/collector/sort_key/order.rs | 6 +- src/collector/sort_key/sort_by_bytes.rs | 2 +- src/collector/sort_key/sort_by_erased_type.rs | 4 +- src/collector/sort_key/sort_by_score.rs | 6 +- .../sort_key/sort_by_static_fast_value.rs | 2 +- 
src/collector/sort_key/sort_by_string.rs | 4 +- src/collector/sort_key/sort_key_computer.rs | 22 +- src/collector/sort_key_top_collector.rs | 8 +- src/collector/tests.rs | 8 +- src/collector/top_score_collector.rs | 10 +- src/core/json_utils.rs | 6 +- src/core/mod.rs | 2 +- src/core/searcher.rs | 216 +++- src/core/tests.rs | 38 +- src/directory/composite_file.rs | 4 +- src/docset.rs | 58 +- src/fastfield/facet_reader.rs | 6 +- src/fastfield/mod.rs | 4 +- src/fastfield/readers.rs | 9 +- src/index/index.rs | 33 +- src/index/index_meta.rs | 66 +- src/index/inverted_index_reader.rs | 923 ++++++++++++------ src/index/mod.rs | 8 +- src/index/segment.rs | 2 +- src/index/segment_id.rs | 2 +- src/index/segment_reader.rs | 460 +++++---- src/indexer/delete_queue.rs | 8 +- src/indexer/index_writer.rs | 38 +- src/indexer/merge_index_test.rs | 27 +- src/indexer/merger.rs | 201 ++-- src/indexer/segment_updater.rs | 24 +- src/indexer/segment_writer.rs | 22 +- src/indexer/single_segment_index_writer.rs | 6 +- src/lib.rs | 16 +- src/postings/block_segment_postings.rs | 380 +++---- src/postings/json_postings_writer.rs | 6 - src/postings/loaded_postings.rs | 22 +- src/postings/mod.rs | 84 +- src/postings/per_field_postings_writer.rs | 19 +- src/postings/postings.rs | 41 + src/postings/postings_writer.rs | 148 ++- src/postings/recorder.rs | 8 +- src/postings/segment_postings.rs | 127 +-- src/postings/serializer.rs | 2 +- src/postings/skip.rs | 17 - src/query/all_query.rs | 10 +- src/query/automaton_weight.rs | 47 +- src/query/bitset/mod.rs | 7 + src/query/boolean_query/block_wand.rs | 62 +- src/query/boolean_query/boolean_weight.rs | 240 ++--- src/query/boolean_query/mod.rs | 20 +- src/query/boost_query.rs | 6 +- src/query/const_score_query.rs | 13 +- src/query/empty_query.rs | 8 +- src/query/exist_query.rs | 16 +- src/query/intersection.rs | 10 +- src/query/mod.rs | 12 +- src/query/more_like_this/more_like_this.rs | 6 +- .../phrase_prefix_scorer.rs | 16 +- .../phrase_prefix_weight.rs 
| 61 +- src/query/phrase_query/mod.rs | 2 +- src/query/phrase_query/phrase_query.rs | 2 +- src/query/phrase_query/phrase_scorer.rs | 32 +- src/query/phrase_query/phrase_weight.rs | 126 ++- src/query/phrase_query/regex_phrase_weight.rs | 57 +- src/query/query.rs | 2 +- src/query/range_query/range_query.rs | 31 +- .../range_query/range_query_fastfield.rs | 37 +- src/query/scorer.rs | 88 +- src/query/term_query/mod.rs | 4 +- src/query/term_query/term_scorer.rs | 186 ++-- src/query/term_query/term_weight.rs | 127 ++- src/query/union/bitset_union.rs | 15 +- src/query/union/buffered_union.rs | 134 ++- src/query/union/simple_union.rs | 24 +- src/query/weight.rs | 63 +- src/reader/mod.rs | 22 +- src/schema/document/default_document.rs | 5 +- src/schema/document/mod.rs | 27 +- src/schema/document/owned_value.rs | 20 +- src/schema/field_type.rs | 5 +- src/schema/index_record_option.rs | 2 +- src/schema/schema.rs | 15 +- src/schema/term.rs | 2 +- src/snippet/mod.rs | 2 +- src/space_usage/mod.rs | 8 +- src/store/index/mod.rs | 4 +- src/store/mod.rs | 45 +- src/store/reader.rs | 137 ++- src/store/store_compressor.rs | 59 +- src/store/writer.rs | 30 +- 115 files changed, 3250 insertions(+), 1957 deletions(-) create mode 100644 benches/fill_bitset.rs diff --git a/Cargo.toml b/Cargo.toml index ee308b842..1815edacb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -201,3 +201,8 @@ harness = false [[bench]] name = "regex_all_terms" harness = false + +[[bench]] +name = "fill_bitset" +harness = false + diff --git a/benches/fill_bitset.rs b/benches/fill_bitset.rs new file mode 100644 index 000000000..0f2c7ea53 --- /dev/null +++ b/benches/fill_bitset.rs @@ -0,0 +1,106 @@ +use binggan::{black_box, BenchRunner, PeakMemAlloc, INSTRUMENTED_SYSTEM}; +use common::BitSet; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; +use tantivy::postings::BlockSegmentPostings; +use tantivy::schema::*; +use tantivy::{doc, DocSet as _, Index, InvertedIndexReader as _, TantivyDocument}; + 
+#[global_allocator] +pub static GLOBAL: &PeakMemAlloc = &INSTRUMENTED_SYSTEM; + +fn main() { + let index = build_test_index(); + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + let segment_reader = &searcher.segment_readers()[0]; + let text_field = index.schema().get_field("text").unwrap(); + let inverted_index = segment_reader.inverted_index(text_field).unwrap(); + let max_doc = segment_reader.max_doc(); + + let term = Term::from_field_text(text_field, "hello"); + let term_info = inverted_index.get_term_info(&term).unwrap().unwrap(); + + let mut runner = BenchRunner::new(); + runner.set_name("fill_bitset"); + + let mut group = runner.new_group(); + { + let inverted_index = &inverted_index; + let term_info = &term_info; + // This is the path used by queries (AutomatonWeight, RangeQuery, etc.) + // It dispatches via DynInvertedIndexReader::fill_bitset_from_terminfo. + group.register("fill_bitset_from_terminfo (via trait)", move |_| { + let mut bitset = BitSet::with_max_value(max_doc); + inverted_index + .fill_bitset_from_terminfo(term_info, &mut bitset) + .unwrap(); + black_box(bitset); + }); + } + { + let inverted_index = &inverted_index; + let term_info = &term_info; + // This constructs a SegmentPostings via read_docset_from_terminfo and calls fill_bitset. + group.register("read_docset + fill_bitset", move |_| { + let mut postings = inverted_index.read_docset_from_terminfo(term_info).unwrap(); + let mut bitset = BitSet::with_max_value(max_doc); + postings.fill_bitset(&mut bitset); + black_box(bitset); + }); + } + { + let inverted_index = &inverted_index; + let term_info = &term_info; + // This uses BlockSegmentPostings directly, bypassing SegmentPostings entirely. 
+ group.register("BlockSegmentPostings direct", move |_| { + let raw = inverted_index + .read_raw_postings_data(term_info, IndexRecordOption::Basic) + .unwrap(); + let mut block_postings = BlockSegmentPostings::open( + term_info.doc_freq, + raw.postings_data, + raw.record_option, + raw.effective_option, + ) + .unwrap(); + let mut bitset = BitSet::with_max_value(max_doc); + loop { + let docs = block_postings.docs(); + if docs.is_empty() { + break; + } + for &doc in docs { + bitset.insert(doc); + } + block_postings.advance(); + } + black_box(bitset); + }); + } + group.run(); +} + +fn build_test_index() -> Index { + let mut schema_builder = Schema::builder(); + schema_builder.add_text_field("text", TEXT); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema.clone()); + let text_field = schema.get_field("text").unwrap(); + + let mut writer = index.writer::(250_000_000).unwrap(); + let mut rng = StdRng::from_seed([42u8; 32]); + for _ in 0..100_000 { + if rng.random_bool(0.5) { + writer + .add_document(doc!(text_field => "hello world")) + .unwrap(); + } else { + writer + .add_document(doc!(text_field => "goodbye world")) + .unwrap(); + } + } + writer.commit().unwrap(); + index +} diff --git a/benches/str_search_and_get.rs b/benches/str_search_and_get.rs index ffb9768cd..d6daa7852 100644 --- a/benches/str_search_and_get.rs +++ b/benches/str_search_and_get.rs @@ -17,7 +17,6 @@ use rand::rngs::StdRng; use rand::SeedableRng; use tantivy::collector::{Count, DocSetCollector}; use tantivy::query::RangeQuery; -use tantivy::schema::document::TantivyDocument; use tantivy::schema::{Schema, Value, FAST, STORED, STRING}; use tantivy::{doc, Index, ReloadPolicy, Searcher, Term}; @@ -406,7 +405,7 @@ impl FetchAllStringsFromDocTask { for doc_address in docs { // Get the document from the doc store (row store access) - if let Ok(doc) = self.searcher.doc::(doc_address) { + if let Ok(doc) = self.searcher.doc(doc_address) { // Extract string values from the stored 
field if let Some(field_value) = doc.get_first(str_stored_field) { if let Some(text) = field_value.as_value().as_str() { diff --git a/common/src/bitset.rs b/common/src/bitset.rs index cf719e53a..aa2021165 100644 --- a/common/src/bitset.rs +++ b/common/src/bitset.rs @@ -193,6 +193,8 @@ impl TinySet { #[derive(Clone)] pub struct BitSet { tinysets: Box<[TinySet]>, + // Tracking `len` on every insert/remove adds overhead even when `len()` is never called. + // Consider removing if `len()` usage is rare or not on a hot path. len: u64, max_value: u32, } @@ -252,6 +254,7 @@ impl BitSet { /// Removes all elements from the `BitSet`. pub fn clear(&mut self) { + self.len = 0; for tinyset in self.tinysets.iter_mut() { *tinyset = TinySet::empty(); } @@ -271,6 +274,11 @@ impl BitSet { } } + /// Estimate the heap memory consumption of this `BitSet` in bytes. + pub fn get_memory_consumption(&self) -> usize { + self.tinysets.len() * std::mem::size_of::() + } + /// Returns the number of elements in the `BitSet`. #[inline] pub fn len(&self) -> usize { @@ -314,6 +322,9 @@ impl BitSet { .map(|delta_bucket| bucket + delta_bucket as u32) } + /// Returns the maximum number of elements in the bitset. + /// + /// Warning: The largest element the bitset can contain is `max_value - 1`. 
#[inline] pub fn max_value(&self) -> u32 { self.max_value diff --git a/examples/custom_collector.rs b/examples/custom_collector.rs index 29b606930..355e134ba 100644 --- a/examples/custom_collector.rs +++ b/examples/custom_collector.rs @@ -70,7 +70,7 @@ impl Collector for StatsCollector { fn for_segment( &self, _segment_local_id: u32, - segment_reader: &SegmentReader, + segment_reader: &dyn SegmentReader, ) -> tantivy::Result { let fast_field_reader = segment_reader.fast_fields().u64(&self.field)?; Ok(StatsSegmentCollector { diff --git a/examples/date_time_field.rs b/examples/date_time_field.rs index a5da06c9c..530e5ddf1 100644 --- a/examples/date_time_field.rs +++ b/examples/date_time_field.rs @@ -60,7 +60,7 @@ fn main() -> tantivy::Result<()> { let count_docs = searcher.search(&*query, &TopDocs::with_limit(4).order_by_score())?; assert_eq!(count_docs.len(), 1); for (_score, doc_address) in count_docs { - let retrieved_doc = searcher.doc::(doc_address)?; + let retrieved_doc = searcher.doc(doc_address)?; assert!(retrieved_doc .get_first(occurred_at) .unwrap() diff --git a/examples/faceted_search_with_tweaked_score.rs b/examples/faceted_search_with_tweaked_score.rs index d21a1c3d4..84eeb060b 100644 --- a/examples/faceted_search_with_tweaked_score.rs +++ b/examples/faceted_search_with_tweaked_score.rs @@ -65,7 +65,7 @@ fn main() -> tantivy::Result<()> { ); let top_docs_by_custom_score = // Call TopDocs with a custom tweak score - TopDocs::with_limit(2).tweak_score(move |segment_reader: &SegmentReader| { + TopDocs::with_limit(2).tweak_score(move |segment_reader: &dyn SegmentReader| { let ingredient_reader = segment_reader.facet_reader("ingredient").unwrap(); let facet_dict = ingredient_reader.facet_dict(); @@ -91,7 +91,7 @@ fn main() -> tantivy::Result<()> { .iter() .map(|(_, doc_id)| { searcher - .doc::(*doc_id) + .doc(*doc_id) .unwrap() .get_first(title) .and_then(|v| v.as_str().map(|el| el.to_string())) diff --git a/examples/iterating_docs_and_positions.rs 
b/examples/iterating_docs_and_positions.rs index 36bc4371c..abee516cd 100644 --- a/examples/iterating_docs_and_positions.rs +++ b/examples/iterating_docs_and_positions.rs @@ -91,46 +91,10 @@ fn main() -> tantivy::Result<()> { } } - // A `Term` is a text token associated with a field. - // Let's go through all docs containing the term `title:the` and access their position - let term_the = Term::from_field_text(title, "the"); - - // Some other powerful operations (especially `.skip_to`) may be useful to consume these + // Some other powerful operations (especially `.seek`) may be useful to consume these // posting lists rapidly. // You can check for them in the [`DocSet`](https://docs.rs/tantivy/~0/tantivy/trait.DocSet.html) trait // and the [`Postings`](https://docs.rs/tantivy/~0/tantivy/trait.Postings.html) trait - // Also, for some VERY specific high performance use case like an OLAP analysis of logs, - // you can get better performance by accessing directly the blocks of doc ids. - for segment_reader in searcher.segment_readers() { - // A segment contains different data structure. - // Inverted index stands for the combination of - // - the term dictionary - // - the inverted lists associated with each terms and their positions - let inverted_index = segment_reader.inverted_index(title)?; - - // This segment posting object is like a cursor over the documents matching the term. - // The `IndexRecordOption` arguments tells tantivy we will be interested in both term - // frequencies and positions. - // - // If you don't need all this information, you may get better performance by decompressing - // less information. - if let Some(mut block_segment_postings) = - inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic)? - { - loop { - let docs = block_segment_postings.docs(); - if docs.is_empty() { - break; - } - // Once again these docs MAY contains deleted documents as well. 
- let docs = block_segment_postings.docs(); - // Prints `Docs [0, 2].` - println!("Docs {docs:?}"); - block_segment_postings.advance(); - } - } - } - Ok(()) } diff --git a/examples/phrase_prefix_search.rs b/examples/phrase_prefix_search.rs index e2e1922cb..81b754fa5 100644 --- a/examples/phrase_prefix_search.rs +++ b/examples/phrase_prefix_search.rs @@ -67,7 +67,7 @@ fn main() -> Result<()> { let mut titles = top_docs .into_iter() .map(|(_score, doc_address)| { - let doc = searcher.doc::(doc_address)?; + let doc = searcher.doc(doc_address)?; let title = doc .get_first(title) .and_then(|v| v.as_str()) diff --git a/examples/snippet.rs b/examples/snippet.rs index 04edee82f..dd5e55d57 100644 --- a/examples/snippet.rs +++ b/examples/snippet.rs @@ -55,7 +55,7 @@ fn main() -> tantivy::Result<()> { let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?; for (score, doc_address) in top_docs { - let doc = searcher.doc::(doc_address)?; + let doc = searcher.doc(doc_address)?; let snippet = snippet_generator.snippet_from_doc(&doc); println!("Document score {score}:"); println!("title: {}", doc.get_first(title).unwrap().as_str().unwrap()); diff --git a/examples/warmer.rs b/examples/warmer.rs index c7543114a..53f2f5ffa 100644 --- a/examples/warmer.rs +++ b/examples/warmer.rs @@ -43,7 +43,7 @@ impl DynamicPriceColumn { } } - pub fn price_for_segment(&self, segment_reader: &SegmentReader) -> Option>> { + pub fn price_for_segment(&self, segment_reader: &dyn SegmentReader) -> Option>> { let segment_key = (segment_reader.segment_id(), segment_reader.delete_opstamp()); self.price_cache.read().unwrap().get(&segment_key).cloned() } @@ -157,7 +157,7 @@ fn main() -> tantivy::Result<()> { let query = query_parser.parse_query("cooking")?; let searcher = reader.searcher(); - let score_by_price = move |segment_reader: &SegmentReader| { + let score_by_price = move |segment_reader: &dyn SegmentReader| { let price = price_dynamic_column .price_for_segment(segment_reader) 
.unwrap(); diff --git a/src/aggregation/accessor_helpers.rs b/src/aggregation/accessor_helpers.rs index fa51041e4..22e13ac16 100644 --- a/src/aggregation/accessor_helpers.rs +++ b/src/aggregation/accessor_helpers.rs @@ -57,7 +57,7 @@ pub(crate) fn get_numeric_or_date_column_types() -> &'static [ColumnType] { /// Get fast field reader or empty as default. pub(crate) fn get_ff_reader( - reader: &SegmentReader, + reader: &dyn SegmentReader, field_name: &str, allowed_column_types: Option<&[ColumnType]>, ) -> crate::Result<(columnar::Column, ColumnType)> { @@ -74,7 +74,7 @@ pub(crate) fn get_ff_reader( } pub(crate) fn get_dynamic_columns( - reader: &SegmentReader, + reader: &dyn SegmentReader, field_name: &str, ) -> crate::Result> { let ff_fields = reader.fast_fields().dynamic_column_handles(field_name)?; @@ -90,7 +90,7 @@ pub(crate) fn get_dynamic_columns( /// /// Is guaranteed to return at least one column. pub(crate) fn get_all_ff_reader_or_empty( - reader: &SegmentReader, + reader: &dyn SegmentReader, field_name: &str, allowed_column_types: Option<&[ColumnType]>, fallback_type: ColumnType, diff --git a/src/aggregation/agg_data.rs b/src/aggregation/agg_data.rs index ffe812129..15db902ab 100644 --- a/src/aggregation/agg_data.rs +++ b/src/aggregation/agg_data.rs @@ -520,7 +520,7 @@ impl AggKind { /// Build AggregationsData by walking the request tree. 
pub(crate) fn build_aggregations_data_from_req( aggs: &Aggregations, - reader: &SegmentReader, + reader: &dyn SegmentReader, segment_ordinal: SegmentOrdinal, context: AggContextParams, ) -> crate::Result { @@ -540,7 +540,7 @@ pub(crate) fn build_aggregations_data_from_req( fn build_nodes( agg_name: &str, req: &Aggregation, - reader: &SegmentReader, + reader: &dyn SegmentReader, segment_ordinal: SegmentOrdinal, data: &mut AggregationsSegmentCtx, is_top_level: bool, @@ -787,7 +787,7 @@ fn build_nodes( let idx_in_req_data = data.push_filter_req_data(FilterAggReqData { name: agg_name.to_string(), req: filter_req.clone(), - segment_reader: reader.clone(), + segment_reader: reader.clone_arc(), evaluator, matching_docs_buffer, is_top_level, @@ -804,7 +804,7 @@ fn build_nodes( fn build_composite_node( agg_name: &str, - reader: &SegmentReader, + reader: &dyn SegmentReader, _segment_ordinal: SegmentOrdinal, data: &mut AggregationsSegmentCtx, sub_aggs: &Aggregations, @@ -833,7 +833,7 @@ fn build_composite_node( fn build_children( aggs: &Aggregations, - reader: &SegmentReader, + reader: &dyn SegmentReader, segment_ordinal: SegmentOrdinal, data: &mut AggregationsSegmentCtx, ) -> crate::Result> { @@ -852,7 +852,7 @@ fn build_children( } fn get_term_agg_accessors( - reader: &SegmentReader, + reader: &dyn SegmentReader, field_name: &str, missing: &Option, ) -> crate::Result, ColumnType)>> { @@ -905,7 +905,7 @@ fn build_terms_or_cardinality_nodes( agg_name: &str, field_name: &str, missing: &Option, - reader: &SegmentReader, + reader: &dyn SegmentReader, segment_ordinal: SegmentOrdinal, data: &mut AggregationsSegmentCtx, sub_aggs: &Aggregations, diff --git a/src/aggregation/bucket/composite/accessors.rs b/src/aggregation/bucket/composite/accessors.rs index 4bcfbed6a..b7ad75136 100644 --- a/src/aggregation/bucket/composite/accessors.rs +++ b/src/aggregation/bucket/composite/accessors.rs @@ -75,7 +75,7 @@ impl CompositeSourceAccessors { /// /// Precomputes some values to make 
collection faster. pub fn build_for_source( - reader: &SegmentReader, + reader: &dyn SegmentReader, source: &CompositeAggregationSource, // First option is None when no after key was set in the query, the // second option is None when the after key was set but its value for diff --git a/src/aggregation/bucket/filter.rs b/src/aggregation/bucket/filter.rs index 73518238a..2698b8711 100644 --- a/src/aggregation/bucket/filter.rs +++ b/src/aggregation/bucket/filter.rs @@ -1,4 +1,5 @@ use std::fmt::Debug; +use std::sync::Arc; use common::BitSet; use serde::{Deserialize, Deserializer, Serialize, Serializer}; @@ -402,7 +403,7 @@ pub struct FilterAggReqData { /// The filter aggregation pub req: FilterAggregation, /// The segment reader - pub segment_reader: SegmentReader, + pub segment_reader: Arc, /// Document evaluator for the filter query (precomputed BitSet) /// This is built once when the request data is created pub evaluator: DocumentQueryEvaluator, @@ -416,10 +417,9 @@ impl FilterAggReqData { pub(crate) fn get_memory_consumption(&self) -> usize { // Estimate: name + segment reader reference + bitset + buffer capacity self.name.len() - + std::mem::size_of::() - + self.evaluator.bitset.len() / 8 // BitSet memory (bits to bytes) - + self.matching_docs_buffer.capacity() * std::mem::size_of::() - + std::mem::size_of::() + + self.evaluator.bitset.get_memory_consumption() + + self.matching_docs_buffer.capacity() * std::mem::size_of::() + + std::mem::size_of::() } } @@ -438,7 +438,7 @@ impl DocumentQueryEvaluator { pub(crate) fn new( query: Box, schema: Schema, - segment_reader: &SegmentReader, + segment_reader: &dyn SegmentReader, ) -> crate::Result { let max_doc = segment_reader.max_doc(); diff --git a/src/aggregation/bucket/term_agg.rs b/src/aggregation/bucket/term_agg.rs index b254b79ee..16e2cbeb3 100644 --- a/src/aggregation/bucket/term_agg.rs +++ b/src/aggregation/bucket/term_agg.rs @@ -61,7 +61,7 @@ impl TermsAggReqData { + self .allowed_term_ids .as_ref() - .map(|bs| 
bs.len() / 8) + .map(|bs| bs.get_memory_consumption()) .unwrap_or(0) } } diff --git a/src/aggregation/collector.rs b/src/aggregation/collector.rs index 59e9c677d..2449b493c 100644 --- a/src/aggregation/collector.rs +++ b/src/aggregation/collector.rs @@ -66,7 +66,7 @@ impl Collector for DistributedAggregationCollector { fn for_segment( &self, segment_local_id: crate::SegmentOrdinal, - reader: &crate::SegmentReader, + reader: &dyn SegmentReader, ) -> crate::Result { AggregationSegmentCollector::from_agg_req_and_reader( &self.agg, @@ -96,7 +96,7 @@ impl Collector for AggregationCollector { fn for_segment( &self, segment_local_id: crate::SegmentOrdinal, - reader: &crate::SegmentReader, + reader: &dyn SegmentReader, ) -> crate::Result { AggregationSegmentCollector::from_agg_req_and_reader( &self.agg, @@ -145,7 +145,7 @@ impl AggregationSegmentCollector { /// reader. Also includes validation, e.g. checking field types and existence. pub fn from_agg_req_and_reader( agg: &Aggregations, - reader: &SegmentReader, + reader: &dyn SegmentReader, segment_ordinal: SegmentOrdinal, context: &AggContextParams, ) -> crate::Result { diff --git a/src/collector/count_collector.rs b/src/collector/count_collector.rs index dcd102249..419f7a5d0 100644 --- a/src/collector/count_collector.rs +++ b/src/collector/count_collector.rs @@ -43,7 +43,7 @@ impl Collector for Count { fn for_segment( &self, _: SegmentOrdinal, - _: &SegmentReader, + _: &dyn SegmentReader, ) -> crate::Result { Ok(SegmentCountCollector::default()) } diff --git a/src/collector/docset_collector.rs b/src/collector/docset_collector.rs index a27a39418..8300d7a19 100644 --- a/src/collector/docset_collector.rs +++ b/src/collector/docset_collector.rs @@ -1,7 +1,7 @@ use std::collections::HashSet; use super::{Collector, SegmentCollector}; -use crate::{DocAddress, DocId, Score}; +use crate::{DocAddress, DocId, Score, SegmentReader}; /// Collectors that returns the set of DocAddress that matches the query. 
/// @@ -15,7 +15,7 @@ impl Collector for DocSetCollector { fn for_segment( &self, segment_local_id: crate::SegmentOrdinal, - _segment: &crate::SegmentReader, + _segment: &dyn SegmentReader, ) -> crate::Result { Ok(DocSetChildCollector { segment_local_id, diff --git a/src/collector/facet_collector.rs b/src/collector/facet_collector.rs index 6eb2c3ee7..d0bca6e41 100644 --- a/src/collector/facet_collector.rs +++ b/src/collector/facet_collector.rs @@ -265,7 +265,7 @@ impl Collector for FacetCollector { fn for_segment( &self, _: SegmentOrdinal, - reader: &SegmentReader, + reader: &dyn SegmentReader, ) -> crate::Result { let facet_reader = reader.facet_reader(&self.field_name)?; let facet_dict = facet_reader.facet_dict(); diff --git a/src/collector/filter_collector_wrapper.rs b/src/collector/filter_collector_wrapper.rs index b4bada2ff..f00133a10 100644 --- a/src/collector/filter_collector_wrapper.rs +++ b/src/collector/filter_collector_wrapper.rs @@ -113,7 +113,7 @@ where fn for_segment( &self, segment_local_id: u32, - segment_reader: &SegmentReader, + segment_reader: &dyn SegmentReader, ) -> crate::Result { let column_opt = segment_reader.fast_fields().column_opt(&self.field)?; @@ -287,7 +287,7 @@ where fn for_segment( &self, segment_local_id: u32, - segment_reader: &SegmentReader, + segment_reader: &dyn SegmentReader, ) -> crate::Result { let column_opt = segment_reader.fast_fields().bytes(&self.field)?; diff --git a/src/collector/histogram_collector.rs b/src/collector/histogram_collector.rs index 51105e7b1..e5c6f3f9c 100644 --- a/src/collector/histogram_collector.rs +++ b/src/collector/histogram_collector.rs @@ -6,7 +6,7 @@ use fastdivide::DividerU64; use crate::collector::{Collector, SegmentCollector}; use crate::fastfield::{FastFieldNotAvailableError, FastValue}; use crate::schema::Type; -use crate::{DocId, Score}; +use crate::{DocId, Score, SegmentReader}; /// Histogram builds an histogram of the values of a fastfield for the /// collected DocSet. 
@@ -110,7 +110,7 @@ impl Collector for HistogramCollector { fn for_segment( &self, _segment_local_id: crate::SegmentOrdinal, - segment: &crate::SegmentReader, + segment: &dyn SegmentReader, ) -> crate::Result { let column_opt = segment.fast_fields().u64_lenient(&self.field)?; let (column, _column_type) = column_opt.ok_or_else(|| FastFieldNotAvailableError { diff --git a/src/collector/mod.rs b/src/collector/mod.rs index 0f8360d8d..2d2a8c16b 100644 --- a/src/collector/mod.rs +++ b/src/collector/mod.rs @@ -156,7 +156,7 @@ pub trait Collector: Sync + Send { fn for_segment( &self, segment_local_id: SegmentOrdinal, - segment: &SegmentReader, + segment: &dyn SegmentReader, ) -> crate::Result; /// Returns true iff the collector requires to compute scores for documents. @@ -174,7 +174,7 @@ pub trait Collector: Sync + Send { &self, weight: &dyn Weight, segment_ord: u32, - reader: &SegmentReader, + reader: &dyn SegmentReader, ) -> crate::Result<::Fruit> { let with_scoring = self.requires_scoring(); let mut segment_collector = self.for_segment(segment_ord, reader)?; @@ -186,7 +186,7 @@ pub trait Collector: Sync + Send { pub(crate) fn default_collect_segment_impl( segment_collector: &mut TSegmentCollector, weight: &dyn Weight, - reader: &SegmentReader, + reader: &dyn SegmentReader, with_scoring: bool, ) -> crate::Result<()> { match (reader.alive_bitset(), with_scoring) { @@ -255,7 +255,7 @@ impl Collector for Option { fn for_segment( &self, segment_local_id: SegmentOrdinal, - segment: &SegmentReader, + segment: &dyn SegmentReader, ) -> crate::Result { Ok(if let Some(inner) = self { let inner_segment_collector = inner.for_segment(segment_local_id, segment)?; @@ -336,7 +336,7 @@ where fn for_segment( &self, segment_local_id: u32, - segment: &SegmentReader, + segment: &dyn SegmentReader, ) -> crate::Result { let left = self.0.for_segment(segment_local_id, segment)?; let right = self.1.for_segment(segment_local_id, segment)?; @@ -407,7 +407,7 @@ where fn for_segment( &self, 
segment_local_id: u32, - segment: &SegmentReader, + segment: &dyn SegmentReader, ) -> crate::Result { let one = self.0.for_segment(segment_local_id, segment)?; let two = self.1.for_segment(segment_local_id, segment)?; @@ -487,7 +487,7 @@ where fn for_segment( &self, segment_local_id: u32, - segment: &SegmentReader, + segment: &dyn SegmentReader, ) -> crate::Result { let one = self.0.for_segment(segment_local_id, segment)?; let two = self.1.for_segment(segment_local_id, segment)?; diff --git a/src/collector/multi_collector.rs b/src/collector/multi_collector.rs index 14779c4a4..5112ce446 100644 --- a/src/collector/multi_collector.rs +++ b/src/collector/multi_collector.rs @@ -24,7 +24,7 @@ impl Collector for CollectorWrapper { fn for_segment( &self, segment_local_id: u32, - reader: &SegmentReader, + reader: &dyn SegmentReader, ) -> crate::Result> { let child = self.0.for_segment(segment_local_id, reader)?; Ok(Box::new(SegmentCollectorWrapper(child))) @@ -209,7 +209,7 @@ impl Collector for MultiCollector<'_> { fn for_segment( &self, segment_local_id: SegmentOrdinal, - segment: &SegmentReader, + segment: &dyn SegmentReader, ) -> crate::Result { let children = self .collector_wrappers diff --git a/src/collector/sort_key/order.rs b/src/collector/sort_key/order.rs index 3cac357ad..c5df7d978 100644 --- a/src/collector/sort_key/order.rs +++ b/src/collector/sort_key/order.rs @@ -5,7 +5,7 @@ use serde::{Deserialize, Serialize}; use crate::collector::{SegmentSortKeyComputer, SortKeyComputer}; use crate::schema::{OwnedValue, Schema}; -use crate::{DocId, Order, Score}; +use crate::{DocId, Order, Score, SegmentReader}; fn compare_owned_value(lhs: &OwnedValue, rhs: &OwnedValue) -> Ordering { match (lhs, rhs) { @@ -430,7 +430,7 @@ where fn segment_sort_key_computer( &self, - segment_reader: &crate::SegmentReader, + segment_reader: &dyn SegmentReader, ) -> crate::Result { let child = self.0.segment_sort_key_computer(segment_reader)?; Ok(SegmentSortKeyComputerWithComparator { @@ 
-468,7 +468,7 @@ where fn segment_sort_key_computer( &self, - segment_reader: &crate::SegmentReader, + segment_reader: &dyn SegmentReader, ) -> crate::Result { let child = self.0.segment_sort_key_computer(segment_reader)?; Ok(SegmentSortKeyComputerWithComparator { diff --git a/src/collector/sort_key/sort_by_bytes.rs b/src/collector/sort_key/sort_by_bytes.rs index f6b10af2a..b6254e0f6 100644 --- a/src/collector/sort_key/sort_by_bytes.rs +++ b/src/collector/sort_key/sort_by_bytes.rs @@ -32,7 +32,7 @@ impl SortKeyComputer for SortByBytes { fn segment_sort_key_computer( &self, - segment_reader: &crate::SegmentReader, + segment_reader: &dyn crate::SegmentReader, ) -> crate::Result { let bytes_column_opt = segment_reader.fast_fields().bytes(&self.column_name)?; Ok(ByBytesColumnSegmentSortKeyComputer { bytes_column_opt }) diff --git a/src/collector/sort_key/sort_by_erased_type.rs b/src/collector/sort_key/sort_by_erased_type.rs index 9ff4c2b40..435cc37a2 100644 --- a/src/collector/sort_key/sort_by_erased_type.rs +++ b/src/collector/sort_key/sort_by_erased_type.rs @@ -6,7 +6,7 @@ use crate::collector::sort_key::{ use crate::collector::{SegmentSortKeyComputer, SortKeyComputer}; use crate::fastfield::FastFieldNotAvailableError; use crate::schema::OwnedValue; -use crate::{DateTime, DocId, Score}; +use crate::{DateTime, DocId, Score, SegmentReader}; /// Sort by the boxed / OwnedValue representation of either a fast field, or of the score. 
/// @@ -86,7 +86,7 @@ impl SortKeyComputer for SortByErasedType { fn segment_sort_key_computer( &self, - segment_reader: &crate::SegmentReader, + segment_reader: &dyn SegmentReader, ) -> crate::Result { let inner: Box = match self { Self::Field(column_name) => { diff --git a/src/collector/sort_key/sort_by_score.rs b/src/collector/sort_key/sort_by_score.rs index a23660e56..ca1f11a02 100644 --- a/src/collector/sort_key/sort_by_score.rs +++ b/src/collector/sort_key/sort_by_score.rs @@ -1,6 +1,6 @@ use crate::collector::sort_key::NaturalComparator; use crate::collector::{SegmentSortKeyComputer, SortKeyComputer, TopNComputer}; -use crate::{DocAddress, DocId, Score}; +use crate::{DocAddress, DocId, Score, SegmentReader}; /// Sort by similarity score. #[derive(Clone, Debug, Copy)] @@ -19,7 +19,7 @@ impl SortKeyComputer for SortBySimilarityScore { fn segment_sort_key_computer( &self, - _segment_reader: &crate::SegmentReader, + _segment_reader: &dyn SegmentReader, ) -> crate::Result { Ok(SortBySimilarityScore) } @@ -29,7 +29,7 @@ impl SortKeyComputer for SortBySimilarityScore { &self, k: usize, weight: &dyn crate::query::Weight, - reader: &crate::SegmentReader, + reader: &dyn SegmentReader, segment_ord: u32, ) -> crate::Result> { let mut top_n: TopNComputer = diff --git a/src/collector/sort_key/sort_by_static_fast_value.rs b/src/collector/sort_key/sort_by_static_fast_value.rs index 44a4e1d8d..6f2e67a88 100644 --- a/src/collector/sort_key/sort_by_static_fast_value.rs +++ b/src/collector/sort_key/sort_by_static_fast_value.rs @@ -61,7 +61,7 @@ impl SortKeyComputer for SortByStaticFastValue { fn segment_sort_key_computer( &self, - segment_reader: &SegmentReader, + segment_reader: &dyn SegmentReader, ) -> crate::Result { let sort_column_opt = segment_reader.fast_fields().u64_lenient(&self.field)?; let (sort_column, _sort_column_type) = diff --git a/src/collector/sort_key/sort_by_string.rs b/src/collector/sort_key/sort_by_string.rs index 2dd0b4592..05a30c8b1 100644 --- 
a/src/collector/sort_key/sort_by_string.rs +++ b/src/collector/sort_key/sort_by_string.rs @@ -3,7 +3,7 @@ use columnar::StrColumn; use crate::collector::sort_key::NaturalComparator; use crate::collector::{SegmentSortKeyComputer, SortKeyComputer}; use crate::termdict::TermOrdinal; -use crate::{DocId, Score}; +use crate::{DocId, Score, SegmentReader}; /// Sort by the first value of a string column. /// @@ -35,7 +35,7 @@ impl SortKeyComputer for SortByString { fn segment_sort_key_computer( &self, - segment_reader: &crate::SegmentReader, + segment_reader: &dyn SegmentReader, ) -> crate::Result { let str_column_opt = segment_reader.fast_fields().str(&self.column_name)?; Ok(ByStringColumnSegmentSortKeyComputer { str_column_opt }) diff --git a/src/collector/sort_key/sort_key_computer.rs b/src/collector/sort_key/sort_key_computer.rs index 6aab919a9..35b08dc45 100644 --- a/src/collector/sort_key/sort_key_computer.rs +++ b/src/collector/sort_key/sort_key_computer.rs @@ -119,7 +119,7 @@ pub trait SortKeyComputer: Sync { &self, k: usize, weight: &dyn crate::query::Weight, - reader: &crate::SegmentReader, + reader: &dyn SegmentReader, segment_ord: u32, ) -> crate::Result> { let with_scoring = self.requires_scoring(); @@ -135,7 +135,7 @@ pub trait SortKeyComputer: Sync { } /// Builds a child sort key computer for a specific segment. 
- fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result; + fn segment_sort_key_computer(&self, segment_reader: &dyn SegmentReader) -> Result; } impl SortKeyComputer @@ -156,7 +156,7 @@ where (self.0.comparator(), self.1.comparator()) } - fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result { + fn segment_sort_key_computer(&self, segment_reader: &dyn SegmentReader) -> Result { Ok(( self.0.segment_sort_key_computer(segment_reader)?, self.1.segment_sort_key_computer(segment_reader)?, @@ -357,7 +357,7 @@ where ) } - fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result { + fn segment_sort_key_computer(&self, segment_reader: &dyn SegmentReader) -> Result { let sort_key_computer1 = self.0.segment_sort_key_computer(segment_reader)?; let sort_key_computer2 = self.1.segment_sort_key_computer(segment_reader)?; let sort_key_computer3 = self.2.segment_sort_key_computer(segment_reader)?; @@ -420,7 +420,7 @@ where SortKeyComputer4::Comparator, ); - fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result { + fn segment_sort_key_computer(&self, segment_reader: &dyn SegmentReader) -> Result { let sort_key_computer1 = self.0.segment_sort_key_computer(segment_reader)?; let sort_key_computer2 = self.1.segment_sort_key_computer(segment_reader)?; let sort_key_computer3 = self.2.segment_sort_key_computer(segment_reader)?; @@ -454,7 +454,7 @@ where impl SortKeyComputer for F where - F: 'static + Send + Sync + Fn(&SegmentReader) -> SegmentF, + F: 'static + Send + Sync + Fn(&dyn SegmentReader) -> SegmentF, SegmentF: 'static + FnMut(DocId) -> TSortKey, TSortKey: 'static + PartialOrd + Clone + Send + Sync + std::fmt::Debug, { @@ -462,7 +462,7 @@ where type Child = SegmentF; type Comparator = NaturalComparator; - fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result { + fn segment_sort_key_computer(&self, segment_reader: &dyn SegmentReader) -> Result { 
Ok((self)(segment_reader)) } } @@ -509,10 +509,10 @@ mod tests { #[test] fn test_lazy_score_computer() { - let score_computer_primary = |_segment_reader: &SegmentReader| |_doc: DocId| 200u32; + let score_computer_primary = |_segment_reader: &dyn SegmentReader| |_doc: DocId| 200u32; let call_count = Arc::new(AtomicUsize::new(0)); let call_count_clone = call_count.clone(); - let score_computer_secondary = move |_segment_reader: &SegmentReader| { + let score_computer_secondary = move |_segment_reader: &dyn SegmentReader| { let call_count_new_clone = call_count_clone.clone(); move |_doc: DocId| { call_count_new_clone.fetch_add(1, AtomicOrdering::SeqCst); @@ -572,10 +572,10 @@ mod tests { #[test] fn test_lazy_score_computer_dynamic_ordering() { - let score_computer_primary = |_segment_reader: &SegmentReader| |_doc: DocId| 200u32; + let score_computer_primary = |_segment_reader: &dyn SegmentReader| |_doc: DocId| 200u32; let call_count = Arc::new(AtomicUsize::new(0)); let call_count_clone = call_count.clone(); - let score_computer_secondary = move |_segment_reader: &SegmentReader| { + let score_computer_secondary = move |_segment_reader: &dyn SegmentReader| { let call_count_new_clone = call_count_clone.clone(); move |_doc: DocId| { call_count_new_clone.fetch_add(1, AtomicOrdering::SeqCst); diff --git a/src/collector/sort_key_top_collector.rs b/src/collector/sort_key_top_collector.rs index 9ca47581b..6995973a2 100644 --- a/src/collector/sort_key_top_collector.rs +++ b/src/collector/sort_key_top_collector.rs @@ -32,7 +32,11 @@ where TSortKeyComputer: SortKeyComputer + Send + Sync + 'static self.sort_key_computer.check_schema(schema) } - fn for_segment(&self, segment_ord: u32, segment_reader: &SegmentReader) -> Result { + fn for_segment( + &self, + segment_ord: u32, + segment_reader: &dyn SegmentReader, + ) -> Result { let segment_sort_key_computer = self .sort_key_computer .segment_sort_key_computer(segment_reader)?; @@ -63,7 +67,7 @@ where TSortKeyComputer: SortKeyComputer 
+ Send + Sync + 'static &self, weight: &dyn Weight, segment_ord: u32, - reader: &SegmentReader, + reader: &dyn SegmentReader, ) -> crate::Result> { let k = self.doc_range.end; let docs = self diff --git a/src/collector/tests.rs b/src/collector/tests.rs index 61b6a595b..ef8068124 100644 --- a/src/collector/tests.rs +++ b/src/collector/tests.rs @@ -5,7 +5,7 @@ use crate::query::{AllQuery, QueryParser}; use crate::schema::{Schema, FAST, TEXT}; use crate::time::format_description::well_known::Rfc3339; use crate::time::OffsetDateTime; -use crate::{DateTime, DocAddress, Index, Searcher, TantivyDocument}; +use crate::{DateTime, DocAddress, Index, Searcher, SegmentReader, TantivyDocument}; pub const TEST_COLLECTOR_WITH_SCORE: TestCollector = TestCollector { compute_score: true, @@ -109,7 +109,7 @@ impl Collector for TestCollector { fn for_segment( &self, segment_id: SegmentOrdinal, - _reader: &SegmentReader, + _reader: &dyn SegmentReader, ) -> crate::Result { Ok(TestSegmentCollector { segment_id, @@ -180,7 +180,7 @@ impl Collector for FastFieldTestCollector { fn for_segment( &self, _: SegmentOrdinal, - segment_reader: &SegmentReader, + segment_reader: &dyn SegmentReader, ) -> crate::Result { let reader = segment_reader .fast_fields() @@ -243,7 +243,7 @@ impl Collector for BytesFastFieldTestCollector { fn for_segment( &self, _segment_local_id: u32, - segment_reader: &SegmentReader, + segment_reader: &dyn SegmentReader, ) -> crate::Result { let column_opt = segment_reader.fast_fields().bytes(&self.field)?; Ok(BytesFastFieldSegmentCollector { diff --git a/src/collector/top_score_collector.rs b/src/collector/top_score_collector.rs index 0ce1c611a..086364853 100644 --- a/src/collector/top_score_collector.rs +++ b/src/collector/top_score_collector.rs @@ -393,7 +393,7 @@ impl TopDocs { /// // This is where we build our collector with our custom score. 
/// let top_docs_by_custom_score = TopDocs /// ::with_limit(10) - /// .tweak_score(move |segment_reader: &SegmentReader| { + /// .tweak_score(move |segment_reader: &dyn SegmentReader| { /// // The argument is a function that returns our scoring /// // function. /// // @@ -442,7 +442,7 @@ pub struct TweakScoreFn(F); impl SortKeyComputer for TweakScoreFn where - F: 'static + Send + Sync + Fn(&SegmentReader) -> TTweakScoreSortKeyFn, + F: 'static + Send + Sync + Fn(&dyn SegmentReader) -> TTweakScoreSortKeyFn, TTweakScoreSortKeyFn: 'static + Fn(DocId, Score) -> TSortKey, TweakScoreSegmentSortKeyComputer: SegmentSortKeyComputer, @@ -458,7 +458,7 @@ where fn segment_sort_key_computer( &self, - segment_reader: &SegmentReader, + segment_reader: &dyn SegmentReader, ) -> crate::Result { Ok({ TweakScoreSegmentSortKeyComputer { @@ -1525,7 +1525,7 @@ mod tests { let text_query = query_parser.parse_query("droopy tax")?; let collector = TopDocs::with_limit(2) .and_offset(1) - .order_by(move |_segment_reader: &SegmentReader| move |doc: DocId| doc); + .order_by(move |_segment_reader: &dyn SegmentReader| move |doc: DocId| doc); let score_docs: Vec<(u32, DocAddress)> = index.reader()?.searcher().search(&text_query, &collector)?; assert_eq!( @@ -1543,7 +1543,7 @@ mod tests { let text_query = query_parser.parse_query("droopy tax").unwrap(); let collector = TopDocs::with_limit(2) .and_offset(1) - .order_by(move |_segment_reader: &SegmentReader| move |doc: DocId| doc); + .order_by(move |_segment_reader: &dyn SegmentReader| move |doc: DocId| doc); let score_docs: Vec<(u32, DocAddress)> = index .reader() .unwrap() diff --git a/src/core/json_utils.rs b/src/core/json_utils.rs index 7f2094e53..8a3ac1e2b 100644 --- a/src/core/json_utils.rs +++ b/src/core/json_utils.rs @@ -4,7 +4,7 @@ use common::{replace_in_place, JsonPathWriter}; use rustc_hash::FxHashMap; use crate::indexer::indexing_term::IndexingTerm; -use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter}; +use 
crate::postings::{IndexingContext, IndexingPosition, PostingsWriter as _, PostingsWriterEnum}; use crate::schema::document::{ReferenceValue, ReferenceValueLeaf, Value}; use crate::schema::{Type, DATE_TIME_PRECISION_INDEXED}; use crate::time::format_description::well_known::Rfc3339; @@ -80,7 +80,7 @@ fn index_json_object<'a, V: Value<'a>>( text_analyzer: &mut TextAnalyzer, term_buffer: &mut IndexingTerm, json_path_writer: &mut JsonPathWriter, - postings_writer: &mut dyn PostingsWriter, + postings_writer: &mut PostingsWriterEnum, ctx: &mut IndexingContext, positions_per_path: &mut IndexingPositionsPerPath, ) { @@ -110,7 +110,7 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>( text_analyzer: &mut TextAnalyzer, term_buffer: &mut IndexingTerm, json_path_writer: &mut JsonPathWriter, - postings_writer: &mut dyn PostingsWriter, + postings_writer: &mut PostingsWriterEnum, ctx: &mut IndexingContext, positions_per_path: &mut IndexingPositionsPerPath, ) { diff --git a/src/core/mod.rs b/src/core/mod.rs index db4ab2896..6e384c16b 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -8,7 +8,7 @@ use std::path::Path; use once_cell::sync::Lazy; pub use self::executor::Executor; -pub use self::searcher::{Searcher, SearcherGeneration}; +pub use self::searcher::{Searcher, SearcherContext, SearcherGeneration}; /// The meta file contains all the information about the list of segments and the schema /// of the index. 
diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 9603d0f4f..b25760224 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -4,13 +4,13 @@ use std::{fmt, io}; use crate::collector::Collector; use crate::core::Executor; -use crate::index::{SegmentId, SegmentReader}; +use crate::index::{Index, SegmentId, SegmentReader}; use crate::query::{Bm25StatisticsProvider, EnableScoring, Query}; -use crate::schema::document::DocumentDeserialize; -use crate::schema::{Schema, Term}; +use crate::schema::{Field, FieldType, Schema, TantivyDocument, Term}; use crate::space_usage::SearcherSpaceUsage; -use crate::store::{CacheStats, StoreReader}; -use crate::{DocAddress, Index, Opstamp, TrackedObject}; +use crate::store::{CacheStats, StoreReader, DOCSTORE_CACHE_CAPACITY}; +use crate::tokenizer::{TextAnalyzer, TokenizerManager}; +use crate::{DocAddress, Inventory, Opstamp, TantivyError, TrackedObject}; /// Identifies the searcher generation accessed by a [`Searcher`]. /// @@ -36,7 +36,7 @@ pub struct SearcherGeneration { impl SearcherGeneration { pub(crate) fn from_segment_readers( - segment_readers: &[SegmentReader], + segment_readers: &[Arc], generation_id: u64, ) -> Self { let mut segment_id_to_del_opstamp = BTreeMap::new(); @@ -61,6 +61,103 @@ impl SearcherGeneration { } } +/// Search-time context required by a [`Searcher`]. +#[derive(Clone)] +pub struct SearcherContext { + schema: Schema, + executor: Executor, + tokenizers: TokenizerManager, + fast_field_tokenizers: TokenizerManager, +} + +impl SearcherContext { + /// Creates a context from explicit search-time components. + pub fn new( + schema: Schema, + executor: Executor, + tokenizers: TokenizerManager, + fast_field_tokenizers: TokenizerManager, + ) -> SearcherContext { + SearcherContext { + schema, + executor, + tokenizers, + fast_field_tokenizers, + } + } + + /// Creates a context from an index. 
+ pub fn from_index(index: &Index) -> SearcherContext { + SearcherContext::new( + index.schema(), + index.search_executor().clone(), + index.tokenizers().clone(), + index.fast_field_tokenizer().clone(), + ) + } + + /// Access the schema associated with this context. + pub fn schema(&self) -> &Schema { + &self.schema + } + + /// Access the executor associated with this context. + pub fn search_executor(&self) -> &Executor { + &self.executor + } + + /// Access the tokenizer manager associated with this context. + pub fn tokenizers(&self) -> &TokenizerManager { + &self.tokenizers + } + + /// Access the fast field tokenizer manager associated with this context. + pub fn fast_field_tokenizer(&self) -> &TokenizerManager { + &self.fast_field_tokenizers + } + + /// Get the tokenizer associated with a specific field. + pub fn tokenizer_for_field(&self, field: Field) -> crate::Result { + let field_entry = self.schema.get_field_entry(field); + let field_type = field_entry.field_type(); + let indexing_options_opt = match field_type { + FieldType::JsonObject(options) => options.get_text_indexing_options(), + FieldType::Str(options) => options.get_indexing_options(), + _ => { + return Err(TantivyError::SchemaError(format!( + "{:?} is not a text field.", + field_entry.name() + ))) + } + }; + let indexing_options = indexing_options_opt.ok_or_else(|| { + TantivyError::InvalidArgument(format!( + "No indexing options set for field {field_entry:?}" + )) + })?; + + self.tokenizers + .get(indexing_options.tokenizer()) + .ok_or_else(|| { + TantivyError::InvalidArgument(format!( + "No Tokenizer found for field {field_entry:?}" + )) + }) + } +} + +impl From<&Index> for SearcherContext { + fn from(index: &Index) -> Self { + SearcherContext::from_index(index) + } +} + +impl From for SearcherContext { + fn from(index: Index) -> Self { + SearcherContext::from(&index) + } +} + /// Holds a list of `SegmentReader`s ready for search. 
/// /// It guarantees that the `Segment` will not be removed before @@ -71,9 +168,66 @@ pub struct Searcher { } impl Searcher { - /// Returns the `Index` associated with the `Searcher` - pub fn index(&self) -> &Index { - &self.inner.index + /// Creates a `Searcher` from an arbitrary list of segment readers. + /// + /// This is useful when segment readers are not opened from + /// `IndexReader` / `meta.json` (e.g. external segment sources). + /// The generated [`SearcherGeneration`] uses `generation_id = 0`. + pub fn from_segment_readers>( + context: Ctx, + segment_readers: Vec>, + ) -> crate::Result { + Self::from_segment_readers_with_generation_id(context, segment_readers, 0) + } + + /// Same as [`Searcher::from_segment_readers`] but allows setting + /// a custom generation id. + pub fn from_segment_readers_with_generation_id>( + context: Ctx, + segment_readers: Vec>, + generation_id: u64, + ) -> crate::Result { + let context = context.into(); + let generation = SearcherGeneration::from_segment_readers(&segment_readers, generation_id); + let tracked_generation = Inventory::default().track(generation); + let inner = SearcherInner::new( + context, + segment_readers, + tracked_generation, + DOCSTORE_CACHE_CAPACITY, + )?; + Ok(Arc::new(inner).into()) + } + + /// Returns the search context associated with the `Searcher`. + pub fn context(&self) -> &SearcherContext { + &self.inner.context + } + + /// Deprecated alias for [`Searcher::context`]. + #[deprecated(note = "use Searcher::context()")] + pub fn index(&self) -> &SearcherContext { + self.context() + } + + /// Access the search executor associated with this searcher. + pub fn search_executor(&self) -> &Executor { + self.context().search_executor() + } + + /// Access the tokenizer manager associated with this searcher. + pub fn tokenizers(&self) -> &TokenizerManager { + self.context().tokenizers() + } + + /// Access the fast field tokenizer manager associated with this searcher. 
+ pub fn fast_field_tokenizer(&self) -> &TokenizerManager { + self.context().fast_field_tokenizer() + } + + /// Get the tokenizer associated with a specific field. + pub fn tokenizer_for_field(&self, field: Field) -> crate::Result { + self.context().tokenizer_for_field(field) } /// [`SearcherGeneration`] which identifies the version of the snapshot held by this `Searcher`. @@ -85,7 +239,7 @@ impl Searcher { /// /// The searcher uses the segment ordinal to route the /// request to the right `Segment`. - pub fn doc(&self, doc_address: DocAddress) -> crate::Result { + pub fn doc(&self, doc_address: DocAddress) -> crate::Result { let store_reader = &self.inner.store_readers[doc_address.segment_ord as usize]; store_reader.get(doc_address.doc_id) } @@ -105,18 +259,15 @@ impl Searcher { /// Fetches a document in an asynchronous manner. #[cfg(feature = "quickwit")] - pub async fn doc_async( - &self, - doc_address: DocAddress, - ) -> crate::Result { - let executor = self.inner.index.search_executor(); + pub async fn doc_async(&self, doc_address: DocAddress) -> crate::Result { + let executor = self.search_executor(); let store_reader = &self.inner.store_readers[doc_address.segment_ord as usize]; store_reader.get_async(doc_address.doc_id, executor).await } /// Access the schema associated with the index of this searcher. pub fn schema(&self) -> &Schema { - &self.inner.schema + self.context().schema() } /// Returns the overall number of documents in the index. 
@@ -154,13 +305,13 @@ impl Searcher { } /// Return the list of segment readers - pub fn segment_readers(&self) -> &[SegmentReader] { + pub fn segment_readers(&self) -> &[Arc] { &self.inner.segment_readers } /// Returns the segment_reader associated with the given segment_ord - pub fn segment_reader(&self, segment_ord: u32) -> &SegmentReader { - &self.inner.segment_readers[segment_ord as usize] + pub fn segment_reader(&self, segment_ord: u32) -> &dyn SegmentReader { + self.inner.segment_readers[segment_ord as usize].as_ref() } /// Runs a query on the segment readers wrapped by the searcher. @@ -201,7 +352,7 @@ impl Searcher { } else { EnableScoring::disabled_from_searcher(self) }; - let executor = self.inner.index.search_executor(); + let executor = self.search_executor(); self.search_with_executor(query, collector, executor, enabled_scoring) } @@ -229,7 +380,11 @@ impl Searcher { let segment_readers = self.segment_readers(); let fruits = executor.map( |(segment_ord, segment_reader)| { - collector.collect_segment(weight.as_ref(), segment_ord as u32, segment_reader) + collector.collect_segment( + weight.as_ref(), + segment_ord as u32, + segment_reader.as_ref(), + ) }, segment_readers.iter().enumerate(), )?; @@ -257,19 +412,17 @@ impl From> for Searcher { /// It guarantees that the `Segment` will not be removed before /// the destruction of the `Searcher`. 
pub(crate) struct SearcherInner { - schema: Schema, - index: Index, - segment_readers: Vec, - store_readers: Vec, + context: SearcherContext, + segment_readers: Vec>, + store_readers: Vec>, generation: TrackedObject, } impl SearcherInner { /// Creates a new `Searcher` pub(crate) fn new( - schema: Schema, - index: Index, - segment_readers: Vec, + context: SearcherContext, + segment_readers: Vec>, generation: TrackedObject, doc_store_cache_num_blocks: usize, ) -> io::Result { @@ -281,14 +434,13 @@ impl SearcherInner { generation.segments(), "Set of segments referenced by this Searcher and its SearcherGeneration must match" ); - let store_readers: Vec = segment_readers + let store_readers: Vec> = segment_readers .iter() .map(|segment_reader| segment_reader.get_store_reader(doc_store_cache_num_blocks)) .collect::>>()?; Ok(SearcherInner { - schema, - index, + context, segment_readers, store_readers, generation, @@ -301,7 +453,7 @@ impl fmt::Debug for Searcher { let segment_ids = self .segment_readers() .iter() - .map(SegmentReader::segment_id) + .map(|segment_reader| segment_reader.segment_id()) .collect::>(); write!(f, "Searcher({segment_ids:?})") } diff --git a/src/core/tests.rs b/src/core/tests.rs index 62baedf1d..d97e65884 100644 --- a/src/core/tests.rs +++ b/src/core/tests.rs @@ -7,8 +7,8 @@ use crate::query::TermQuery; use crate::schema::{Field, IndexRecordOption, Schema, INDEXED, STRING, TEXT}; use crate::tokenizer::TokenizerManager; use crate::{ - Directory, DocSet, Index, IndexBuilder, IndexReader, IndexSettings, IndexWriter, ReloadPolicy, - TantivyDocument, Term, + Directory, DocSet, Executor, Index, IndexBuilder, IndexReader, IndexSettings, IndexWriter, + ReloadPolicy, Searcher, SearcherContext, TantivyDocument, Term, }; #[test] @@ -300,6 +300,40 @@ fn test_single_segment_index_writer() -> crate::Result<()> { Ok(()) } +#[test] +fn test_searcher_from_external_segment_readers() -> crate::Result<()> { + let mut schema_builder = Schema::builder(); + let 
text_field = schema_builder.add_text_field("text", TEXT); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema.clone()); + let mut writer: IndexWriter = index.writer_for_tests()?; + writer.add_document(doc!(text_field => "hello"))?; + writer.add_document(doc!(text_field => "hello"))?; + writer.commit()?; + + let reader = index.reader()?; + let searcher = reader.searcher(); + let segment_readers = searcher.segment_readers().to_vec(); + let context = SearcherContext::new( + schema, + Executor::single_thread(), + TokenizerManager::default(), + TokenizerManager::default(), + ); + let custom_searcher = + Searcher::from_segment_readers_with_generation_id(context, segment_readers, 42)?; + + let term_query = TermQuery::new( + Term::from_field_text(text_field, "hello"), + IndexRecordOption::Basic, + ); + let count = custom_searcher.search(&term_query, &Count)?; + assert_eq!(count, 2); + assert_eq!(custom_searcher.generation().generation_id(), 42); + assert_eq!(custom_searcher.segment_readers().len(), 1); + Ok(()) +} + #[test] fn test_merging_segment_update_docfreq() { let mut schema_builder = Schema::builder(); diff --git a/src/directory/composite_file.rs b/src/directory/composite_file.rs index 93e063880..6da24a59b 100644 --- a/src/directory/composite_file.rs +++ b/src/directory/composite_file.rs @@ -167,7 +167,9 @@ impl CompositeFile { .map(|byte_range| self.data.slice(byte_range.clone())) } - /// Returns the space usage per field in this composite file. + /// Returns per-field byte usage for all slices stored in this composite file. + /// + /// The provided `schema` is used to resolve field ids into field names. 
pub fn space_usage(&self, schema: &Schema) -> PerFieldSpaceUsage { let mut fields = Vec::new(); for (&field_addr, byte_range) in &self.offsets_index { diff --git a/src/docset.rs b/src/docset.rs index 8e72281d2..8b8985004 100644 --- a/src/docset.rs +++ b/src/docset.rs @@ -1,4 +1,7 @@ -use std::borrow::{Borrow, BorrowMut}; +use std::borrow::BorrowMut; +use std::ops::{Deref as _, DerefMut as _}; + +use common::BitSet; use crate::fastfield::AliveBitSet; use crate::DocId; @@ -130,6 +133,19 @@ pub trait DocSet: Send { buffer.len() } + /// Fills the given bitset with the documents in the docset. + /// + /// If the bitset `max_value` is not greater than the largest doc, this function might not + /// consume the docset entirely. + fn fill_bitset(&mut self, bitset: &mut BitSet) { + let bitset_max_value: u32 = bitset.max_value(); + let mut doc = self.doc(); + while doc < bitset_max_value { + bitset.insert(doc); + doc = self.advance(); + } + } + /// Returns the current document /// Right after creating a new `DocSet`, the docset points to the first document.
/// @@ -233,51 +249,59 @@ impl DocSet for &mut dyn DocSet { fn count_including_deleted(&mut self) -> u32 { (**self).count_including_deleted() } + + fn fill_bitset(&mut self, bitset: &mut BitSet) { + (**self).fill_bitset(bitset); + } } impl DocSet for Box { + #[inline] fn advance(&mut self) -> DocId { - let unboxed: &mut TDocSet = self.borrow_mut(); - unboxed.advance() + self.deref_mut().advance() } + #[inline] fn seek(&mut self, target: DocId) -> DocId { - let unboxed: &mut TDocSet = self.borrow_mut(); - unboxed.seek(target) + self.deref_mut().seek(target) } + #[inline] fn seek_danger(&mut self, target: DocId) -> SeekDangerResult { let unboxed: &mut TDocSet = self.borrow_mut(); unboxed.seek_danger(target) } + #[inline] fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize { - let unboxed: &mut TDocSet = self.borrow_mut(); - unboxed.fill_buffer(buffer) + self.deref_mut().fill_buffer(buffer) } + #[inline] fn doc(&self) -> DocId { - let unboxed: &TDocSet = self.borrow(); - unboxed.doc() + self.deref().doc() } + #[inline] fn size_hint(&self) -> u32 { - let unboxed: &TDocSet = self.borrow(); - unboxed.size_hint() + self.deref().size_hint() } + #[inline] fn cost(&self) -> u64 { - let unboxed: &TDocSet = self.borrow(); - unboxed.cost() + self.deref().cost() } + #[inline] fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 { - let unboxed: &mut TDocSet = self.borrow_mut(); - unboxed.count(alive_bitset) + self.deref_mut().count(alive_bitset) } fn count_including_deleted(&mut self) -> u32 { - let unboxed: &mut TDocSet = self.borrow_mut(); - unboxed.count_including_deleted() + self.deref_mut().count_including_deleted() + } + + fn fill_bitset(&mut self, bitset: &mut BitSet) { + self.deref_mut().fill_bitset(bitset); } } diff --git a/src/fastfield/facet_reader.rs b/src/fastfield/facet_reader.rs index b93cff20b..96e47f256 100644 --- a/src/fastfield/facet_reader.rs +++ b/src/fastfield/facet_reader.rs @@ -84,9 +84,7 @@ mod tests { let mut facet = 
Facet::default(); facet_reader.facet_from_ord(0, &mut facet).unwrap(); assert_eq!(facet.to_path_string(), "/a/b"); - let doc = searcher - .doc::(DocAddress::new(0u32, 0u32)) - .unwrap(); + let doc = searcher.doc(DocAddress::new(0u32, 0u32)).unwrap(); let value = doc .get_first(facet_field) .and_then(|v| v.as_value().as_facet()); @@ -145,7 +143,7 @@ mod tests { let mut facet_ords = Vec::new(); facet_ords.extend(facet_reader.facet_ords(0u32)); assert_eq!(&facet_ords, &[0u64]); - let doc = searcher.doc::(DocAddress::new(0u32, 0u32))?; + let doc = searcher.doc(DocAddress::new(0u32, 0u32))?; let value: Option = doc .get_first(facet_field) .and_then(|v| v.as_facet()) diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index aca53c212..4478c6864 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -96,7 +96,7 @@ mod tests { }; use crate::time::OffsetDateTime; use crate::tokenizer::{LowerCaser, RawTokenizer, TextAnalyzer, TokenizerManager}; - use crate::{Index, IndexWriter, SegmentReader}; + use crate::{Index, IndexWriter}; pub static SCHEMA: Lazy = Lazy::new(|| { let mut schema_builder = Schema::builder(); @@ -430,7 +430,7 @@ mod tests { .searcher() .segment_readers() .iter() - .map(SegmentReader::segment_id) + .map(|segment_reader| segment_reader.segment_id()) .collect(); assert_eq!(segment_ids.len(), 2); index_writer.merge(&segment_ids[..]).wait().unwrap(); diff --git a/src/fastfield/readers.rs b/src/fastfield/readers.rs index 083f79532..c98683528 100644 --- a/src/fastfield/readers.rs +++ b/src/fastfield/readers.rs @@ -25,7 +25,8 @@ pub struct FastFieldReaders { } impl FastFieldReaders { - pub(crate) fn open(fast_field_file: FileSlice, schema: Schema) -> io::Result { + /// Opens the segment fast-field container and binds it to a schema. 
+ pub fn open(fast_field_file: FileSlice, schema: Schema) -> io::Result { let columnar = Arc::new(ColumnarReader::open(fast_field_file)?); Ok(FastFieldReaders { columnar, schema }) } @@ -39,7 +40,8 @@ impl FastFieldReaders { self.resolve_column_name_given_default_field(column_name, default_field_opt) } - pub(crate) fn space_usage(&self) -> io::Result { + /// Returns per-field space usage for all loaded fast-field columns. + pub fn space_usage(&self) -> io::Result { let mut per_field_usages: Vec = Default::default(); for (mut field_name, column_handle) in self.columnar.iter_columns()? { json_path_sep_to_dot(&mut field_name); @@ -51,7 +53,8 @@ impl FastFieldReaders { Ok(PerFieldSpaceUsage::new(per_field_usages)) } - pub(crate) fn columnar(&self) -> &ColumnarReader { + /// Returns the underlying `ColumnarReader`. + pub fn columnar(&self) -> &ColumnarReader { self.columnar.as_ref() } diff --git a/src/index/index.rs b/src/index/index.rs index 5495ddced..462c0b8fb 100644 --- a/src/index/index.rs +++ b/src/index/index.rs @@ -3,11 +3,12 @@ use std::fmt; #[cfg(feature = "mmap")] use std::path::Path; use std::path::PathBuf; +use std::sync::Arc; use std::thread::available_parallelism; use super::segment::Segment; use super::segment_reader::merge_field_meta_data; -use super::{FieldMetadata, IndexSettings}; +use super::{FieldMetadata, IndexSettings, TantivySegmentReader}; use crate::core::{Executor, META_FILEPATH}; use crate::directory::error::OpenReadError; #[cfg(feature = "mmap")] @@ -24,7 +25,6 @@ use crate::reader::{IndexReader, IndexReaderBuilder}; use crate::schema::document::Document; use crate::schema::{Field, FieldType, Schema}; use crate::tokenizer::{TextAnalyzer, TokenizerManager}; -use crate::SegmentReader; fn load_metas( directory: &dyn Directory, @@ -244,9 +244,12 @@ impl IndexBuilder { /// Creates a new index given an implementation of the trait `Directory`. /// /// If a directory previously existed, it will be erased. 
- fn create>>(self, dir: T) -> crate::Result { + pub fn create>>(self, dir: T) -> crate::Result { + self.create_avoid_monomorphization(dir.into()) + } + + fn create_avoid_monomorphization(self, dir: Box) -> crate::Result { self.validate()?; - let dir = dir.into(); let directory = ManagedDirectory::wrap(dir)?; save_new_metas( self.get_expect_schema()?, @@ -255,7 +258,7 @@ impl IndexBuilder { )?; let mut metas = IndexMeta::with_schema(self.get_expect_schema()?); metas.index_settings = self.index_settings; - let mut index = Index::open_from_metas(directory, &metas, SegmentMetaInventory::default()); + let mut index = Index::open_from_metas(directory, &metas, SegmentMetaInventory::default())?; index.set_tokenizers(self.tokenizer_manager); index.set_fast_field_tokenizers(self.fast_field_tokenizer_manager); Ok(index) @@ -381,9 +384,9 @@ impl Index { directory: ManagedDirectory, metas: &IndexMeta, inventory: SegmentMetaInventory, - ) -> Index { + ) -> crate::Result { let schema = metas.schema.clone(); - Index { + Ok(Index { settings: metas.index_settings.clone(), directory, schema, @@ -391,7 +394,7 @@ impl Index { fast_field_tokenizers: TokenizerManager::default(), executor: Executor::single_thread(), inventory, - } + }) } /// Setter for the tokenizer manager. 
@@ -492,7 +495,16 @@ impl Index { let segments = self.searchable_segments()?; let fields_metadata: Vec> = segments .into_iter() - .map(|segment| SegmentReader::open(&segment)?.fields_metadata()) + .map(|segment| { + let reader = TantivySegmentReader::open_with_custom_alive_set_from_directory( + segment.index().directory(), + segment.meta(), + segment.schema(), + None, + )?; + let reader: Arc = Arc::new(reader); + reader.fields_metadata() + }) .collect::>()?; Ok(merge_field_meta_data(fields_metadata)) } @@ -512,8 +524,7 @@ impl Index { let directory = ManagedDirectory::wrap(directory)?; let inventory = SegmentMetaInventory::default(); let metas = load_metas(&directory, &inventory)?; - let index = Index::open_from_metas(directory, &metas, inventory); - Ok(index) + Index::open_from_metas(directory, &metas, inventory) } /// Reads the index meta file from the directory. diff --git a/src/index/index_meta.rs b/src/index/index_meta.rs index 8c7983116..bb4d79064 100644 --- a/src/index/index_meta.rs +++ b/src/index/index_meta.rs @@ -287,7 +287,6 @@ pub struct IndexMeta { #[serde(skip_serializing_if = "Option::is_none")] pub payload: Option, } - #[derive(Deserialize, Debug)] struct UntrackedIndexMeta { pub segments: Vec, @@ -379,13 +378,36 @@ mod tests { opstamp: 0u64, payload: None, }; - let json = serde_json::ser::to_string(&index_metas).expect("serialization failed"); + let json_value: serde_json::Value = + serde_json::to_value(&index_metas).expect("serialization failed"); assert_eq!( - json, - r#"{"index_settings":{"docstore_compression":"none","docstore_blocksize":16384},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"# + &json_value, + &serde_json::json!( + { + "index_settings": { + "docstore_compression": "none", + "docstore_blocksize": 16384 + }, + "segments": [], + "schema": [ + { + "name": "text", + "type": "text", + "options": { + 
"indexing": { + "record": "position", + "fieldnorms": true, + "tokenizer": "default" + }, + "stored": false, + "fast": false + } + } + ], + "opstamp": 0 + }) ); - - let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap(); + let deser_meta: UntrackedIndexMeta = serde_json::from_value(json_value).unwrap(); assert_eq!(index_metas.index_settings, deser_meta.index_settings); assert_eq!(index_metas.schema, deser_meta.schema); assert_eq!(index_metas.opstamp, deser_meta.opstamp); @@ -412,13 +434,37 @@ mod tests { opstamp: 0u64, payload: None, }; - let json = serde_json::ser::to_string(&index_metas).expect("serialization failed"); + let json_value = serde_json::to_value(&index_metas).expect("serialization failed"); assert_eq!( - json, - r#"{"index_settings":{"docstore_compression":"zstd(compression_level=4)","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"# + &json_value, + &serde_json::json!( + { + "index_settings": { + "docstore_compression": "zstd(compression_level=4)", + "docstore_blocksize": 1000000 + }, + "segments": [], + "schema": [ + { + "name": "text", + "type": "text", + "options": { + "indexing": { + "record": "position", + "fieldnorms": true, + "tokenizer": "default" + }, + "stored": false, + "fast": false + } + } + ], + "opstamp": 0 + } + ) ); - let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap(); + let deser_meta: UntrackedIndexMeta = serde_json::from_value(json_value).unwrap(); assert_eq!(index_metas.index_settings, deser_meta.index_settings); assert_eq!(index_metas.schema, deser_meta.schema); assert_eq!(index_metas.opstamp, deser_meta.opstamp); diff --git a/src/index/inverted_index_reader.rs b/src/index/inverted_index_reader.rs index 7314f8741..8a9e38f48 100644 --- a/src/index/inverted_index_reader.rs +++ b/src/index/inverted_index_reader.rs @@ -1,7 +1,12 @@ 
+use std::any::Any; +#[cfg(feature = "quickwit")] +use std::future::Future; use std::io; +#[cfg(feature = "quickwit")] +use std::pin::Pin; use common::json_path_writer::JSON_END_OF_PATH; -use common::{BinarySerializable, ByteCount}; +use common::{BinarySerializable, BitSet, ByteCount, OwnedBytes}; #[cfg(feature = "quickwit")] use futures_util::{FutureExt, StreamExt, TryStreamExt}; #[cfg(feature = "quickwit")] @@ -10,37 +15,262 @@ use itertools::Itertools; use tantivy_fst::automaton::{AlwaysMatch, Automaton}; use crate::directory::FileSlice; -use crate::positions::PositionReader; -use crate::postings::{BlockSegmentPostings, SegmentPostings, TermInfo}; +use crate::docset::DocSet; +use crate::postings::{ + load_postings_from_raw_data, Postings, RawPostingsData, SegmentPostings, TermInfo, +}; use crate::schema::{IndexRecordOption, Term, Type}; use crate::termdict::TermDictionary; +#[cfg(feature = "quickwit")] +pub type TermRangeBounds = (std::ops::Bound, std::ops::Bound); + +/// Trait defining the contract for a dynamically dispatched inverted index reader. +pub trait DynInvertedIndexReader: Send + Sync { + /// Downcasts to the concrete reader type when possible. + fn as_any(&self) -> &dyn Any; + + /// Returns the term info associated with the term. + fn get_term_info(&self, term: &Term) -> io::Result> { + self.terms().get(term.serialized_value_bytes()) + } + + /// Return the term dictionary datastructure. + fn terms(&self) -> &TermDictionary; + + /// Return the fields and types encoded in the dictionary in lexicographic order. + /// Only valid on JSON fields. + /// + /// Notice: This requires a full scan and therefore **very expensive**. + fn list_encoded_json_fields(&self) -> io::Result>; + + /// Returns the raw postings bytes and metadata for a term. + fn read_raw_postings_data( + &self, + term_info: &TermInfo, + option: IndexRecordOption, + ) -> io::Result; + + /// Returns the total number of tokens recorded for all documents + /// (including deleted documents). 
+ fn total_num_tokens(&self) -> u64; + + /// Returns the segment postings associated with the term, and with the given option, + /// or `None` if the term has never been encountered and indexed. + fn read_postings( + &self, + term: &Term, + option: IndexRecordOption, + ) -> io::Result>> { + self.get_term_info(term)? + .map(move |term_info| self.read_postings_from_terminfo(&term_info, option)) + .transpose() + } + + /// Returns the postings for a given `TermInfo`. + /// + /// The default implementation decodes via [`read_raw_postings_data`]. Custom readers + /// that cannot produce valid raw postings bytes (e.g. merged/union posting sources) + /// should override this method. + fn read_postings_from_terminfo( + &self, + term_info: &TermInfo, + option: IndexRecordOption, + ) -> io::Result> { + let postings_data = self.read_raw_postings_data(term_info, option)?; + let postings = load_postings_from_raw_data(term_info.doc_freq, postings_data)?; + Ok(Box::new(postings)) + } + + /// Returns the number of documents containing the term. + fn doc_freq(&self, term: &Term) -> io::Result; + + /// Returns the number of documents containing the term asynchronously. + #[cfg(feature = "quickwit")] + fn doc_freq_async<'a>( + &'a self, + term: &'a Term, + ) -> Pin> + Send + 'a>>; + + /// Warmup fieldnorm readers for this inverted index field. + #[cfg(feature = "quickwit")] + fn warm_fieldnorms_readers<'a>( + &'a self, + ) -> Pin> + Send + 'a>>; + + /// Warmup the block postings for all terms. + /// + /// Default implementation is a no-op. + #[cfg(feature = "quickwit")] + fn warm_postings_full<'a>( + &'a self, + _with_positions: bool, + ) -> Pin> + Send + 'a>> { + Box::pin(async { Ok(()) }) + } + + /// Warmup a block postings given a `Term`. + /// + /// Returns whether the term was found in the dictionary. 
+ #[cfg(feature = "quickwit")] + fn warm_postings<'a>( + &'a self, + term: &'a Term, + with_positions: bool, + ) -> Pin> + Send + 'a>>; + + /// Warmup block postings for terms in a range. + /// + /// Returns whether at least one matching term was found. + #[cfg(feature = "quickwit")] + fn warm_postings_range<'a>( + &'a self, + terms: TermRangeBounds, + limit: Option, + with_positions: bool, + ) -> Pin> + Send + 'a>>; + + /// Warmup block postings for terms matching an automaton. + /// + /// Returns whether at least one matching term was found. + #[cfg(feature = "quickwit")] + fn warm_postings_automaton<'a, A: Automaton + Clone + Send + Sync + 'static>( + &'a self, + automaton: A, + ) -> Pin> + Send + 'a>> + where + A::State: Clone + Send, + Self: Sized; +} + +/// Trait defining the contract for a typed inverted index reader. +pub trait InvertedIndexReader: Send + Sync { + /// The concrete postings type returned by this reader. + type Postings: Postings; + + /// A lighter doc-id-only iterator returned when frequencies and positions are not needed. + type DocSet: DocSet; + + /// Returns a posting object given a `term_info`. + fn read_postings_from_terminfo( + &self, + term_info: &TermInfo, + option: IndexRecordOption, + ) -> io::Result; + + /// Returns a doc-id-only iterator for the given term. + /// + /// Always reads with `IndexRecordOption::Basic` — no frequency decoding, + /// no position reader. + fn read_docset_from_terminfo(&self, term_info: &TermInfo) -> io::Result; + + /// Fills a bitset with the doc ids for the given term. 
+ fn fill_bitset_from_terminfo( + &self, + term_info: &TermInfo, + doc_bitset: &mut BitSet, + ) -> io::Result<()> { + let mut docset = self.read_docset_from_terminfo(term_info)?; + docset.fill_bitset(doc_bitset); + Ok(()) + } +} + +impl InvertedIndexReader for dyn DynInvertedIndexReader + '_ { + type Postings = Box; + type DocSet = Box; + + fn read_postings_from_terminfo( + &self, + term_info: &TermInfo, + option: IndexRecordOption, + ) -> io::Result { + DynInvertedIndexReader::read_postings_from_terminfo(self, term_info, option) + } + + fn read_docset_from_terminfo(&self, term_info: &TermInfo) -> io::Result { + DynInvertedIndexReader::read_postings_from_terminfo( + self, + term_info, + IndexRecordOption::Basic, + ) + } +} + +/// Handler interface used by [`try_downcast_and_call`] to build query objects. +pub trait TypedInvertedIndexReaderCb { + /// Invokes the handler with either Tantivy's built-in typed reader or the dynamic fallback. + fn call(&mut self, reader: &I) -> R; +} + +/// Tries Tantivy's built-in reader downcast before falling back to the dynamic reader path. 
+pub fn try_downcast_and_call(reader: &dyn DynInvertedIndexReader, handler: &mut C) -> R +where C: TypedInvertedIndexReaderCb { + if let Some(reader) = reader.as_any().downcast_ref::() { + return handler.call(reader); + } + handler.call(reader) +} + +struct LoadPostingsFromTermInfo<'a> { + term_info: &'a TermInfo, + option: IndexRecordOption, +} + +impl TypedInvertedIndexReaderCb>> for LoadPostingsFromTermInfo<'_> { + fn call( + &mut self, + reader: &I, + ) -> io::Result> { + let postings = reader.read_postings_from_terminfo(self.term_info, self.option)?; + Ok(Box::new(postings)) + } +} + +pub(crate) fn load_postings_from_terminfo( + reader: &dyn DynInvertedIndexReader, + term_info: &TermInfo, + option: IndexRecordOption, +) -> io::Result> { + let mut postings_loader = LoadPostingsFromTermInfo { term_info, option }; + try_downcast_and_call(reader, &mut postings_loader) +} + +/// Tantivy's default inverted index reader implementation. +/// /// The inverted index reader is in charge of accessing /// the inverted index associated with a specific field. /// /// # Note /// /// It is safe to delete the segment associated with -/// an `InvertedIndexReader`. As long as it is open, +/// an `InvertedIndexReader` implementation. As long as it is open, /// the [`FileSlice`] it is relying on should /// stay available. /// -/// `InvertedIndexReader` are created by calling +/// `TantivyInvertedIndexReader` instances are created by calling /// [`SegmentReader::inverted_index()`](crate::SegmentReader::inverted_index). -pub struct InvertedIndexReader { +pub struct TantivyInvertedIndexReader { termdict: TermDictionary, postings_file_slice: FileSlice, positions_file_slice: FileSlice, + #[cfg_attr(not(feature = "quickwit"), allow(dead_code))] + fieldnorms_file_slice: FileSlice, record_option: IndexRecordOption, total_num_tokens: u64, } /// Object that records the amount of space used by a field in an inverted index. 
-pub(crate) struct InvertedIndexFieldSpace { +pub struct InvertedIndexFieldSpace { + /// Field name as encoded in the term dictionary. pub field_name: String, + /// Value type for the encoded field. pub field_type: Type, + /// Total bytes used by postings for this field. pub postings_size: ByteCount, + /// Total bytes used by positions for this field. pub positions_size: ByteCount, + /// Number of terms in the field. pub num_terms: u64, } @@ -62,52 +292,81 @@ impl InvertedIndexFieldSpace { } } -impl InvertedIndexReader { - pub(crate) fn new( +impl TantivyInvertedIndexReader { + pub(crate) fn read_raw_postings_data_inner( + &self, + term_info: &TermInfo, + option: IndexRecordOption, + ) -> io::Result { + let effective_option = option.downgrade(self.record_option); + let postings_data = self + .postings_file_slice + .slice(term_info.postings_range.clone()) + .read_bytes()?; + let positions_data: Option = if effective_option.has_positions() { + let positions_data = self + .positions_file_slice + .slice(term_info.positions_range.clone()) + .read_bytes()?; + Some(positions_data) + } else { + None + }; + Ok(RawPostingsData { + postings_data, + positions_data, + record_option: self.record_option, + effective_option, + }) + } + + /// Opens an inverted index reader from already-loaded term/postings/positions slices. + /// + /// The first 8 bytes of `postings_file_slice` are expected to contain + /// the serialized total token count. 
+ pub fn new( termdict: TermDictionary, postings_file_slice: FileSlice, positions_file_slice: FileSlice, + fieldnorms_file_slice: FileSlice, record_option: IndexRecordOption, - ) -> io::Result { + ) -> io::Result { let (total_num_tokens_slice, postings_body) = postings_file_slice.split(8); let total_num_tokens = u64::deserialize(&mut total_num_tokens_slice.read_bytes()?)?; - Ok(InvertedIndexReader { + Ok(TantivyInvertedIndexReader { termdict, postings_file_slice: postings_body, positions_file_slice, + fieldnorms_file_slice, record_option, total_num_tokens, }) } - /// Creates an empty `InvertedIndexReader` object, which + /// Creates an empty `TantivyInvertedIndexReader` object, which /// contains no terms at all. - pub fn empty(record_option: IndexRecordOption) -> InvertedIndexReader { - InvertedIndexReader { + pub fn empty(record_option: IndexRecordOption) -> TantivyInvertedIndexReader { + TantivyInvertedIndexReader { termdict: TermDictionary::empty(), postings_file_slice: FileSlice::empty(), positions_file_slice: FileSlice::empty(), + fieldnorms_file_slice: FileSlice::empty(), record_option, total_num_tokens: 0u64, } } +} - /// Returns the term info associated with the term. - pub fn get_term_info(&self, term: &Term) -> io::Result> { - self.termdict.get(term.serialized_value_bytes()) +impl DynInvertedIndexReader for TantivyInvertedIndexReader { + fn as_any(&self) -> &dyn Any { + self } - /// Return the term dictionary datastructure. - pub fn terms(&self) -> &TermDictionary { + fn terms(&self) -> &TermDictionary { &self.termdict } - /// Return the fields and types encoded in the dictionary in lexicographic order. - /// Only valid on JSON fields. - /// - /// Notice: This requires a full scan and therefore **very expensive**. - /// TODO: Move to sstable to use the index. 
- pub(crate) fn list_encoded_json_fields(&self) -> io::Result> { + fn list_encoded_json_fields(&self) -> io::Result> { let mut stream = self.termdict.stream()?; let mut fields: Vec = Vec::new(); @@ -160,136 +419,353 @@ impl InvertedIndexReader { Ok(fields) } - /// Resets the block segment to another position of the postings - /// file. - /// - /// This is useful for enumerating through a list of terms, - /// and consuming the associated posting lists while avoiding - /// reallocating a [`BlockSegmentPostings`]. - /// - /// # Warning - /// - /// This does not reset the positions list. - pub fn reset_block_postings_from_terminfo( - &self, - term_info: &TermInfo, - block_postings: &mut BlockSegmentPostings, - ) -> io::Result<()> { - let postings_slice = self - .postings_file_slice - .slice(term_info.postings_range.clone()); - let postings_bytes = postings_slice.read_bytes()?; - block_postings.reset(term_info.doc_freq, postings_bytes)?; - Ok(()) - } - - /// Returns a block postings given a `Term`. - /// This method is for an advanced usage only. - /// - /// Most users should prefer using [`Self::read_postings()`] instead. - pub fn read_block_postings( - &self, - term: &Term, - option: IndexRecordOption, - ) -> io::Result> { - self.get_term_info(term)? - .map(move |term_info| self.read_block_postings_from_terminfo(&term_info, option)) - .transpose() - } - - /// Returns a block postings given a `term_info`. - /// This method is for an advanced usage only. - /// - /// Most users should prefer using [`Self::read_postings()`] instead. - pub fn read_block_postings_from_terminfo( - &self, - term_info: &TermInfo, - requested_option: IndexRecordOption, - ) -> io::Result { - let postings_data = self - .postings_file_slice - .slice(term_info.postings_range.clone()); - BlockSegmentPostings::open( - term_info.doc_freq, - postings_data, - self.record_option, - requested_option, - ) - } - - /// Returns a posting object given a `term_info`. 
- /// This method is for an advanced usage only. - /// - /// Most users should prefer using [`Self::read_postings()`] instead. - pub fn read_postings_from_terminfo( + fn read_raw_postings_data( &self, term_info: &TermInfo, option: IndexRecordOption, - ) -> io::Result { - let option = option.downgrade(self.record_option); - - let block_postings = self.read_block_postings_from_terminfo(term_info, option)?; - let position_reader = { - if option.has_positions() { - let positions_data = self - .positions_file_slice - .read_bytes_slice(term_info.positions_range.clone())?; - let position_reader = PositionReader::open(positions_data)?; - Some(position_reader) - } else { - None - } - }; - Ok(SegmentPostings::from_block_postings( - block_postings, - position_reader, - )) + ) -> io::Result { + self.read_raw_postings_data_inner(term_info, option) } - /// Returns the total number of tokens recorded for all documents - /// (including deleted documents). - pub fn total_num_tokens(&self) -> u64 { + fn total_num_tokens(&self) -> u64 { self.total_num_tokens } - /// Returns the segment postings associated with the term, and with the given option, - /// or `None` if the term has never been encountered and indexed. - /// - /// If the field was not indexed with the indexing options that cover - /// the requested options, the returned [`SegmentPostings`] the method does not fail - /// and returns a `SegmentPostings` with as much information as possible. - /// - /// For instance, requesting [`IndexRecordOption::WithFreqs`] for a - /// [`TextOptions`](crate::schema::TextOptions) that does not index position - /// will return a [`SegmentPostings`] with `DocId`s and frequencies. - pub fn read_postings( - &self, - term: &Term, - option: IndexRecordOption, - ) -> io::Result> { - self.get_term_info(term)? - .map(move |term_info| self.read_postings_from_terminfo(&term_info, option)) - .transpose() - } - - /// Returns the number of documents containing the term. 
- pub fn doc_freq(&self, term: &Term) -> io::Result { + fn doc_freq(&self, term: &Term) -> io::Result { Ok(self .get_term_info(term)? .map(|term_info| term_info.doc_freq) .unwrap_or(0u32)) } + + #[cfg(feature = "quickwit")] + fn doc_freq_async<'a>( + &'a self, + term: &'a Term, + ) -> Pin> + Send + 'a>> { + Box::pin(async move { + Ok(self + .get_term_info_async(term) + .await? + .map(|term_info| term_info.doc_freq) + .unwrap_or(0u32)) + }) + } + + #[cfg(feature = "quickwit")] + fn warm_fieldnorms_readers<'a>( + &'a self, + ) -> Pin> + Send + 'a>> { + Box::pin(async move { + self.fieldnorms_file_slice.read_bytes_async().await?; + Ok(()) + }) + } + + #[cfg(feature = "quickwit")] + fn warm_postings_full<'a>( + &'a self, + with_positions: bool, + ) -> Pin> + Send + 'a>> { + Box::pin(async move { + self.postings_file_slice.read_bytes_async().await?; + if with_positions { + self.positions_file_slice.read_bytes_async().await?; + } + Ok(()) + }) + } + + #[cfg(feature = "quickwit")] + fn warm_postings<'a>( + &'a self, + term: &'a Term, + with_positions: bool, + ) -> Pin> + Send + 'a>> { + Box::pin(async move { + let term_info_opt: Option = self.get_term_info_async(term).await?; + if let Some(term_info) = term_info_opt { + let postings = self + .postings_file_slice + .read_bytes_slice_async(term_info.postings_range.clone()); + if with_positions { + let positions = self + .positions_file_slice + .read_bytes_slice_async(term_info.positions_range.clone()); + futures_util::future::try_join(postings, positions).await?; + } else { + postings.await?; + } + Ok(true) + } else { + Ok(false) + } + }) + } + + #[cfg(feature = "quickwit")] + fn warm_postings_range<'a>( + &'a self, + terms: TermRangeBounds, + limit: Option, + with_positions: bool, + ) -> Pin> + Send + 'a>> { + Box::pin(async move { + let mut term_info = self + .get_term_range_async(terms, AlwaysMatch, limit, 0) + .await?; + + let Some(first_terminfo) = term_info.next() else { + // no key matches, nothing more to load + 
return Ok(false); + }; + + let last_terminfo = term_info.last().unwrap_or_else(|| first_terminfo.clone()); + + let postings_range = + first_terminfo.postings_range.start..last_terminfo.postings_range.end; + let positions_range = + first_terminfo.positions_range.start..last_terminfo.positions_range.end; + + let postings = self + .postings_file_slice + .read_bytes_slice_async(postings_range); + if with_positions { + let positions = self + .positions_file_slice + .read_bytes_slice_async(positions_range); + futures_util::future::try_join(postings, positions).await?; + } else { + postings.await?; + } + Ok(true) + }) + } + + #[cfg(feature = "quickwit")] + fn warm_postings_automaton<'a, A: Automaton + Clone + Send + Sync + 'static>( + &'a self, + automaton: A, + ) -> Pin> + Send + 'a>> + where + A::State: Clone + Send, + Self: Sized, + { + Box::pin(async move { + // merge holes under 4MiB, that's how many bytes we can hope to receive during a TTFB + // from S3 (~80MiB/s, and 50ms latency) + const MERGE_HOLES_UNDER_BYTES: usize = (80 * 1024 * 1024 * 50) / 1000; + // Trigger async prefetch of relevant termdict blocks. + let _term_info_iter = self + .get_term_range_async( + (std::ops::Bound::Unbounded, std::ops::Bound::Unbounded), + automaton.clone(), + None, + MERGE_HOLES_UNDER_BYTES, + ) + .await?; + drop(_term_info_iter); + + // Build a 2nd stream without merged holes so we only scan matching blocks. + // This assumes the storage layer caches data fetched by the first pass. 
+ let mut stream = self.termdict.search(automaton).into_stream()?; + let posting_ranges_iter = + std::iter::from_fn(move || stream.next().map(|(_k, v)| v.postings_range.clone())); + let merged_posting_ranges: Vec> = posting_ranges_iter + .coalesce(|range1, range2| { + if range1.end + MERGE_HOLES_UNDER_BYTES >= range2.start { + Ok(range1.start..range2.end) + } else { + Err((range1, range2)) + } + }) + .collect(); + + if merged_posting_ranges.is_empty() { + return Ok(false); + } + + let slices_downloaded = futures_util::stream::iter(merged_posting_ranges.into_iter()) + .map(|posting_slice| { + self.postings_file_slice + .read_bytes_slice_async(posting_slice) + .map(|result| result.map(|_slice| ())) + }) + .buffer_unordered(5) + .try_collect::>() + .await?; + + Ok(!slices_downloaded.is_empty()) + }) + } +} + +impl InvertedIndexReader for TantivyInvertedIndexReader { + type Postings = SegmentPostings; + type DocSet = SegmentPostings; + + #[inline] + fn read_postings_from_terminfo( + &self, + term_info: &TermInfo, + option: IndexRecordOption, + ) -> io::Result { + let postings_data = self.read_raw_postings_data_inner(term_info, option)?; + load_postings_from_raw_data(term_info.doc_freq, postings_data) + } + + #[inline] + fn read_docset_from_terminfo(&self, term_info: &TermInfo) -> io::Result { + let postings_data = + self.read_raw_postings_data_inner(term_info, IndexRecordOption::Basic)?; + load_postings_from_raw_data(term_info.doc_freq, postings_data) + } +} + +#[cfg(test)] +mod tests { + use std::any::TypeId; + + use super::*; + + #[derive(Default)] + struct RecordDispatch { + used_concrete_reader: bool, + used_dynamic_fallback: bool, + } + + impl TypedInvertedIndexReaderCb<()> for RecordDispatch { + fn call(&mut self, _reader: &I) { + let postings_type = TypeId::of::(); + if postings_type == TypeId::of::() { + self.used_concrete_reader = true; + } else if postings_type == TypeId::of::>() { + self.used_dynamic_fallback = true; + } else { + panic!("unexpected postings 
type in downcast helper test"); + } + } + } + + struct OnlyDynReader { + termdict: TermDictionary, + } + + impl Default for OnlyDynReader { + fn default() -> Self { + Self { + termdict: TermDictionary::empty(), + } + } + } + + impl DynInvertedIndexReader for OnlyDynReader { + fn as_any(&self) -> &dyn Any { + self + } + + fn terms(&self) -> &TermDictionary { + &self.termdict + } + + fn list_encoded_json_fields(&self) -> io::Result> { + Ok(Vec::new()) + } + + fn read_raw_postings_data( + &self, + _term_info: &TermInfo, + _option: IndexRecordOption, + ) -> io::Result { + unreachable!("not used in downcast helper tests") + } + + fn total_num_tokens(&self) -> u64 { + 0 + } + + fn doc_freq(&self, _term: &Term) -> io::Result { + Ok(0) + } + + #[cfg(feature = "quickwit")] + fn doc_freq_async<'a>( + &'a self, + _term: &'a Term, + ) -> Pin> + Send + 'a>> { + Box::pin(async { Ok(0) }) + } + + #[cfg(feature = "quickwit")] + fn warm_fieldnorms_readers<'a>( + &'a self, + ) -> Pin> + Send + 'a>> { + Box::pin(async { Ok(()) }) + } + + #[cfg(feature = "quickwit")] + fn warm_postings<'a>( + &'a self, + _term: &'a Term, + _with_positions: bool, + ) -> Pin> + Send + 'a>> { + Box::pin(async { Ok(false) }) + } + + #[cfg(feature = "quickwit")] + fn warm_postings_range<'a>( + &'a self, + _terms: TermRangeBounds, + _limit: Option, + _with_positions: bool, + ) -> Pin> + Send + 'a>> { + Box::pin(async { Ok(false) }) + } + + #[cfg(feature = "quickwit")] + fn warm_postings_automaton<'a, A: Automaton + Clone + Send + Sync + 'static>( + &'a self, + _automaton: A, + ) -> Pin> + Send + 'a>> + where + A::State: Clone + Send, + { + Box::pin(async { Ok(false) }) + } + } + + #[test] + fn try_downcast_and_call_uses_tantivy_reader() { + let reader = TantivyInvertedIndexReader::empty(IndexRecordOption::Basic); + let mut dispatch_recorder = RecordDispatch::default(); + + try_downcast_and_call(&reader, &mut dispatch_recorder); + + assert!(dispatch_recorder.used_concrete_reader); + 
assert!(!dispatch_recorder.used_dynamic_fallback); + } + + #[test] + fn try_downcast_and_call_uses_dynamic_fallback_for_other_readers() { + let reader = OnlyDynReader::default(); + let mut dispatch_recorder = RecordDispatch::default(); + + try_downcast_and_call(&reader, &mut dispatch_recorder); + + assert!(!dispatch_recorder.used_concrete_reader); + assert!(dispatch_recorder.used_dynamic_fallback); + } } #[cfg(feature = "quickwit")] -impl InvertedIndexReader { +impl TantivyInvertedIndexReader { pub(crate) async fn get_term_info_async(&self, term: &Term) -> io::Result> { self.termdict.get_async(term.serialized_value_bytes()).await } async fn get_term_range_async<'a, A: Automaton + 'a>( &'a self, - terms: impl std::ops::RangeBounds, + terms: TermRangeBounds, automaton: A, limit: Option, merge_holes_under_bytes: usize, @@ -297,17 +773,17 @@ impl InvertedIndexReader { where A::State: Clone, { - use std::ops::Bound; let range_builder = self.termdict.search(automaton); - let range_builder = match terms.start_bound() { - Bound::Included(bound) => range_builder.ge(bound.serialized_value_bytes()), - Bound::Excluded(bound) => range_builder.gt(bound.serialized_value_bytes()), - Bound::Unbounded => range_builder, + let (start_bound, end_bound) = terms; + let range_builder = match start_bound { + std::ops::Bound::Included(bound) => range_builder.ge(bound.serialized_value_bytes()), + std::ops::Bound::Excluded(bound) => range_builder.gt(bound.serialized_value_bytes()), + std::ops::Bound::Unbounded => range_builder, }; - let range_builder = match terms.end_bound() { - Bound::Included(bound) => range_builder.le(bound.serialized_value_bytes()), - Bound::Excluded(bound) => range_builder.lt(bound.serialized_value_bytes()), - Bound::Unbounded => range_builder, + let range_builder = match end_bound { + std::ops::Bound::Included(bound) => range_builder.le(bound.serialized_value_bytes()), + std::ops::Bound::Excluded(bound) => range_builder.lt(bound.serialized_value_bytes()), + 
std::ops::Bound::Unbounded => range_builder, }; let range_builder = if let Some(limit) = limit { range_builder.limit(limit) @@ -328,167 +804,4 @@ impl InvertedIndexReader { Ok(iter) } - - /// Warmup a block postings given a `Term`. - /// This method is for an advanced usage only. - /// - /// returns a boolean, whether the term was found in the dictionary - pub async fn warm_postings(&self, term: &Term, with_positions: bool) -> io::Result { - let term_info_opt: Option = self.get_term_info_async(term).await?; - if let Some(term_info) = term_info_opt { - let postings = self - .postings_file_slice - .read_bytes_slice_async(term_info.postings_range.clone()); - if with_positions { - let positions = self - .positions_file_slice - .read_bytes_slice_async(term_info.positions_range.clone()); - futures_util::future::try_join(postings, positions).await?; - } else { - postings.await?; - } - Ok(true) - } else { - Ok(false) - } - } - - /// Warmup a block postings given a range of `Term`s. - /// This method is for an advanced usage only. 
- /// - /// returns a boolean, whether a term matching the range was found in the dictionary - pub async fn warm_postings_range( - &self, - terms: impl std::ops::RangeBounds, - limit: Option, - with_positions: bool, - ) -> io::Result { - let mut term_info = self - .get_term_range_async(terms, AlwaysMatch, limit, 0) - .await?; - - let Some(first_terminfo) = term_info.next() else { - // no key matches, nothing more to load - return Ok(false); - }; - - let last_terminfo = term_info.last().unwrap_or_else(|| first_terminfo.clone()); - - let postings_range = first_terminfo.postings_range.start..last_terminfo.postings_range.end; - let positions_range = - first_terminfo.positions_range.start..last_terminfo.positions_range.end; - - let postings = self - .postings_file_slice - .read_bytes_slice_async(postings_range); - if with_positions { - let positions = self - .positions_file_slice - .read_bytes_slice_async(positions_range); - futures_util::future::try_join(postings, positions).await?; - } else { - postings.await?; - } - Ok(true) - } - - /// Warmup a block postings given a range of `Term`s. - /// This method is for an advanced usage only. - /// - /// returns a boolean, whether a term matching the range was found in the dictionary - pub async fn warm_postings_automaton< - A: Automaton + Clone + Send + 'static, - E: FnOnce(Box io::Result<()> + Send>) -> F, - F: std::future::Future>, - >( - &self, - automaton: A, - // with_positions: bool, at the moment we have no use for it, and supporting it would add - // complexity to the coalesce - executor: E, - ) -> io::Result - where - A::State: Clone, - { - // merge holes under 4MiB, that's how many bytes we can hope to receive during a TTFB from - // S3 (~80MiB/s, and 50ms latency) - const MERGE_HOLES_UNDER_BYTES: usize = (80 * 1024 * 1024 * 50) / 1000; - // we build a first iterator to download everything. Simply calling the function already - // download everything we need from the sstable, but doesn't start iterating over it. 
- let _term_info_iter = self - .get_term_range_async(.., automaton.clone(), None, MERGE_HOLES_UNDER_BYTES) - .await?; - - let (sender, posting_ranges_to_load_stream) = futures_channel::mpsc::unbounded(); - let termdict = self.termdict.clone(); - let cpu_bound_task = move || { - // then we build a 2nd iterator, this one with no holes, so we don't go through blocks - // we can't match. - // This makes the assumption there is a caching layer below us, which gives sync read - // for free after the initial async access. This might not always be true, but is in - // Quickwit. - // We build things from this closure otherwise we get into lifetime issues that can only - // be solved with self referential strucs. Returning an io::Result from here is a bit - // more leaky abstraction-wise, but a lot better than the alternative - let mut stream = termdict.search(automaton).into_stream()?; - - // we could do without an iterator, but this allows us access to coalesce which simplify - // things - let posting_ranges_iter = - std::iter::from_fn(move || stream.next().map(|(_k, v)| v.postings_range.clone())); - - let merged_posting_ranges_iter = posting_ranges_iter.coalesce(|range1, range2| { - if range1.end + MERGE_HOLES_UNDER_BYTES >= range2.start { - Ok(range1.start..range2.end) - } else { - Err((range1, range2)) - } - }); - - for posting_range in merged_posting_ranges_iter { - if let Err(_) = sender.unbounded_send(posting_range) { - // this should happen only when search is cancelled - return Err(io::Error::other("failed to send posting range back")); - } - } - Ok(()) - }; - let task_handle = executor(Box::new(cpu_bound_task)); - - let posting_downloader = posting_ranges_to_load_stream - .map(|posting_slice| { - self.postings_file_slice - .read_bytes_slice_async(posting_slice) - .map(|result| result.map(|_slice| ())) - }) - .buffer_unordered(5) - .try_collect::>(); - - let (_, slices_downloaded) = - futures_util::future::try_join(task_handle, posting_downloader).await?; - - 
Ok(!slices_downloaded.is_empty()) - } - - /// Warmup the block postings for all terms. - /// This method is for an advanced usage only. - /// - /// If you know which terms to pre-load, prefer using [`Self::warm_postings`] or - /// [`Self::warm_postings`] instead. - pub async fn warm_postings_full(&self, with_positions: bool) -> io::Result<()> { - self.postings_file_slice.read_bytes_async().await?; - if with_positions { - self.positions_file_slice.read_bytes_async().await?; - } - Ok(()) - } - - /// Returns the number of documents containing the term asynchronously. - pub async fn doc_freq_async(&self, term: &Term) -> io::Result { - Ok(self - .get_term_info_async(term) - .await? - .map(|term_info| term_info.doc_freq) - .unwrap_or(0u32)) - } } diff --git a/src/index/mod.rs b/src/index/mod.rs index 76dc3ed9b..cab6307da 100644 --- a/src/index/mod.rs +++ b/src/index/mod.rs @@ -13,8 +13,12 @@ mod segment_reader; pub use self::index::{Index, IndexBuilder}; pub(crate) use self::index_meta::SegmentMetaInventory; pub use self::index_meta::{IndexMeta, IndexSettings, Order, SegmentMeta}; -pub use self::inverted_index_reader::InvertedIndexReader; +pub(crate) use self::inverted_index_reader::load_postings_from_terminfo; +pub use self::inverted_index_reader::{ + try_downcast_and_call, DynInvertedIndexReader, InvertedIndexFieldSpace, InvertedIndexReader, + TantivyInvertedIndexReader, TypedInvertedIndexReaderCb, +}; pub use self::segment::Segment; pub use self::segment_component::SegmentComponent; pub use self::segment_id::SegmentId; -pub use self::segment_reader::{FieldMetadata, SegmentReader}; +pub use self::segment_reader::{FieldMetadata, SegmentReader, TantivySegmentReader}; diff --git a/src/index/segment.rs b/src/index/segment.rs index fcd32a1ff..0815e0aec 100644 --- a/src/index/segment.rs +++ b/src/index/segment.rs @@ -16,7 +16,7 @@ pub struct Segment { } impl fmt::Debug for Segment { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt(&self, f: &mut 
fmt::Formatter) -> fmt::Result { write!(f, "Segment({:?})", self.id().uuid_string()) } } diff --git a/src/index/segment_id.rs b/src/index/segment_id.rs index e66aa95a9..46b4e646f 100644 --- a/src/index/segment_id.rs +++ b/src/index/segment_id.rs @@ -44,7 +44,7 @@ fn create_uuid() -> Uuid { } impl SegmentId { - #[doc(hidden)] + /// Generates a new random `SegmentId`. pub fn generate_random() -> SegmentId { SegmentId(create_uuid()) } diff --git a/src/index/segment_reader.rs b/src/index/segment_reader.rs index cfccc65ed..f618c53ed 100644 --- a/src/index/segment_reader.rs +++ b/src/index/segment_reader.rs @@ -6,17 +6,101 @@ use common::{ByteCount, HasLen}; use fnv::FnvHashMap; use itertools::Itertools; -use crate::directory::{CompositeFile, FileSlice}; +use crate::directory::{CompositeFile, Directory, FileSlice}; use crate::error::DataCorruption; use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FacetReader, FastFieldReaders}; use crate::fieldnorm::{FieldNormReader, FieldNormReaders}; -use crate::index::{InvertedIndexReader, Segment, SegmentComponent, SegmentId}; +use crate::index::{ + DynInvertedIndexReader, Segment, SegmentComponent, SegmentId, SegmentMeta, + TantivyInvertedIndexReader, +}; use crate::json_utils::json_path_sep_to_dot; +use crate::postings::SegmentPostings; +use crate::query::boolean_query::block_wand::{block_wand, block_wand_single_scorer}; +use crate::query::term_query::TermScorer; +use crate::query::{BufferedUnionScorer, Scorer, SumCombiner}; use crate::schema::{Field, IndexRecordOption, Schema, Type}; use crate::space_usage::SegmentSpaceUsage; -use crate::store::StoreReader; +use crate::store::{StoreReader, TantivyStoreReader}; use crate::termdict::TermDictionary; -use crate::{DocId, Opstamp}; +use crate::{DocId, DocSet as _, Opstamp, Score, TERMINATED}; + +/// Trait defining the contract for a segment reader. +pub trait SegmentReader: Send + Sync { + /// Returns the highest document id ever attributed in this segment + 1. 
+ fn max_doc(&self) -> DocId; + + /// Returns the number of alive documents. Deleted documents are not counted. + fn num_docs(&self) -> DocId; + + /// Returns the schema of the index this segment belongs to. + fn schema(&self) -> &Schema; + + /// Performs a for_each_pruning operation on the given scorer. + fn for_each_pruning( + &self, + threshold: Score, + scorer: Box, + callback: &mut dyn FnMut(DocId, Score) -> Score, + ); + + /// Return the number of documents that have been deleted in the segment. + fn num_deleted_docs(&self) -> DocId; + + /// Returns true if some of the documents of the segment have been deleted. + fn has_deletes(&self) -> bool; + + /// Accessor to a segment's fast field reader given a field. + fn fast_fields(&self) -> &FastFieldReaders; + + /// Accessor to the `FacetReader` associated with a given `Field`. + fn facet_reader(&self, field_name: &str) -> crate::Result { + let field = self.schema().get_field(field_name)?; + let field_entry = self.schema().get_field_entry(field); + if field_entry.field_type().value_type() != Type::Facet { + return Err(crate::TantivyError::SchemaError(format!( + "`{field_name}` is not a facet field.`" + ))); + } + let Some(facet_column) = self.fast_fields().str(field_name)? else { + panic!("Facet Field `{field_name}` is missing. This should not happen"); + }; + Ok(FacetReader::new(facet_column)) + } + + /// Accessor to the segment's `Field norms`'s reader. + fn get_fieldnorms_reader(&self, field: Field) -> crate::Result; + + /// Accessor to the segment's [`StoreReader`](crate::store::StoreReader). + fn get_store_reader(&self, cache_num_blocks: usize) -> io::Result>; + + /// Returns a field reader associated with the field given in argument. + fn inverted_index(&self, field: Field) -> crate::Result>; + + /// Returns the list of fields that have been indexed in the segment. + fn fields_metadata(&self) -> crate::Result>; + + /// Returns the segment id. 
+ fn segment_id(&self) -> SegmentId; + + /// Returns the delete opstamp. + fn delete_opstamp(&self) -> Option; + + /// Returns the bitset representing the alive `DocId`s. + fn alive_bitset(&self) -> Option<&AliveBitSet>; + + /// Returns true if the `doc` is marked as deleted. + fn is_deleted(&self, doc: DocId) -> bool; + + /// Returns an iterator that will iterate over the alive document ids. + fn doc_ids_alive(&self) -> Box + Send + '_>; + + /// Summarize total space usage of this segment. + fn space_usage(&self) -> io::Result; + + /// Clones this reader into a shared trait object. + fn clone_arc(&self) -> Arc; +} /// Entry point to access all of the datastructures of the `Segment` /// @@ -29,8 +113,8 @@ use crate::{DocId, Opstamp}; /// The segment reader has a very low memory footprint, /// as close to all of the memory data is mmapped. #[derive(Clone)] -pub struct SegmentReader { - inv_idx_reader_cache: Arc>>>, +pub struct TantivySegmentReader { + inv_idx_reader_cache: Arc>>>, segment_id: SegmentId, delete_opstamp: Option, @@ -49,73 +133,157 @@ pub struct SegmentReader { schema: Schema, } -impl SegmentReader { - /// Returns the highest document id ever attributed in - /// this segment + 1. - pub fn max_doc(&self) -> DocId { +impl TantivySegmentReader { + /// Open a new segment for reading. + pub fn open(segment: &Segment) -> crate::Result> { + Self::open_with_custom_alive_set(segment, None) + } + + /// Open a new segment for reading. 
+ pub fn open_with_custom_alive_set( + segment: &Segment, + custom_bitset: Option, + ) -> crate::Result> { + let reader = Self::open_with_custom_alive_set_from_directory( + segment.index().directory(), + segment.meta(), + segment.schema(), + custom_bitset, + )?; + Ok(Arc::new(reader)) + } + + pub(crate) fn open_with_custom_alive_set_from_directory( + directory: &dyn Directory, + segment_meta: &SegmentMeta, + schema: Schema, + custom_bitset: Option, + ) -> crate::Result { + let termdict_file = + directory.open_read(&segment_meta.relative_path(SegmentComponent::Terms))?; + let termdict_composite = CompositeFile::open(&termdict_file)?; + + let store_file = + directory.open_read(&segment_meta.relative_path(SegmentComponent::Store))?; + + crate::fail_point!("SegmentReader::open#middle"); + + let postings_file = + directory.open_read(&segment_meta.relative_path(SegmentComponent::Postings))?; + let postings_composite = CompositeFile::open(&postings_file)?; + + let positions_composite = { + if let Ok(positions_file) = + directory.open_read(&segment_meta.relative_path(SegmentComponent::Positions)) + { + CompositeFile::open(&positions_file)? 
+ } else { + CompositeFile::empty() + } + }; + + let fast_fields_data = + directory.open_read(&segment_meta.relative_path(SegmentComponent::FastFields))?; + let fast_fields_readers = FastFieldReaders::open(fast_fields_data, schema.clone())?; + let fieldnorm_data = + directory.open_read(&segment_meta.relative_path(SegmentComponent::FieldNorms))?; + let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?; + + let original_bitset = if segment_meta.has_deletes() { + let alive_doc_file_slice = + directory.open_read(&segment_meta.relative_path(SegmentComponent::Delete))?; + let alive_doc_data = alive_doc_file_slice.read_bytes()?; + Some(AliveBitSet::open(alive_doc_data)) + } else { + None + }; + + let alive_bitset_opt = intersect_alive_bitset(original_bitset, custom_bitset); + + let max_doc = segment_meta.max_doc(); + let num_docs = alive_bitset_opt + .as_ref() + .map(|alive_bitset| alive_bitset.num_alive_docs() as u32) + .unwrap_or(max_doc); + + Ok(TantivySegmentReader { + inv_idx_reader_cache: Default::default(), + num_docs, + max_doc, + termdict_composite, + postings_composite, + fast_fields_readers, + fieldnorm_readers, + segment_id: segment_meta.id(), + delete_opstamp: segment_meta.delete_opstamp(), + store_file, + alive_bitset_opt, + positions_composite, + schema, + }) + } +} + +impl SegmentReader for TantivySegmentReader { + fn max_doc(&self) -> DocId { self.max_doc } - /// Returns the number of alive documents. - /// Deleted documents are not counted. - pub fn num_docs(&self) -> DocId { + fn num_docs(&self) -> DocId { self.num_docs } - /// Returns the schema of the index this segment belongs to. - pub fn schema(&self) -> &Schema { + fn schema(&self) -> &Schema { &self.schema } - /// Return the number of documents that have been - /// deleted in the segment. 
- pub fn num_deleted_docs(&self) -> DocId { + fn for_each_pruning( + &self, + mut threshold: Score, + mut scorer: Box, + callback: &mut dyn FnMut(DocId, Score) -> Score, + ) { + // Try WAND acceleration with concrete postings types + scorer = match scorer.downcast::>() { + Ok(term_scorer) => { + block_wand_single_scorer(*term_scorer, threshold, callback); + return; + } + Err(scorer) => scorer, + }; + match scorer.downcast::, SumCombiner>>() { + Ok(mut union_scorer) => { + let doc = union_scorer.doc(); + if doc == TERMINATED { + return; + } + let score = union_scorer.score(); + if score > threshold { + threshold = callback(doc, score); + } + let scorers: Vec> = union_scorer.into_scorers(); + block_wand(scorers, threshold, callback); + } + Err(mut scorer) => { + // No acceleration available. Fall back to default. + scorer.for_each_pruning(threshold, callback); + } + } + } + + fn num_deleted_docs(&self) -> DocId { self.max_doc - self.num_docs } - /// Returns true if some of the documents of the segment have been deleted. - pub fn has_deletes(&self) -> bool { - self.num_deleted_docs() > 0 + fn has_deletes(&self) -> bool { + self.num_docs != self.max_doc } - /// Accessor to a segment's fast field reader given a field. - /// - /// Returns the u64 fast value reader if the field - /// is a u64 field indexed as "fast". - /// - /// Return a FastFieldNotAvailableError if the field is not - /// declared as a fast field in the schema. - /// - /// # Panics - /// May panic if the index is corrupted. - pub fn fast_fields(&self) -> &FastFieldReaders { + fn fast_fields(&self) -> &FastFieldReaders { &self.fast_fields_readers } - /// Accessor to the `FacetReader` associated with a given `Field`. 
- pub fn facet_reader(&self, field_name: &str) -> crate::Result { - let schema = self.schema(); - let field = schema.get_field(field_name)?; - let field_entry = schema.get_field_entry(field); - if field_entry.field_type().value_type() != Type::Facet { - return Err(crate::TantivyError::SchemaError(format!( - "`{field_name}` is not a facet field.`" - ))); - } - let Some(facet_column) = self.fast_fields().str(field_name)? else { - panic!("Facet Field `{field_name}` is missing. This should not happen"); - }; - Ok(FacetReader::new(facet_column)) - } - - /// Accessor to the segment's `Field norms`'s reader. - /// - /// Field norms are the length (in tokens) of the fields. - /// It is used in the computation of the [TfIdf](https://fulmicoton.gitbooks.io/tantivy-doc/content/tfidf.html). - /// - /// They are simply stored as a fast field, serialized in - /// the `.fieldnorm` file of the segment. - pub fn get_fieldnorms_reader(&self, field: Field) -> crate::Result { + fn get_fieldnorms_reader(&self, field: Field) -> crate::Result { self.fieldnorm_readers.get_field(field)?.ok_or_else(|| { let field_name = self.schema.get_field_name(field); let err_msg = format!( @@ -126,100 +294,14 @@ impl SegmentReader { }) } - #[doc(hidden)] - pub fn fieldnorms_readers(&self) -> &FieldNormReaders { - &self.fieldnorm_readers + fn get_store_reader(&self, cache_num_blocks: usize) -> io::Result> { + Ok(Box::new(TantivyStoreReader::open( + self.store_file.clone(), + cache_num_blocks, + )?)) } - /// Accessor to the segment's [`StoreReader`](crate::store::StoreReader). - /// - /// `cache_num_blocks` sets the number of decompressed blocks to be cached in an LRU. - /// The size of blocks is configurable, this should be reflexted in the - pub fn get_store_reader(&self, cache_num_blocks: usize) -> io::Result { - StoreReader::open(self.store_file.clone(), cache_num_blocks) - } - - /// Open a new segment for reading. 
- pub fn open(segment: &Segment) -> crate::Result { - Self::open_with_custom_alive_set(segment, None) - } - - /// Open a new segment for reading. - pub fn open_with_custom_alive_set( - segment: &Segment, - custom_bitset: Option, - ) -> crate::Result { - let termdict_file = segment.open_read(SegmentComponent::Terms)?; - let termdict_composite = CompositeFile::open(&termdict_file)?; - - let store_file = segment.open_read(SegmentComponent::Store)?; - - crate::fail_point!("SegmentReader::open#middle"); - - let postings_file = segment.open_read(SegmentComponent::Postings)?; - let postings_composite = CompositeFile::open(&postings_file)?; - - let positions_composite = { - if let Ok(positions_file) = segment.open_read(SegmentComponent::Positions) { - CompositeFile::open(&positions_file)? - } else { - CompositeFile::empty() - } - }; - - let schema = segment.schema(); - - let fast_fields_data = segment.open_read(SegmentComponent::FastFields)?; - let fast_fields_readers = FastFieldReaders::open(fast_fields_data, schema.clone())?; - let fieldnorm_data = segment.open_read(SegmentComponent::FieldNorms)?; - let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?; - - let original_bitset = if segment.meta().has_deletes() { - let alive_doc_file_slice = segment.open_read(SegmentComponent::Delete)?; - let alive_doc_data = alive_doc_file_slice.read_bytes()?; - Some(AliveBitSet::open(alive_doc_data)) - } else { - None - }; - - let alive_bitset_opt = intersect_alive_bitset(original_bitset, custom_bitset); - - let max_doc = segment.meta().max_doc(); - let num_docs = alive_bitset_opt - .as_ref() - .map(|alive_bitset| alive_bitset.num_alive_docs() as u32) - .unwrap_or(max_doc); - - Ok(SegmentReader { - inv_idx_reader_cache: Default::default(), - num_docs, - max_doc, - termdict_composite, - postings_composite, - fast_fields_readers, - fieldnorm_readers, - segment_id: segment.id(), - delete_opstamp: segment.meta().delete_opstamp(), - store_file, - alive_bitset_opt, - 
positions_composite, - schema, - }) - } - - /// Returns a field reader associated with the field given in argument. - /// If the field was not present in the index during indexing time, - /// the InvertedIndexReader is empty. - /// - /// The field reader is in charge of iterating through the - /// term dictionary associated with a specific field, - /// and opening the posting list associated with any term. - /// - /// If the field is not marked as index, a warning is logged and an empty `InvertedIndexReader` - /// is returned. - /// Similarly, if the field is marked as indexed but no term has been indexed for the given - /// index, an empty `InvertedIndexReader` is returned (but no warning is logged). - pub fn inverted_index(&self, field: Field) -> crate::Result> { + fn inverted_index(&self, field: Field) -> crate::Result> { if let Some(inv_idx_reader) = self .inv_idx_reader_cache .read() @@ -244,7 +326,9 @@ impl SegmentReader { // // Returns an empty inverted index. let record_option = record_option_opt.unwrap_or(IndexRecordOption::Basic); - return Ok(Arc::new(InvertedIndexReader::empty(record_option))); + let inv_idx_reader: Arc = + Arc::new(TantivyInvertedIndexReader::empty(record_option)); + return Ok(inv_idx_reader); } let record_option = record_option_opt.unwrap(); @@ -267,13 +351,20 @@ impl SegmentReader { ); DataCorruption::comment_only(error_msg) })?; + let fieldnorms_file = self + .fieldnorm_readers + .get_inner_file() + .open_read(field) + .unwrap_or_else(FileSlice::empty); - let inv_idx_reader = Arc::new(InvertedIndexReader::new( - TermDictionary::open(termdict_file)?, - postings_file, - positions_file, - record_option, - )?); + let inv_idx_reader: Arc = + Arc::new(TantivyInvertedIndexReader::new( + TermDictionary::open(termdict_file)?, + postings_file, + positions_file, + fieldnorms_file, + record_option, + )?); // by releasing the lock in between, we may end up opening the inverting index // twice, but this is fine. 
@@ -285,23 +376,10 @@ impl SegmentReader { Ok(inv_idx_reader) } - /// Returns the list of fields that have been indexed in the segment. - /// The field list includes the field defined in the schema as well as the fields - /// that have been indexed as a part of a JSON field. - /// The returned field name is the full field name, including the name of the JSON field. - /// - /// The returned field names can be used in queries. - /// - /// Notice: If your data contains JSON fields this is **very expensive**, as it requires - /// browsing through the inverted index term dictionary and the columnar field dictionary. - /// - /// Disclaimer: Some fields may not be listed here. For instance, if the schema contains a json - /// field that is not indexed nor a fast field but is stored, it is possible for the field - /// to not be listed. - pub fn fields_metadata(&self) -> crate::Result> { + fn fields_metadata(&self) -> crate::Result> { let mut indexed_fields: Vec = Vec::new(); let mut map_to_canonical = FnvHashMap::default(); - for (field, field_entry) in self.schema().fields() { + for (field, field_entry) in self.schema.fields() { let field_name = field_entry.name().to_string(); let is_indexed = field_entry.is_indexed(); if is_indexed { @@ -391,7 +469,7 @@ impl SegmentReader { } } let fast_fields: Vec = self - .fast_fields() + .fast_fields_readers .columnar() .iter_columns()? .map(|(mut field_name, handle)| { @@ -419,31 +497,26 @@ impl SegmentReader { Ok(merged_field_metadatas) } - /// Returns the segment id - pub fn segment_id(&self) -> SegmentId { + fn segment_id(&self) -> SegmentId { self.segment_id } - /// Returns the delete opstamp - pub fn delete_opstamp(&self) -> Option { + fn delete_opstamp(&self) -> Option { self.delete_opstamp } - /// Returns the bitset representing the alive `DocId`s. 
- pub fn alive_bitset(&self) -> Option<&AliveBitSet> { + fn alive_bitset(&self) -> Option<&AliveBitSet> { self.alive_bitset_opt.as_ref() } - /// Returns true if the `doc` is marked - /// as deleted. - pub fn is_deleted(&self, doc: DocId) -> bool { - self.alive_bitset() + fn is_deleted(&self, doc: DocId) -> bool { + self.alive_bitset_opt + .as_ref() .map(|alive_bitset| alive_bitset.is_deleted(doc)) .unwrap_or(false) } - /// Returns an iterator that will iterate over the alive document ids - pub fn doc_ids_alive(&self) -> Box + Send + '_> { + fn doc_ids_alive(&self) -> Box + Send + '_> { if let Some(alive_bitset) = &self.alive_bitset_opt { Box::new(alive_bitset.iter_alive()) } else { @@ -451,22 +524,25 @@ impl SegmentReader { } } - /// Summarize total space usage of this segment. - pub fn space_usage(&self) -> io::Result { + fn space_usage(&self) -> io::Result { Ok(SegmentSpaceUsage::new( - self.num_docs(), - self.termdict_composite.space_usage(self.schema()), - self.postings_composite.space_usage(self.schema()), - self.positions_composite.space_usage(self.schema()), + self.num_docs, + self.termdict_composite.space_usage(&self.schema), + self.postings_composite.space_usage(&self.schema), + self.positions_composite.space_usage(&self.schema), self.fast_fields_readers.space_usage()?, - self.fieldnorm_readers.space_usage(self.schema()), - self.get_store_reader(0)?.space_usage(), + self.fieldnorm_readers.space_usage(&self.schema), + TantivyStoreReader::open(self.store_file.clone(), 0)?.space_usage(), self.alive_bitset_opt .as_ref() .map(AliveBitSet::space_usage) .unwrap_or_default(), )) } + + fn clone_arc(&self) -> Arc { + Arc::new(self.clone()) + } } #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] @@ -576,7 +652,7 @@ fn intersect_alive_bitset( } } -impl fmt::Debug for SegmentReader { +impl fmt::Debug for TantivySegmentReader { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "SegmentReader({:?})", self.segment_id) } diff --git 
a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs index 1a269caed..9a455f9b1 100644 --- a/src/indexer/delete_queue.rs +++ b/src/indexer/delete_queue.rs @@ -250,11 +250,15 @@ mod tests { struct DummyWeight; impl Weight for DummyWeight { - fn scorer(&self, _reader: &SegmentReader, _boost: Score) -> crate::Result> { + fn scorer( + &self, + _reader: &dyn SegmentReader, + _boost: Score, + ) -> crate::Result> { Err(crate::TantivyError::InternalError("dummy impl".to_owned())) } - fn explain(&self, _reader: &SegmentReader, _doc: DocId) -> crate::Result { + fn explain(&self, _reader: &dyn SegmentReader, _doc: DocId) -> crate::Result { Err(crate::TantivyError::InternalError("dummy impl".to_owned())) } } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 7ffc38615..d07ff1eb4 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -12,7 +12,9 @@ use super::{AddBatch, AddBatchReceiver, AddBatchSender, PreparedCommit}; use crate::directory::{DirectoryLock, GarbageCollectionResult, TerminatingWrite}; use crate::error::TantivyError; use crate::fastfield::write_alive_bitset; -use crate::index::{Index, Segment, SegmentComponent, SegmentId, SegmentMeta, SegmentReader}; +use crate::index::{ + Index, Segment, SegmentComponent, SegmentId, SegmentMeta, SegmentReader, TantivySegmentReader, +}; use crate::indexer::delete_queue::{DeleteCursor, DeleteQueue}; use crate::indexer::doc_opstamp_mapping::DocToOpstampMapping; use crate::indexer::index_writer_status::IndexWriterStatus; @@ -94,7 +96,7 @@ pub struct IndexWriter { fn compute_deleted_bitset( alive_bitset: &mut BitSet, - segment_reader: &SegmentReader, + segment_reader: &dyn SegmentReader, delete_cursor: &mut DeleteCursor, doc_opstamps: &DocToOpstampMapping, target_opstamp: Opstamp, @@ -143,7 +145,13 @@ pub fn advance_deletes( return Ok(()); } - let segment_reader = SegmentReader::open(&segment)?; + let segment_reader = 
TantivySegmentReader::open_with_custom_alive_set_from_directory( + segment.index().directory(), + segment.meta(), + segment.schema(), + None, + )?; + let segment_reader: Arc = Arc::new(segment_reader); let max_doc = segment_reader.max_doc(); let mut alive_bitset: BitSet = match segment_entry.alive_bitset() { @@ -155,7 +163,7 @@ pub fn advance_deletes( compute_deleted_bitset( &mut alive_bitset, - &segment_reader, + segment_reader.as_ref(), segment_entry.delete_cursor(), &DocToOpstampMapping::None, target_opstamp, @@ -243,14 +251,20 @@ fn apply_deletes( .max() .expect("Empty DocOpstamp is forbidden"); - let segment_reader = SegmentReader::open(segment)?; + let segment_reader = TantivySegmentReader::open_with_custom_alive_set_from_directory( + segment.index().directory(), + segment.meta(), + segment.schema(), + None, + )?; + let segment_reader: Arc = Arc::new(segment_reader); let doc_to_opstamps = DocToOpstampMapping::WithMap(doc_opstamps); let max_doc = segment.meta().max_doc(); let mut deleted_bitset = BitSet::with_max_value_and_full(max_doc); let may_have_deletes = compute_deleted_bitset( &mut deleted_bitset, - &segment_reader, + segment_reader.as_ref(), delete_cursor, &doc_to_opstamps, max_doc_opstamp, @@ -1965,9 +1979,9 @@ mod tests { .get_store_reader(DOCSTORE_CACHE_CAPACITY) .unwrap(); // test store iterator - for doc in store_reader.iter::(segment_reader.alive_bitset()) { + for doc_id in segment_reader.doc_ids_alive() { + let doc = store_reader.get(doc_id).unwrap(); let id = doc - .unwrap() .get_first(id_field) .unwrap() .as_value() @@ -1978,7 +1992,7 @@ mod tests { // test store random access for doc_id in segment_reader.doc_ids_alive() { let id = store_reader - .get::(doc_id) + .get(doc_id) .unwrap() .get_first(id_field) .unwrap() @@ -1987,7 +2001,7 @@ mod tests { assert!(expected_ids_and_num_occurrences.contains_key(&id)); if id_is_full_doc(id) { let id2 = store_reader - .get::(doc_id) + .get(doc_id) .unwrap() .get_first(multi_numbers) .unwrap() @@ -1995,13 
+2009,13 @@ mod tests { .unwrap(); assert_eq!(id, id2); let bool = store_reader - .get::(doc_id) + .get(doc_id) .unwrap() .get_first(bool_field) .unwrap() .as_bool() .unwrap(); - let doc = store_reader.get::(doc_id).unwrap(); + let doc = store_reader.get(doc_id).unwrap(); let mut bool2 = doc.get_all(multi_bools); assert_eq!(bool, bool2.next().unwrap().as_bool().unwrap()); assert_ne!(bool, bool2.next().unwrap().as_bool().unwrap()); diff --git a/src/indexer/merge_index_test.rs b/src/indexer/merge_index_test.rs index 43f80a9d0..a1aaad58f 100644 --- a/src/indexer/merge_index_test.rs +++ b/src/indexer/merge_index_test.rs @@ -3,7 +3,7 @@ mod tests { use crate::collector::TopDocs; use crate::fastfield::AliveBitSet; use crate::index::Index; - use crate::postings::Postings; + use crate::postings::{DocFreq, Postings}; use crate::query::QueryParser; use crate::schema::{ self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions, @@ -121,21 +121,32 @@ mod tests { let my_text_field = index.schema().get_field("text_field").unwrap(); let term_a = Term::from_field_text(my_text_field, "text"); let inverted_index = segment_reader.inverted_index(my_text_field).unwrap(); - let mut postings = inverted_index - .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions) - .unwrap() - .unwrap(); - assert_eq!(postings.doc_freq(), 2); + let term_info = inverted_index.get_term_info(&term_a).unwrap().unwrap(); + let postings_for_test = crate::index::load_postings_from_terminfo( + inverted_index.as_ref(), + &term_info, + IndexRecordOption::WithFreqsAndPositions, + ) + .unwrap(); let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100); assert_eq!( - postings.doc_freq_given_deletes( + crate::indexer::merger::doc_freq_given_deletes( + postings_for_test, segment_reader.alive_bitset().unwrap_or(&fallback_bitset) ), 2 ); + let postings = inverted_index + .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions) + .unwrap(); + 
assert_eq!(postings.unwrap().doc_freq(), DocFreq::Exact(2)); + let postings = inverted_index + .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions) + .unwrap(); + let mut postings = postings.unwrap(); assert_eq!(postings.term_freq(), 1); - let mut output = vec![]; + let mut output = Vec::new(); postings.positions(&mut output); assert_eq!(output, vec![1]); postings.advance(); diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 47ac5a55b..31912a4c2 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -1,3 +1,4 @@ +use std::io; use std::sync::Arc; use columnar::{ @@ -15,11 +16,11 @@ use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, use crate::index::{Segment, SegmentComponent, SegmentReader}; use crate::indexer::doc_id_mapping::{MappingType, SegmentDocIdMapping}; use crate::indexer::SegmentSerializer; -use crate::postings::{InvertedIndexSerializer, Postings, SegmentPostings}; -use crate::schema::{value_type_to_column_type, Field, FieldType, Schema}; +use crate::postings::{InvertedIndexSerializer, Postings, TermInfo}; +use crate::schema::{value_type_to_column_type, Field, FieldType, IndexRecordOption, Schema}; use crate::store::StoreWriter; use crate::termdict::{TermMerger, TermOrdinal}; -use crate::{DocAddress, DocId, InvertedIndexReader}; +use crate::{DocAddress, DocId, DynInvertedIndexReader}; /// Segment's max doc must be `< MAX_DOC_LIMIT`. /// @@ -27,7 +28,7 @@ use crate::{DocAddress, DocId, InvertedIndexReader}; pub const MAX_DOC_LIMIT: u32 = 1 << 31; fn estimate_total_num_tokens_in_single_segment( - reader: &SegmentReader, + reader: &dyn SegmentReader, field: Field, ) -> crate::Result { // There are no deletes. We can simply use the exact value saved into the posting list. @@ -39,7 +40,7 @@ fn estimate_total_num_tokens_in_single_segment( // When there are deletes, we use an approximation either // by using the fieldnorm. 
- if let Some(fieldnorm_reader) = reader.fieldnorms_readers().get_field(field)? { + if let Ok(fieldnorm_reader) = reader.get_fieldnorms_reader(field) { let mut count: [usize; 256] = [0; 256]; for doc in reader.doc_ids_alive() { let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc); @@ -68,17 +69,20 @@ fn estimate_total_num_tokens_in_single_segment( Ok((segment_num_tokens as f64 * ratio) as u64) } -fn estimate_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::Result { +fn estimate_total_num_tokens( + readers: &[Arc], + field: Field, +) -> crate::Result { let mut total_num_tokens: u64 = 0; for reader in readers { - total_num_tokens += estimate_total_num_tokens_in_single_segment(reader, field)?; + total_num_tokens += estimate_total_num_tokens_in_single_segment(reader.as_ref(), field)?; } Ok(total_num_tokens) } pub struct IndexMerger { schema: Schema, - pub(crate) readers: Vec, + pub(crate) readers: Vec>, max_doc: u32, } @@ -162,16 +166,25 @@ impl IndexMerger { // This can be used to merge but also apply an additional filter. // One use case is demux, which is basically taking a list of // segments and partitions them e.g. by a value in a field. + // + // # Panics if segments is empty. 
pub fn open_with_custom_alive_set( schema: Schema, segments: &[Segment], alive_bitset_opt: Vec>, ) -> crate::Result { + assert!(!segments.is_empty()); let mut readers = vec![]; for (segment, new_alive_bitset_opt) in segments.iter().zip(alive_bitset_opt) { if segment.meta().num_docs() > 0 { let reader = - SegmentReader::open_with_custom_alive_set(segment, new_alive_bitset_opt)?; + crate::TantivySegmentReader::open_with_custom_alive_set_from_directory( + segment.index().directory(), + segment.meta(), + segment.schema(), + new_alive_bitset_opt, + )?; + let reader: Arc = Arc::new(reader); readers.push(reader); } } @@ -262,7 +275,7 @@ impl IndexMerger { }), ); - let has_deletes: bool = self.readers.iter().any(SegmentReader::has_deletes); + let has_deletes: bool = self.readers.iter().any(|reader| reader.has_deletes()); let mapping_type = if has_deletes { MappingType::StackedWithDeletes } else { @@ -297,7 +310,7 @@ impl IndexMerger { let mut max_term_ords: Vec = Vec::new(); - let field_readers: Vec> = self + let field_readers: Vec> = self .readers .iter() .map(|reader| reader.inverted_index(indexed_field)) @@ -355,7 +368,8 @@ impl IndexMerger { indexed. 
Have you modified the schema?", ); - let mut segment_postings_containing_the_term: Vec<(usize, SegmentPostings)> = vec![]; + let mut segment_postings_containing_the_term: Vec<(usize, Box)> = + Vec::with_capacity(self.readers.len()); while merged_terms.advance() { segment_postings_containing_the_term.clear(); @@ -366,18 +380,15 @@ impl IndexMerger { // Let's compute the list of non-empty posting lists for (segment_ord, term_info) in merged_terms.current_segment_ords_and_term_infos() { let segment_reader = &self.readers[segment_ord]; - let inverted_index: &InvertedIndexReader = &field_readers[segment_ord]; - let segment_postings = inverted_index - .read_postings_from_terminfo(&term_info, segment_postings_option)?; - let alive_bitset_opt = segment_reader.alive_bitset(); - let doc_freq = if let Some(alive_bitset) = alive_bitset_opt { - segment_postings.doc_freq_given_deletes(alive_bitset) - } else { - segment_postings.doc_freq() - }; - if doc_freq > 0u32 { + let inverted_index = &field_readers[segment_ord]; + if let Some((doc_freq, postings)) = postings_for_merge( + inverted_index.as_ref(), + &term_info, + segment_postings_option, + segment_reader.alive_bitset(), + )? { total_doc_freq += doc_freq; - segment_postings_containing_the_term.push((segment_ord, segment_postings)); + segment_postings_containing_the_term.push((segment_ord, postings)); } } @@ -395,11 +406,7 @@ impl IndexMerger { assert!(!segment_postings_containing_the_term.is_empty()); let has_term_freq = { - let has_term_freq = !segment_postings_containing_the_term[0] - .1 - .block_cursor - .freqs() - .is_empty(); + let has_term_freq = segment_postings_containing_the_term[0].1.has_freq(); for (_, postings) in &segment_postings_containing_the_term[1..] { // This may look at a strange way to test whether we have term freq or not. 
// With JSON object, the schema is not sufficient to know whether a term @@ -415,7 +422,7 @@ impl IndexMerger { // // Overall the reliable way to know if we have actual frequencies loaded or not // is to check whether the actual decoded array is empty or not. - if has_term_freq == postings.block_cursor.freqs().is_empty() { + if postings.has_freq() != has_term_freq { return Err(DataCorruption::comment_only( "Term freqs are inconsistent across segments", ) @@ -490,33 +497,7 @@ impl IndexMerger { debug_time!("write-storable-fields"); debug!("write-storable-field"); - for reader in &self.readers { - let store_reader = reader.get_store_reader(1)?; - if reader.has_deletes() - // If there is not enough data in the store, we avoid stacking in order to - // avoid creating many small blocks in the doc store. Once we have 5 full blocks, - // we start stacking. In the worst case 2/7 of the blocks would be very small. - // [segment 1 - {1 doc}][segment 2 - {fullblock * 5}{1doc}] - // => 5 * full blocks, 2 * 1 document blocks - // - // In a more realistic scenario the segments are of the same size, so 1/6 of - // the doc stores would be on average half full, given total randomness (which - // is not the case here, but not sure how it behaves exactly). - // - // https://github.com/quickwit-oss/tantivy/issues/1053 - // - // take 7 in order to not walk over all checkpoints. - || store_reader.block_checkpoints().take(7).count() < 6 - || store_reader.decompressor() != store_writer.compressor().into() - { - for doc_bytes_res in store_reader.iter_raw(reader.alive_bitset()) { - let doc_bytes = doc_bytes_res?; - store_writer.store_bytes(&doc_bytes)?; - } - } else { - store_writer.stack(store_reader)?; - } - } + store_writer.merge_segment_readers(&self.readers)?; Ok(()) } @@ -553,6 +534,75 @@ impl IndexMerger { } } +/// Compute the number of non-deleted documents. +/// +/// This method will scan through the posting lists, consuming them. +/// (this is a rather expensive operation). 
+pub(crate) fn doc_freq_given_deletes( + mut postings: Box, + alive_bitset: &AliveBitSet, +) -> u32 { + let mut doc_freq = 0; + loop { + let doc = postings.doc(); + if doc == TERMINATED { + return doc_freq; + } + if alive_bitset.is_alive(doc) { + doc_freq += 1u32; + } + postings.advance(); + } +} + +fn read_postings_for_merge( + inverted_index: &dyn DynInvertedIndexReader, + term_info: &TermInfo, + option: IndexRecordOption, +) -> io::Result> { + crate::index::load_postings_from_terminfo(inverted_index, term_info, option) +} + +fn postings_for_merge( + inverted_index: &dyn DynInvertedIndexReader, + term_info: &TermInfo, + option: IndexRecordOption, + alive_bitset_opt: Option<&AliveBitSet>, +) -> io::Result)>> { + // TODO: avoid loading postings twice — once for counting, once for writing + let count_postings = read_postings_for_merge(inverted_index, term_info, option)?; + let doc_freq = if let Some(alive_bitset) = alive_bitset_opt { + doc_freq_given_deletes(count_postings, alive_bitset) + } else { + // We do not need an exact document frequency here. + match count_postings.doc_freq() { + crate::postings::DocFreq::Exact(doc_freq) => doc_freq, + crate::postings::DocFreq::Approximate(_) => exact_doc_freq(count_postings), + } + }; + + if doc_freq == 0u32 { + return Ok(None); + } + + let postings = read_postings_for_merge(inverted_index, term_info, option)?; + Ok(Some((doc_freq, postings))) +} + +/// If the postings is not able to inform us of the document frequency, +/// we just scan through it. 
+pub(crate) fn exact_doc_freq(mut postings: Box) -> u32 { + let mut doc_freq = 0; + loop { + let doc = postings.doc(); + if doc == TERMINATED { + return doc_freq; + } + doc_freq += 1u32; + postings.advance(); + } +} + #[cfg(test)] mod tests { @@ -565,8 +615,10 @@ mod tests { BytesFastFieldTestCollector, FastFieldTestCollector, TEST_COLLECTOR_WITH_SCORE, }; use crate::collector::{Count, FacetCollector}; + use crate::fastfield::AliveBitSet; use crate::index::{Index, SegmentId}; use crate::indexer::NoMergePolicy; + use crate::postings::{DocFreq, Postings as _, SegmentPostings}; use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery}; use crate::schema::{ Facet, FacetOptions, IndexRecordOption, NumericOptions, TantivyDocument, Term, @@ -681,32 +733,32 @@ mod tests { ); } { - let doc = searcher.doc::(DocAddress::new(0, 0))?; + let doc = searcher.doc(DocAddress::new(0, 0))?; assert_eq!( doc.get_first(text_field).unwrap().as_value().as_str(), Some("af b") ); } { - let doc = searcher.doc::(DocAddress::new(0, 1))?; + let doc = searcher.doc(DocAddress::new(0, 1))?; assert_eq!( doc.get_first(text_field).unwrap().as_value().as_str(), Some("a b c") ); } { - let doc = searcher.doc::(DocAddress::new(0, 2))?; + let doc = searcher.doc(DocAddress::new(0, 2))?; assert_eq!( doc.get_first(text_field).unwrap().as_value().as_str(), Some("a b c d") ); } { - let doc = searcher.doc::(DocAddress::new(0, 3))?; + let doc = searcher.doc(DocAddress::new(0, 3))?; assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("af b")); } { - let doc = searcher.doc::(DocAddress::new(0, 4))?; + let doc = searcher.doc(DocAddress::new(0, 4))?; assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c g")); } @@ -1518,10 +1570,10 @@ mod tests { let searcher = reader.searcher(); let mut term_scorer = term_query .specialized_weight(EnableScoring::enabled_from_searcher(&searcher))? - .term_scorer_for_test(searcher.segment_reader(0u32), 1.0)? 
+ .term_scorer_for_test(searcher.segment_reader(0u32), 1.0) .unwrap(); assert_eq!(term_scorer.doc(), 0); - assert_nearly_equals!(term_scorer.block_max_score(), 0.0079681855); + assert_nearly_equals!(term_scorer.seek_block_max(0), 0.0079681855); assert_nearly_equals!(term_scorer.score(), 0.0079681855); for _ in 0..81 { writer.add_document(doc!(text=>"hello happy tax payer"))?; @@ -1534,13 +1586,13 @@ mod tests { for segment_reader in searcher.segment_readers() { let mut term_scorer = term_query .specialized_weight(EnableScoring::enabled_from_searcher(&searcher))? - .term_scorer_for_test(segment_reader, 1.0)? + .term_scorer_for_test(segment_reader.as_ref(), 1.0) .unwrap(); // the difference compared to before is intrinsic to the bm25 formula. no worries // there. for doc in segment_reader.doc_ids_alive() { assert_eq!(term_scorer.doc(), doc); - assert_nearly_equals!(term_scorer.block_max_score(), 0.003478312); + assert_nearly_equals!(term_scorer.seek_block_max(doc), 0.003478312); assert_nearly_equals!(term_scorer.score(), 0.003478312); term_scorer.advance(); } @@ -1560,12 +1612,12 @@ mod tests { let segment_reader = searcher.segment_reader(0u32); let mut term_scorer = term_query .specialized_weight(EnableScoring::enabled_from_searcher(&searcher))? - .term_scorer_for_test(segment_reader, 1.0)? + .term_scorer_for_test(segment_reader, 1.0) .unwrap(); // the difference compared to before is intrinsic to the bm25 formula. no worries there. 
for doc in segment_reader.doc_ids_alive() { assert_eq!(term_scorer.doc(), doc); - assert_nearly_equals!(term_scorer.block_max_score(), 0.003478312); + assert_nearly_equals!(term_scorer.seek_block_max(doc), 0.003478312); assert_nearly_equals!(term_scorer.score(), 0.003478312); term_scorer.advance(); } @@ -1579,4 +1631,19 @@ mod tests { assert!(((super::MAX_DOC_LIMIT - 1) as i32) >= 0); assert!((super::MAX_DOC_LIMIT as i32) < 0); } + + #[test] + fn test_doc_freq_given_delete() { + let docs = SegmentPostings::create_from_docs(&[0, 2, 10]); + assert_eq!(docs.doc_freq(), DocFreq::Exact(3)); + let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[2], 12); + let docs_boxed: Box = + Box::new(SegmentPostings::create_from_docs(&[0, 2, 10])); + assert_eq!(super::doc_freq_given_deletes(docs_boxed, &alive_bitset), 2); + let all_deleted = + AliveBitSet::for_test_from_deleted_docs(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 12); + let docs_boxed: Box = + Box::new(SegmentPostings::create_from_docs(&[0, 2, 10])); + assert_eq!(super::doc_freq_given_deletes(docs_boxed, &all_deleted), 0); + } } diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index bf35c52bc..c26f10a14 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -139,9 +139,9 @@ fn merge( /// meant to work if you have an `IndexWriter` running for the origin indices, or /// the destination `Index`. #[doc(hidden)] -pub fn merge_indices>>( +pub fn merge_indices( indices: &[Index], - output_directory: T, + output_directory: Box, ) -> crate::Result { if indices.is_empty() { // If there are no indices to merge, there is no need to do anything. 
@@ -211,11 +211,11 @@ pub fn merge_filtered_segments>>( )); } - let mut merged_index = Index::create( - output_directory, - target_schema.clone(), - target_settings.clone(), - )?; + let mut merged_index: Index = Index::builder() + .schema(target_schema.clone()) + .settings(target_settings.clone()) + .create(output_directory.into())?; + let merged_segment = merged_index.new_segment(); let merged_segment_id = merged_segment.id(); let merger: IndexMerger = @@ -235,7 +235,6 @@ pub fn merge_filtered_segments>>( )) .trim_end() ); - let index_meta = IndexMeta { index_settings: target_settings, // index_settings of all segments should be the same segments: vec![segment_meta], @@ -275,7 +274,7 @@ impl SegmentUpdater { stamper: Stamper, delete_cursor: &DeleteCursor, num_merge_threads: usize, - ) -> crate::Result { + ) -> crate::Result { let segments = index.searchable_segment_metas()?; let segment_manager = SegmentManager::from_segments(segments, delete_cursor); let pool = ThreadPoolBuilder::new() @@ -930,7 +929,7 @@ mod tests { #[test] fn test_merge_empty_indices_array() { - let merge_result = merge_indices(&[], RamDirectory::default()); + let merge_result = merge_indices(&[], Box::new(RamDirectory::default())); assert!(merge_result.is_err()); } @@ -957,7 +956,10 @@ mod tests { }; // mismatched schema index list - let result = merge_indices(&[first_index, second_index], RamDirectory::default()); + let result = merge_indices( + &[first_index, second_index], + Box::new(RamDirectory::default()), + ); assert!(result.is_err()); Ok(()) diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 94e3f0de2..44f48de0f 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -12,7 +12,7 @@ use crate::indexer::segment_serializer::SegmentSerializer; use crate::json_utils::{index_json_value, IndexingPositionsPerPath}; use crate::postings::{ compute_table_memory_size, serialize_postings, IndexingContext, IndexingPosition, - 
PerFieldPostingsWriter, PostingsWriter, + PerFieldPostingsWriter, PostingsWriter, PostingsWriterEnum, }; use crate::schema::document::{Document, Value}; use crate::schema::{FieldEntry, FieldType, Schema, DATE_TIME_PRECISION_INDEXED}; @@ -169,7 +169,7 @@ impl SegmentWriter { } let (term_buffer, ctx) = (&mut self.term_buffer, &mut self.ctx); - let postings_writer: &mut dyn PostingsWriter = + let postings_writer: &mut PostingsWriterEnum = self.per_field_postings_writers.get_for_field_mut(field); term_buffer.clear_with_field(field); @@ -434,7 +434,7 @@ mod tests { Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, Value, DATE_TIME_PRECISION_INDEXED, FAST, STORED, STRING, TEXT, }; - use crate::store::{Compressor, StoreReader, StoreWriter}; + use crate::store::{Compressor, StoreWriter, TantivyStoreReader}; use crate::time::format_description::well_known::Rfc3339; use crate::time::OffsetDateTime; use crate::tokenizer::{PreTokenizedString, Token}; @@ -482,8 +482,8 @@ mod tests { store_writer.store(&doc, &schema).unwrap(); store_writer.close().unwrap(); - let reader = StoreReader::open(directory.open_read(path).unwrap(), 0).unwrap(); - let doc = reader.get::(0).unwrap(); + let reader = TantivyStoreReader::open(directory.open_read(path).unwrap(), 0).unwrap(); + let doc = reader.get(0).unwrap(); assert_eq!(doc.field_values().count(), 2); assert_eq!( @@ -600,16 +600,12 @@ mod tests { let reader = index.reader().unwrap(); let searcher = reader.searcher(); let doc = searcher - .doc::(DocAddress { + .doc(DocAddress { segment_ord: 0u32, doc_id: 0u32, }) .unwrap(); - let serdeser_json_val = serde_json::from_str::(&doc.to_json(&schema)) - .unwrap() - .get("json") - .unwrap()[0] - .clone(); + let serdeser_json_val = doc.to_json(&schema).get("json").unwrap().clone(); assert_eq!(json_val, serdeser_json_val); let segment_reader = searcher.segment_reader(0u32); let inv_idx = segment_reader.inverted_index(json_field).unwrap(); @@ -871,7 +867,7 @@ mod tests { 
let searcher = reader.searcher(); let segment_reader = searcher.segment_reader(0u32); - fn assert_type(reader: &SegmentReader, field: &str, typ: ColumnType) { + fn assert_type(reader: &dyn SegmentReader, field: &str, typ: ColumnType) { let cols = reader.fast_fields().dynamic_column_handles(field).unwrap(); assert_eq!(cols.len(), 1, "{field}"); assert_eq!(cols[0].column_type(), typ, "{field}"); @@ -890,7 +886,7 @@ mod tests { assert_type(segment_reader, "json.my_arr", ColumnType::I64); assert_type(segment_reader, "json.my_arr.my_key", ColumnType::Str); - fn assert_empty(reader: &SegmentReader, field: &str) { + fn assert_empty(reader: &dyn SegmentReader, field: &str) { let cols = reader.fast_fields().dynamic_column_handles(field).unwrap(); assert_eq!(cols.len(), 0); } diff --git a/src/indexer/single_segment_index_writer.rs b/src/indexer/single_segment_index_writer.rs index 673accae3..342b88a88 100644 --- a/src/indexer/single_segment_index_writer.rs +++ b/src/indexer/single_segment_index_writer.rs @@ -11,7 +11,7 @@ pub struct SingleSegmentIndexWriter { segment_writer: SegmentWriter, segment: Segment, opstamp: Opstamp, - _phantom: PhantomData, + _doc: PhantomData, } impl SingleSegmentIndexWriter { @@ -22,7 +22,7 @@ impl SingleSegmentIndexWriter { segment_writer, segment, opstamp: 0, - _phantom: PhantomData, + _doc: PhantomData, }) } @@ -40,7 +40,7 @@ impl SingleSegmentIndexWriter { pub fn finalize(self) -> crate::Result { let max_doc = self.segment_writer.max_doc(); self.segment_writer.finalize()?; - let segment: Segment = self.segment.with_max_doc(max_doc); + let segment = self.segment.with_max_doc(max_doc); let index = segment.index(); let index_meta = IndexMeta { index_settings: index.settings().clone(), diff --git a/src/lib.rs b/src/lib.rs index 93f0fa43b..ee6870c9f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -93,7 +93,7 @@ //! //! for (_score, doc_address) in top_docs { //! // Retrieve the actual content of documents given its `doc_address`. -//! 
let retrieved_doc = searcher.doc::(doc_address)?; +//! let retrieved_doc = searcher.doc(doc_address)?; //! println!("{}", retrieved_doc.to_json(&schema)); //! } //! @@ -166,6 +166,7 @@ mod functional_test; #[macro_use] mod macros; + mod future_result; // Re-exports @@ -223,11 +224,12 @@ use once_cell::sync::Lazy; use serde::{Deserialize, Serialize}; pub use self::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN, TERMINATED}; -pub use crate::core::{json_utils, Executor, Searcher, SearcherGeneration}; +pub use crate::core::{json_utils, Executor, Searcher, SearcherContext, SearcherGeneration}; pub use crate::directory::Directory; pub use crate::index::{ - Index, IndexBuilder, IndexMeta, IndexSettings, InvertedIndexReader, Order, Segment, - SegmentMeta, SegmentReader, + try_downcast_and_call, DynInvertedIndexReader, Index, IndexBuilder, IndexMeta, IndexSettings, + InvertedIndexReader, Order, Segment, SegmentMeta, SegmentReader, TantivyInvertedIndexReader, + TantivySegmentReader, TypedInvertedIndexReaderCb, }; pub use crate::indexer::{IndexWriter, SingleSegmentIndexWriter}; pub use crate::schema::{Document, TantivyDocument, Term}; @@ -547,7 +549,7 @@ pub mod tests { index_writer.commit()?; let reader = index.reader()?; let searcher = reader.searcher(); - let segment_reader: &SegmentReader = searcher.segment_reader(0); + let segment_reader: &dyn SegmentReader = searcher.segment_reader(0); let fieldnorms_reader = segment_reader.get_fieldnorms_reader(text_field)?; assert_eq!(fieldnorms_reader.fieldnorm(0), 3); assert_eq!(fieldnorms_reader.fieldnorm(1), 0); @@ -555,7 +557,7 @@ pub mod tests { Ok(()) } - fn advance_undeleted(docset: &mut dyn DocSet, reader: &SegmentReader) -> bool { + fn advance_undeleted(docset: &mut dyn DocSet, reader: &dyn SegmentReader) -> bool { let mut doc = docset.advance(); while doc != TERMINATED { if !reader.is_deleted(doc) { @@ -1072,7 +1074,7 @@ pub mod tests { } let reader = index.reader()?; let searcher = reader.searcher(); - let segment_reader: 
&SegmentReader = searcher.segment_reader(0); + let segment_reader: &dyn SegmentReader = searcher.segment_reader(0); { let fast_field_reader_res = segment_reader.fast_fields().u64("text"); assert!(fast_field_reader_res.is_err()); diff --git a/src/postings/block_segment_postings.rs b/src/postings/block_segment_postings.rs index 47ace9975..f527d46c0 100644 --- a/src/postings/block_segment_postings.rs +++ b/src/postings/block_segment_postings.rs @@ -1,26 +1,17 @@ use std::io; -use common::VInt; +use common::{OwnedBytes, VInt}; -use crate::directory::{FileSlice, OwnedBytes}; +use super::FreqReadingOption; use crate::fieldnorm::FieldNormReader; -use crate::postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE}; -use crate::postings::{BlockInfo, FreqReadingOption, SkipReader}; +use crate::postings::compression::{BlockDecoder, VIntDecoder as _, COMPRESSION_BLOCK_SIZE}; +use crate::postings::skip::{BlockInfo, SkipReader}; use crate::query::Bm25Weight; use crate::schema::IndexRecordOption; use crate::{DocId, Score, TERMINATED}; -fn max_score>(mut it: I) -> Option { - it.next().map(|first| it.fold(first, Score::max)) -} - /// `BlockSegmentPostings` is a cursor iterating over blocks /// of documents. -/// -/// # Warning -/// -/// While it is useful for some very specific high-performance -/// use cases, you should prefer using `SegmentPostings` for most usage. #[derive(Clone)] pub struct BlockSegmentPostings { pub(crate) doc_decoder: BlockDecoder, @@ -88,19 +79,18 @@ fn split_into_skips_and_postings( } impl BlockSegmentPostings { - /// Opens a `BlockSegmentPostings`. + /// Opens a `StandardPostingsReader`. /// `doc_freq` is the number of documents in the posting list. /// `record_option` represents the amount of data available according to the schema. /// `requested_option` is the amount of data requested by the user. /// If for instance, we do not request for term frequencies, this function will not decompress /// term frequency blocks. 
- pub(crate) fn open( + pub fn open( doc_freq: u32, - data: FileSlice, + bytes: OwnedBytes, mut record_option: IndexRecordOption, requested_option: IndexRecordOption, ) -> io::Result { - let bytes = data.read_bytes()?; let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, bytes)?; let skip_reader = match skip_data_opt { Some(skip_data) => { @@ -138,6 +128,87 @@ impl BlockSegmentPostings { block_segment_postings.load_block(); Ok(block_segment_postings) } +} + +fn max_score>(mut it: I) -> Option { + it.next().map(|first| it.fold(first, Score::max)) +} + +impl BlockSegmentPostings { + /// Returns the overall number of documents in the block postings. + /// It does not take in account whether documents are deleted or not. + /// + /// This `doc_freq` is simply the sum of the length of all of the blocks + /// length, and it does not take in account deleted documents. + pub fn doc_freq(&self) -> u32 { + self.doc_freq + } + + /// Returns the array of docs in the current block. + /// + /// Before the first call to `.advance()`, the block + /// returned by `.docs()` is empty. + #[inline] + pub fn docs(&self) -> &[DocId] { + debug_assert!(self.block_loaded); + self.doc_decoder.output_array() + } + + /// Return the document at index `idx` of the block. + #[inline] + pub fn doc(&self, idx: usize) -> u32 { + self.doc_decoder.output(idx) + } + + /// Return the array of `term freq` in the block. + #[inline] + pub fn freqs(&self) -> &[u32] { + debug_assert!(self.block_loaded); + self.freq_decoder.output_array() + } + + /// Return the frequency at index `idx` of the block. + #[inline] + pub fn freq(&self, idx: usize) -> u32 { + debug_assert!(self.block_loaded); + self.freq_decoder.output(idx) + } + + /// Position on a block that may contains `target_doc`. + /// + /// If all docs are smaller than target, the block loaded may be empty, + /// or be the last an incomplete VInt block. 
+ pub fn seek(&mut self, target_doc: DocId) -> usize { + // Move to the block that might contain our document. + self.seek_block_without_loading(target_doc); + self.load_block(); + + // At this point we are on the block that might contain our document. + let doc = self.doc_decoder.seek_within_block(target_doc); + + // The last block is not full and padded with TERMINATED, + // so we are guaranteed to have at least one value (real or padding) + // that is >= target_doc. + debug_assert!(doc < COMPRESSION_BLOCK_SIZE); + + // `doc` is now the first element >= `target_doc`. + // If all docs are smaller than target, the current block is incomplete and padded + // with TERMINATED. After the search, the cursor points to the first TERMINATED. + doc + } + + /// Returns the current position offset in the position reader. + pub fn position_offset(&self) -> u64 { + self.skip_reader.position_offset() + } + + /// Advance to the next block. + pub fn advance(&mut self) { + self.skip_reader.advance(); + self.block_loaded = false; + self.block_max_score_cache = None; + self.load_block(); + } /// Returns the block_max_score for the current block. /// It does not require the block to be loaded. For instance, it is ok to call this method @@ -160,7 +231,7 @@ impl BlockSegmentPostings { } // this is the last block of the segment posting list. // If it is actually loaded, we can compute block max manually. - if self.block_is_loaded() { + if self.block_loaded { let docs = self.doc_decoder.output_array().iter().cloned(); let freqs = self.freq_decoder.output_array().iter().cloned(); let bm25_scores = docs.zip(freqs).map(|(doc, term_freq)| { @@ -177,112 +248,25 @@ impl BlockSegmentPostings { // We do not cache it however, so that it gets computed when once block is loaded. bm25_weight.max_score() } +} - pub(crate) fn freq_reading_option(&self) -> FreqReadingOption { - self.freq_reading_option - } - - // Resets the block segment postings on another position - // in the postings file. 
- // - // This is useful for enumerating through a list of terms, - // and consuming the associated posting lists while avoiding - // reallocating a `BlockSegmentPostings`. - // - // # Warning - // - // This does not reset the positions list. - pub(crate) fn reset(&mut self, doc_freq: u32, postings_data: OwnedBytes) -> io::Result<()> { - let (skip_data_opt, postings_data) = - split_into_skips_and_postings(doc_freq, postings_data)?; - self.data = postings_data; - self.block_max_score_cache = None; - self.block_loaded = false; - if let Some(skip_data) = skip_data_opt { - self.skip_reader.reset(skip_data, doc_freq); - } else { - self.skip_reader.reset(OwnedBytes::empty(), doc_freq); +impl BlockSegmentPostings { + /// Returns an empty segment postings object + pub fn empty() -> BlockSegmentPostings { + BlockSegmentPostings { + doc_decoder: BlockDecoder::with_val(TERMINATED), + block_loaded: true, + freq_decoder: BlockDecoder::with_val(1), + freq_reading_option: FreqReadingOption::NoFreq, + block_max_score_cache: None, + doc_freq: 0, + data: OwnedBytes::empty(), + skip_reader: SkipReader::new(OwnedBytes::empty(), 0, IndexRecordOption::Basic), } - self.doc_freq = doc_freq; - self.load_block(); - Ok(()) } - /// Returns the overall number of documents in the block postings. - /// It does not take in account whether documents are deleted or not. - /// - /// This `doc_freq` is simply the sum of the length of all of the blocks - /// length, and it does not take in account deleted documents. - pub fn doc_freq(&self) -> u32 { - self.doc_freq - } - - /// Returns the array of docs in the current block. - /// - /// Before the first call to `.advance()`, the block - /// returned by `.docs()` is empty. - #[inline] - pub fn docs(&self) -> &[DocId] { - debug_assert!(self.block_is_loaded()); - self.doc_decoder.output_array() - } - - /// Return the document at index `idx` of the block. 
- #[inline] - pub fn doc(&self, idx: usize) -> u32 { - self.doc_decoder.output(idx) - } - - /// Return the array of `term freq` in the block. - #[inline] - pub fn freqs(&self) -> &[u32] { - debug_assert!(self.block_is_loaded()); - self.freq_decoder.output_array() - } - - /// Return the frequency at index `idx` of the block. - #[inline] - pub fn freq(&self, idx: usize) -> u32 { - debug_assert!(self.block_is_loaded()); - self.freq_decoder.output(idx) - } - - /// Returns the length of the current block. - /// - /// All blocks have a length of `NUM_DOCS_PER_BLOCK`, - /// except the last block that may have a length - /// of any number between 1 and `NUM_DOCS_PER_BLOCK - 1` - #[inline] - pub fn block_len(&self) -> usize { - debug_assert!(self.block_is_loaded()); - self.doc_decoder.output_len - } - - /// Position on a block that may contains `target_doc`. - /// - /// If all docs are smaller than target, the block loaded may be empty, - /// or be the last an incomplete VInt block. - pub fn seek(&mut self, target_doc: DocId) -> usize { - // Move to the block that might contain our document. - self.seek_block(target_doc); - self.load_block(); - - // At this point we are on the block that might contain our document. - let doc = self.doc_decoder.seek_within_block(target_doc); - - // The last block is not full and padded with TERMINATED, - // so we are guaranteed to have at least one value (real or padding) - // that is >= target_doc. - debug_assert!(doc < COMPRESSION_BLOCK_SIZE); - - // `doc` is now the first element >= `target_doc`. - // If all docs are smaller than target, the current block is incomplete and padded - // with TERMINATED. After the search, the cursor points to the first TERMINATED. - doc - } - - pub(crate) fn position_offset(&self) -> u64 { - self.skip_reader.position_offset() + pub(crate) fn skip_reader(&self) -> &SkipReader { + &self.skip_reader } /// Dangerous API! 
This calls seeks the next block on the skip list, @@ -291,19 +275,15 @@ impl BlockSegmentPostings { /// `.load_block()` needs to be called manually afterwards. /// If all docs are smaller than target, the block loaded may be empty, /// or be the last an incomplete VInt block. - pub(crate) fn seek_block(&mut self, target_doc: DocId) { + pub(crate) fn seek_block_without_loading(&mut self, target_doc: DocId) { if self.skip_reader.seek(target_doc) { self.block_max_score_cache = None; self.block_loaded = false; } } - pub(crate) fn block_is_loaded(&self) -> bool { - self.block_loaded - } - pub(crate) fn load_block(&mut self) { - if self.block_is_loaded() { + if self.block_loaded { return; } let offset = self.skip_reader.byte_offset(); @@ -351,68 +331,39 @@ impl BlockSegmentPostings { } self.block_loaded = true; } - - /// Advance to the next block. - pub fn advance(&mut self) { - self.skip_reader.advance(); - self.block_loaded = false; - self.block_max_score_cache = None; - self.load_block(); - } - - /// Returns an empty segment postings object - pub fn empty() -> BlockSegmentPostings { - BlockSegmentPostings { - doc_decoder: BlockDecoder::with_val(TERMINATED), - block_loaded: true, - freq_decoder: BlockDecoder::with_val(1), - freq_reading_option: FreqReadingOption::NoFreq, - block_max_score_cache: None, - doc_freq: 0, - data: OwnedBytes::empty(), - skip_reader: SkipReader::new(OwnedBytes::empty(), 0, IndexRecordOption::Basic), - } - } - - pub(crate) fn skip_reader(&self) -> &SkipReader { - &self.skip_reader - } } #[cfg(test)] mod tests { - use common::HasLen; + use common::OwnedBytes; use super::BlockSegmentPostings; use crate::docset::{DocSet, TERMINATED}; - use crate::index::Index; use crate::postings::compression::COMPRESSION_BLOCK_SIZE; - use crate::postings::postings::Postings; + use crate::postings::serializer::PostingsSerializer; use crate::postings::SegmentPostings; - use crate::schema::{IndexRecordOption, Schema, Term, INDEXED}; - use crate::DocId; + use 
crate::schema::IndexRecordOption; - #[test] - fn test_empty_segment_postings() { - let mut postings = SegmentPostings::empty(); - assert_eq!(postings.doc(), TERMINATED); - assert_eq!(postings.advance(), TERMINATED); - assert_eq!(postings.advance(), TERMINATED); - assert_eq!(postings.doc_freq(), 0); - assert_eq!(postings.len(), 0); - } - - #[test] - fn test_empty_postings_doc_returns_terminated() { - let mut postings = SegmentPostings::empty(); - assert_eq!(postings.doc(), TERMINATED); - assert_eq!(postings.advance(), TERMINATED); - } - - #[test] - fn test_empty_postings_doc_term_freq_returns_0() { - let postings = SegmentPostings::empty(); - assert_eq!(postings.term_freq(), 1); + #[cfg(test)] + fn build_block_postings(docs: &[u32]) -> BlockSegmentPostings { + let doc_freq = docs.len() as u32; + let mut postings_serializer = + PostingsSerializer::new(1.0f32, IndexRecordOption::Basic, None); + postings_serializer.new_term(docs.len() as u32, false); + for doc in docs { + postings_serializer.write_doc(*doc, 1u32); + } + let mut buffer: Vec = Vec::new(); + postings_serializer + .close_term(doc_freq, &mut buffer) + .unwrap(); + BlockSegmentPostings::open( + doc_freq, + OwnedBytes::new(buffer), + IndexRecordOption::Basic, + IndexRecordOption::Basic, + ) + .unwrap() } #[test] @@ -427,7 +378,7 @@ mod tests { #[test] fn test_block_segment_postings() -> crate::Result<()> { - let mut block_segments = build_block_postings(&(0..100_000).collect::>())?; + let mut block_segments = build_block_postings(&(0..100_000).collect::>()); let mut offset: u32 = 0u32; // checking that the `doc_freq` is correct assert_eq!(block_segments.doc_freq(), 100_000); @@ -452,7 +403,7 @@ mod tests { doc_ids.push(129); doc_ids.push(130); { - let block_segments = build_block_postings(&doc_ids)?; + let block_segments = build_block_postings(&doc_ids); let mut docset = SegmentPostings::from_block_postings(block_segments, None); assert_eq!(docset.seek(128), 129); assert_eq!(docset.doc(), 129); @@ -461,7 
+412,7 @@ mod tests { assert_eq!(docset.advance(), TERMINATED); } { - let block_segments = build_block_postings(&doc_ids).unwrap(); + let block_segments = build_block_postings(&doc_ids); let mut docset = SegmentPostings::from_block_postings(block_segments, None); assert_eq!(docset.seek(129), 129); assert_eq!(docset.doc(), 129); @@ -470,7 +421,7 @@ mod tests { assert_eq!(docset.advance(), TERMINATED); } { - let block_segments = build_block_postings(&doc_ids)?; + let block_segments = build_block_postings(&doc_ids); let mut docset = SegmentPostings::from_block_postings(block_segments, None); assert_eq!(docset.doc(), 0); assert_eq!(docset.seek(131), TERMINATED); @@ -479,38 +430,13 @@ mod tests { Ok(()) } - fn build_block_postings(docs: &[DocId]) -> crate::Result { - let mut schema_builder = Schema::builder(); - let int_field = schema_builder.add_u64_field("id", INDEXED); - let schema = schema_builder.build(); - let index = Index::create_in_ram(schema); - let mut index_writer = index.writer_for_tests()?; - let mut last_doc = 0u32; - for &doc in docs { - for _ in last_doc..doc { - index_writer.add_document(doc!(int_field=>1u64))?; - } - index_writer.add_document(doc!(int_field=>0u64))?; - last_doc = doc + 1; - } - index_writer.commit()?; - let searcher = index.reader()?.searcher(); - let segment_reader = searcher.segment_reader(0); - let inverted_index = segment_reader.inverted_index(int_field).unwrap(); - let term = Term::from_field_u64(int_field, 0u64); - let term_info = inverted_index.get_term_info(&term)?.unwrap(); - let block_postings = inverted_index - .read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)?; - Ok(block_postings) - } - #[test] fn test_block_segment_postings_seek() -> crate::Result<()> { - let mut docs = vec![0]; + let mut docs = Vec::new(); for i in 0..1300 { docs.push((i * i / 100) + i); } - let mut block_postings = build_block_postings(&docs[..])?; + let mut block_postings = build_block_postings(&docs[..]); for i in &[0, 424, 
10000] { block_postings.seek(*i); let docs = block_postings.docs(); @@ -521,40 +447,4 @@ mod tests { assert_eq!(block_postings.doc(COMPRESSION_BLOCK_SIZE - 1), TERMINATED); Ok(()) } - - #[test] - fn test_reset_block_segment_postings() -> crate::Result<()> { - let mut schema_builder = Schema::builder(); - let int_field = schema_builder.add_u64_field("id", INDEXED); - let schema = schema_builder.build(); - let index = Index::create_in_ram(schema); - let mut index_writer = index.writer_for_tests()?; - // create two postings list, one containing even number, - // the other containing odd numbers. - for i in 0..6 { - let doc = doc!(int_field=> (i % 2) as u64); - index_writer.add_document(doc)?; - } - index_writer.commit()?; - let searcher = index.reader()?.searcher(); - let segment_reader = searcher.segment_reader(0); - - let mut block_segments; - { - let term = Term::from_field_u64(int_field, 0u64); - let inverted_index = segment_reader.inverted_index(int_field)?; - let term_info = inverted_index.get_term_info(&term)?.unwrap(); - block_segments = inverted_index - .read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)?; - } - assert_eq!(block_segments.docs(), &[0, 2, 4]); - { - let term = Term::from_field_u64(int_field, 1u64); - let inverted_index = segment_reader.inverted_index(int_field)?; - let term_info = inverted_index.get_term_info(&term)?.unwrap(); - inverted_index.reset_block_postings_from_terminfo(&term_info, &mut block_segments)?; - } - assert_eq!(block_segments.docs(), &[1, 3, 5]); - Ok(()) - } } diff --git a/src/postings/json_postings_writer.rs b/src/postings/json_postings_writer.rs index 99de17446..ca6c8248b 100644 --- a/src/postings/json_postings_writer.rs +++ b/src/postings/json_postings_writer.rs @@ -22,12 +22,6 @@ pub(crate) struct JsonPostingsWriter { non_str_posting_writer: SpecializedPostingsWriter, } -impl From> for Box { - fn from(json_postings_writer: JsonPostingsWriter) -> Box { - Box::new(json_postings_writer) - } -} - impl 
PostingsWriter for JsonPostingsWriter { #[inline] fn subscribe( diff --git a/src/postings/loaded_postings.rs b/src/postings/loaded_postings.rs index 7258f5cea..52bd4bb9e 100644 --- a/src/postings/loaded_postings.rs +++ b/src/postings/loaded_postings.rs @@ -1,5 +1,5 @@ use crate::docset::{DocSet, TERMINATED}; -use crate::postings::{Postings, SegmentPostings}; +use crate::postings::{DocFreq, Postings}; use crate::DocId; /// `LoadedPostings` is a `DocSet` and `Postings` implementation. @@ -25,16 +25,16 @@ impl LoadedPostings { /// Creates a new `LoadedPostings` from a `SegmentPostings`. /// /// It will also preload positions, if positions are available in the SegmentPostings. - pub fn load(segment_postings: &mut SegmentPostings) -> LoadedPostings { - let num_docs = segment_postings.doc_freq() as usize; + pub fn load(postings: &mut Box) -> LoadedPostings { + let num_docs: usize = u32::from(postings.doc_freq()) as usize; let mut doc_ids = Vec::with_capacity(num_docs); let mut positions = Vec::with_capacity(num_docs); let mut position_offsets = Vec::with_capacity(num_docs); - while segment_postings.doc() != TERMINATED { + while postings.doc() != TERMINATED { position_offsets.push(positions.len() as u32); - doc_ids.push(segment_postings.doc()); - segment_postings.append_positions_with_offset(0, &mut positions); - segment_postings.advance(); + doc_ids.push(postings.doc()); + postings.append_positions_with_offset(0, &mut positions); + postings.advance(); } position_offsets.push(positions.len() as u32); LoadedPostings { @@ -101,6 +101,14 @@ impl Postings for LoadedPostings { output.push(*pos + offset); } } + + fn has_freq(&self) -> bool { + true + } + + fn doc_freq(&self) -> DocFreq { + DocFreq::Exact(self.doc_ids.len() as u32) + } } #[cfg(test)] diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 13b6761cf..9a248bd1a 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -1,9 +1,16 @@ //! 
Postings module (also called inverted index) +use std::io; + +use common::OwnedBytes; + +use crate::fieldnorm::FieldNormReader; +use crate::positions::PositionReader; +use crate::query::Bm25Weight; +use crate::schema::IndexRecordOption; +use crate::Score; + mod block_search; - -pub(crate) use self::block_search::branchless_binary_search; - mod block_segment_postings; pub(crate) mod compression; mod indexing_context; @@ -16,22 +23,53 @@ mod recorder; mod segment_postings; /// Serializer module for the inverted index pub mod serializer; -mod skip; +pub(crate) mod skip; mod term_info; pub(crate) use loaded_postings::LoadedPostings; pub(crate) use stacker::compute_table_memory_size; +pub(crate) use self::block_search::branchless_binary_search; pub use self::block_segment_postings::BlockSegmentPostings; pub(crate) use self::indexing_context::IndexingContext; pub(crate) use self::per_field_postings_writer::PerFieldPostingsWriter; -pub use self::postings::Postings; -pub(crate) use self::postings_writer::{serialize_postings, IndexingPosition, PostingsWriter}; +pub use self::postings::{DocFreq, Postings}; +pub(crate) use self::postings_writer::{ + serialize_postings, IndexingPosition, PostingsWriter, PostingsWriterEnum, +}; pub use self::segment_postings::SegmentPostings; pub use self::serializer::{FieldSerializer, InvertedIndexSerializer}; -pub(crate) use self::skip::{BlockInfo, SkipReader}; pub use self::term_info::TermInfo; +/// Raw postings bytes and metadata read from storage. +#[derive(Debug, Clone)] +pub struct RawPostingsData { + /// Raw postings bytes for the term. + pub postings_data: OwnedBytes, + /// Raw positions bytes for the term, if positions are available. + pub positions_data: Option, + /// Record option of the indexed field. + pub record_option: IndexRecordOption, + /// Effective record option after downgrading to the indexed field capability. 
+ pub effective_option: IndexRecordOption, +} + +/// A light complement interface to Postings to allow block-max wand acceleration. +pub trait PostingsWithBlockMax: Postings { + /// Moves the postings to the block containing `target_doc` and returns + /// an upperbound of the score for documents in the block. + fn seek_block_max( + &mut self, + target_doc: crate::DocId, + fieldnorm_reader: &FieldNormReader, + similarity_weight: &Bm25Weight, + ) -> Score; + + /// Returns the last document in the current block (or Terminated if this + /// is the last block). + fn last_doc_in_block(&self) -> crate::DocId; +} + #[expect(clippy::enum_variant_names)] #[derive(Debug, PartialEq, Clone, Copy, Eq)] pub(crate) enum FreqReadingOption { @@ -40,6 +78,27 @@ pub(crate) enum FreqReadingOption { ReadFreq, } +/// Load postings from raw data bytes into a `SegmentPostings` object. +pub fn load_postings_from_raw_data( + doc_freq: u32, + postings_data: RawPostingsData, +) -> io::Result { + let RawPostingsData { + postings_data, + positions_data: positions_data_opt, + record_option, + effective_option, + } = postings_data; + let requested_option = effective_option; + let block_segment_postings = + BlockSegmentPostings::open(doc_freq, postings_data, record_option, requested_option)?; + let position_reader = positions_data_opt.map(PositionReader::open).transpose()?; + Ok(SegmentPostings::from_block_postings( + block_segment_postings, + position_reader, + )) +} + #[cfg(test)] pub(crate) mod tests { use std::mem; @@ -47,9 +106,10 @@ pub(crate) mod tests { use super::{InvertedIndexSerializer, Postings}; use crate::docset::{DocSet, TERMINATED}; use crate::fieldnorm::FieldNormReader; - use crate::index::{Index, SegmentComponent, SegmentReader}; + use crate::index::{Index, SegmentComponent}; use crate::indexer::operation::AddOperation; use crate::indexer::SegmentWriter; + use crate::postings::DocFreq; use crate::query::Scorer; use crate::schema::{ Field, IndexRecordOption, Schema, Term, 
TextFieldIndexing, TextOptions, INDEXED, TEXT, @@ -259,7 +319,7 @@ pub(crate) mod tests { segment_writer.finalize()?; } { - let segment_reader = SegmentReader::open(&segment)?; + let segment_reader = crate::TantivySegmentReader::open(&segment)?; { let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field)?; assert_eq!(fieldnorm_reader.fieldnorm(0), 8 + 5); @@ -280,11 +340,11 @@ pub(crate) mod tests { } { let term_a = Term::from_field_text(text_field, "a"); - let mut postings_a = segment_reader + let mut postings_a: Box = segment_reader .inverted_index(term_a.field())? .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)? .unwrap(); - assert_eq!(postings_a.len(), 1000); + assert_eq!(postings_a.doc_freq(), DocFreq::Exact(1000)); assert_eq!(postings_a.doc(), 0); assert_eq!(postings_a.term_freq(), 6); postings_a.positions(&mut positions); @@ -307,7 +367,7 @@ pub(crate) mod tests { .inverted_index(term_e.field())? .read_postings(&term_e, IndexRecordOption::WithFreqsAndPositions)? 
.unwrap(); - assert_eq!(postings_e.len(), 1000 - 2); + assert_eq!(postings_e.doc_freq(), DocFreq::Exact(1000 - 2)); for i in 2u32..1000u32 { assert_eq!(postings_e.term_freq(), i); postings_e.positions(&mut positions); diff --git a/src/postings/per_field_postings_writer.rs b/src/postings/per_field_postings_writer.rs index f3d6d6534..bf547cf5a 100644 --- a/src/postings/per_field_postings_writer.rs +++ b/src/postings/per_field_postings_writer.rs @@ -1,16 +1,15 @@ use crate::postings::json_postings_writer::JsonPostingsWriter; -use crate::postings::postings_writer::SpecializedPostingsWriter; +use crate::postings::postings_writer::{PostingsWriterEnum, SpecializedPostingsWriter}; use crate::postings::recorder::{DocIdRecorder, TermFrequencyRecorder, TfAndPositionRecorder}; -use crate::postings::PostingsWriter; use crate::schema::{Field, FieldEntry, FieldType, IndexRecordOption, Schema}; pub(crate) struct PerFieldPostingsWriter { - per_field_postings_writers: Vec>, + per_field_postings_writers: Vec, } impl PerFieldPostingsWriter { pub fn for_schema(schema: &Schema) -> Self { - let per_field_postings_writers = schema + let per_field_postings_writers: Vec = schema .fields() .map(|(_, field_entry)| posting_writer_from_field_entry(field_entry)) .collect(); @@ -19,16 +18,16 @@ impl PerFieldPostingsWriter { } } - pub(crate) fn get_for_field(&self, field: Field) -> &dyn PostingsWriter { - self.per_field_postings_writers[field.field_id() as usize].as_ref() + pub(crate) fn get_for_field(&self, field: Field) -> &PostingsWriterEnum { + &self.per_field_postings_writers[field.field_id() as usize] } - pub(crate) fn get_for_field_mut(&mut self, field: Field) -> &mut dyn PostingsWriter { - self.per_field_postings_writers[field.field_id() as usize].as_mut() + pub(crate) fn get_for_field_mut(&mut self, field: Field) -> &mut PostingsWriterEnum { + &mut self.per_field_postings_writers[field.field_id() as usize] } } -fn posting_writer_from_field_entry(field_entry: &FieldEntry) -> Box { +fn 
posting_writer_from_field_entry(field_entry: &FieldEntry) -> PostingsWriterEnum { match *field_entry.field_type() { FieldType::Str(ref text_options) => text_options .get_indexing_options() @@ -51,7 +50,7 @@ fn posting_writer_from_field_entry(field_entry: &FieldEntry) -> Box Box::>::default(), + | FieldType::Facet(_) => >::default().into(), FieldType::JsonObject(ref json_object_options) => { if let Some(text_indexing_option) = json_object_options.get_text_indexing_options() { match text_indexing_option.index_option() { diff --git a/src/postings/postings.rs b/src/postings/postings.rs index 8606f00a9..e216aa9e9 100644 --- a/src/postings/postings.rs +++ b/src/postings/postings.rs @@ -1,5 +1,25 @@ use crate::docset::DocSet; +/// Result of the doc_freq method. +/// +/// Postings can inform us that the document frequency is approximate. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum DocFreq { + /// The document frequency is approximate. + Approximate(u32), + /// The document frequency is exact. + Exact(u32), +} + +impl From for u32 { + fn from(doc_freq: DocFreq) -> Self { + match doc_freq { + DocFreq::Approximate(approximate_doc_freq) => approximate_doc_freq, + DocFreq::Exact(doc_freq) => doc_freq, + } + } +} + /// Postings (also called inverted list) /// /// For a given term, it is the list of doc ids of the doc @@ -14,6 +34,9 @@ pub trait Postings: DocSet + 'static { /// The number of times the term appears in the document. fn term_freq(&self) -> u32; + /// Returns the number of documents containing the term in the segment. + fn doc_freq(&self) -> DocFreq; + /// Returns the positions offsetted with a given value. /// It is not necessary to clear the `output` before calling this method. /// The output vector will be resized to the `term_freq`. @@ -31,6 +54,16 @@ pub trait Postings: DocSet + 'static { fn positions(&mut self, output: &mut Vec) { self.positions_with_offset(0u32, output); } + + /// Returns true if the term_frequency is available. 
+ /// + /// This is a tricky question, because on JSON fields, it is possible + /// for a text term to have term freq, whereas a number term in the field has none. + /// + /// This function returns whether the actual term has term frequencies or not. + /// In this above JSON field example, `has_freq` should return true for the + /// earlier and false for the latter. + fn has_freq(&self) -> bool; } impl Postings for Box { @@ -41,4 +74,12 @@ impl Postings for Box { fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec) { (**self).append_positions_with_offset(offset, output); } + + fn has_freq(&self) -> bool { + (**self).has_freq() + } + + fn doc_freq(&self) -> DocFreq { + (**self).doc_freq() + } } diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index c7a94ecef..816f5c184 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -7,7 +7,10 @@ use stacker::Addr; use crate::fieldnorm::FieldNormReaders; use crate::indexer::indexing_term::IndexingTerm; use crate::indexer::path_to_unordered_id::OrderedPathId; -use crate::postings::recorder::{BufferLender, Recorder}; +use crate::postings::json_postings_writer::JsonPostingsWriter; +use crate::postings::recorder::{ + BufferLender, DocIdRecorder, Recorder, TermFrequencyRecorder, TfAndPositionRecorder, +}; use crate::postings::{ FieldSerializer, IndexingContext, InvertedIndexSerializer, PerFieldPostingsWriter, }; @@ -100,6 +103,141 @@ pub(crate) struct IndexingPosition { pub end_position: u32, } +pub enum PostingsWriterEnum { + DocId(SpecializedPostingsWriter), + DocIdTf(SpecializedPostingsWriter), + DocTfAndPosition(SpecializedPostingsWriter), + JsonDocId(JsonPostingsWriter), + JsonDocIdTf(JsonPostingsWriter), + JsonDocTfAndPosition(JsonPostingsWriter), +} + +impl From> for PostingsWriterEnum { + fn from(doc_id_recorder_writer: SpecializedPostingsWriter) -> Self { + PostingsWriterEnum::DocId(doc_id_recorder_writer) + } +} + +impl From> for 
PostingsWriterEnum { + fn from(doc_id_tf_recorder_writer: SpecializedPostingsWriter) -> Self { + PostingsWriterEnum::DocIdTf(doc_id_tf_recorder_writer) + } +} + +impl From> for PostingsWriterEnum { + fn from( + doc_id_tf_and_positions_recorder_writer: SpecializedPostingsWriter, + ) -> Self { + PostingsWriterEnum::DocTfAndPosition(doc_id_tf_and_positions_recorder_writer) + } +} + +impl From> for PostingsWriterEnum { + fn from(doc_id_recorder_writer: JsonPostingsWriter) -> Self { + PostingsWriterEnum::JsonDocId(doc_id_recorder_writer) + } +} + +impl From> for PostingsWriterEnum { + fn from(doc_id_tf_recorder_writer: JsonPostingsWriter) -> Self { + PostingsWriterEnum::JsonDocIdTf(doc_id_tf_recorder_writer) + } +} + +impl From> for PostingsWriterEnum { + fn from( + doc_id_tf_and_positions_recorder_writer: JsonPostingsWriter, + ) -> Self { + PostingsWriterEnum::JsonDocTfAndPosition(doc_id_tf_and_positions_recorder_writer) + } +} + +impl PostingsWriter for PostingsWriterEnum { + fn subscribe(&mut self, doc: DocId, pos: u32, term: &IndexingTerm, ctx: &mut IndexingContext) { + match self { + PostingsWriterEnum::DocId(writer) => writer.subscribe(doc, pos, term, ctx), + PostingsWriterEnum::DocIdTf(writer) => writer.subscribe(doc, pos, term, ctx), + PostingsWriterEnum::DocTfAndPosition(writer) => writer.subscribe(doc, pos, term, ctx), + PostingsWriterEnum::JsonDocId(writer) => writer.subscribe(doc, pos, term, ctx), + PostingsWriterEnum::JsonDocIdTf(writer) => writer.subscribe(doc, pos, term, ctx), + PostingsWriterEnum::JsonDocTfAndPosition(writer) => { + writer.subscribe(doc, pos, term, ctx) + } + } + } + + fn serialize( + &self, + term_addrs: &[(Field, OrderedPathId, &[u8], Addr)], + ordered_id_to_path: &[&str], + ctx: &IndexingContext, + serializer: &mut FieldSerializer, + ) -> io::Result<()> { + match self { + PostingsWriterEnum::DocId(writer) => { + writer.serialize(term_addrs, ordered_id_to_path, ctx, serializer) + } + PostingsWriterEnum::DocIdTf(writer) => { + 
writer.serialize(term_addrs, ordered_id_to_path, ctx, serializer) + } + PostingsWriterEnum::DocTfAndPosition(writer) => { + writer.serialize(term_addrs, ordered_id_to_path, ctx, serializer) + } + PostingsWriterEnum::JsonDocId(writer) => { + writer.serialize(term_addrs, ordered_id_to_path, ctx, serializer) + } + PostingsWriterEnum::JsonDocIdTf(writer) => { + writer.serialize(term_addrs, ordered_id_to_path, ctx, serializer) + } + PostingsWriterEnum::JsonDocTfAndPosition(writer) => { + writer.serialize(term_addrs, ordered_id_to_path, ctx, serializer) + } + } + } + + /// Tokenize a text and subscribe all of its token. + fn index_text( + &mut self, + doc_id: DocId, + token_stream: &mut dyn TokenStream, + term_buffer: &mut IndexingTerm, + ctx: &mut IndexingContext, + indexing_position: &mut IndexingPosition, + ) { + match self { + PostingsWriterEnum::DocId(writer) => { + writer.index_text(doc_id, token_stream, term_buffer, ctx, indexing_position) + } + PostingsWriterEnum::DocIdTf(writer) => { + writer.index_text(doc_id, token_stream, term_buffer, ctx, indexing_position) + } + PostingsWriterEnum::DocTfAndPosition(writer) => { + writer.index_text(doc_id, token_stream, term_buffer, ctx, indexing_position) + } + PostingsWriterEnum::JsonDocId(writer) => { + writer.index_text(doc_id, token_stream, term_buffer, ctx, indexing_position) + } + PostingsWriterEnum::JsonDocIdTf(writer) => { + writer.index_text(doc_id, token_stream, term_buffer, ctx, indexing_position) + } + PostingsWriterEnum::JsonDocTfAndPosition(writer) => { + writer.index_text(doc_id, token_stream, term_buffer, ctx, indexing_position) + } + } + } + + fn total_num_tokens(&self) -> u64 { + match self { + PostingsWriterEnum::DocId(writer) => writer.total_num_tokens(), + PostingsWriterEnum::DocIdTf(writer) => writer.total_num_tokens(), + PostingsWriterEnum::DocTfAndPosition(writer) => writer.total_num_tokens(), + PostingsWriterEnum::JsonDocId(writer) => writer.total_num_tokens(), + 
PostingsWriterEnum::JsonDocIdTf(writer) => writer.total_num_tokens(), + PostingsWriterEnum::JsonDocTfAndPosition(writer) => writer.total_num_tokens(), + } + } +} + /// The `PostingsWriter` is in charge of receiving documenting /// and building a `Segment` in anonymous memory. /// @@ -171,14 +309,6 @@ pub(crate) struct SpecializedPostingsWriter { _recorder_type: PhantomData, } -impl From> for Box { - fn from( - specialized_postings_writer: SpecializedPostingsWriter, - ) -> Box { - Box::new(specialized_postings_writer) - } -} - impl SpecializedPostingsWriter { #[inline] pub(crate) fn serialize_one_term( diff --git a/src/postings/recorder.rs b/src/postings/recorder.rs index 58610c139..76fff8844 100644 --- a/src/postings/recorder.rs +++ b/src/postings/recorder.rs @@ -70,7 +70,7 @@ pub(crate) trait Recorder: Copy + Default + Send + Sync + 'static { fn serialize( &self, arena: &MemoryArena, - serializer: &mut FieldSerializer<'_>, + serializer: &mut FieldSerializer, buffer_lender: &mut BufferLender, ); /// Returns the number of document containing this term. 
@@ -113,7 +113,7 @@ impl Recorder for DocIdRecorder { fn serialize( &self, arena: &MemoryArena, - serializer: &mut FieldSerializer<'_>, + serializer: &mut FieldSerializer, buffer_lender: &mut BufferLender, ) { let buffer = buffer_lender.lend_u8(); @@ -181,7 +181,7 @@ impl Recorder for TermFrequencyRecorder { fn serialize( &self, arena: &MemoryArena, - serializer: &mut FieldSerializer<'_>, + serializer: &mut FieldSerializer, buffer_lender: &mut BufferLender, ) { let buffer = buffer_lender.lend_u8(); @@ -238,7 +238,7 @@ impl Recorder for TfAndPositionRecorder { fn serialize( &self, arena: &MemoryArena, - serializer: &mut FieldSerializer<'_>, + serializer: &mut FieldSerializer, buffer_lender: &mut BufferLender, ) { let (buffer_u8, buffer_positions) = buffer_lender.lend_all(); diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index e8928b90d..c1dba8665 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -1,11 +1,13 @@ -use common::HasLen; +use common::BitSet; +use super::{BlockSegmentPostings, PostingsWithBlockMax}; use crate::docset::DocSet; -use crate::fastfield::AliveBitSet; +use crate::fieldnorm::FieldNormReader; use crate::positions::PositionReader; use crate::postings::compression::COMPRESSION_BLOCK_SIZE; -use crate::postings::{BlockSegmentPostings, Postings}; -use crate::{DocId, TERMINATED}; +use crate::postings::{DocFreq, Postings}; +use crate::query::Bm25Weight; +use crate::{DocId, Score}; /// `SegmentPostings` represents the inverted list or postings associated with /// a term in a `Segment`. @@ -29,31 +31,6 @@ impl SegmentPostings { } } - /// Compute the number of non-deleted documents. - /// - /// This method will clone and scan through the posting lists. - /// (this is a rather expensive operation). 
- pub fn doc_freq_given_deletes(&self, alive_bitset: &AliveBitSet) -> u32 { - let mut docset = self.clone(); - let mut doc_freq = 0; - loop { - let doc = docset.doc(); - if doc == TERMINATED { - return doc_freq; - } - if alive_bitset.is_alive(doc) { - doc_freq += 1u32; - } - docset.advance(); - } - } - - /// Returns the overall number of documents in the block postings. - /// It does not take in account whether documents are deleted or not. - pub fn doc_freq(&self) -> u32 { - self.block_cursor.doc_freq() - } - /// Creates a segment postings object with the given documents /// and no frequency encoded. /// @@ -64,11 +41,13 @@ impl SegmentPostings { /// buffer with the serialized data. #[cfg(test)] pub fn create_from_docs(docs: &[u32]) -> SegmentPostings { - use crate::directory::FileSlice; - use crate::postings::serializer::PostingsSerializer; + use common::OwnedBytes; + use crate::schema::IndexRecordOption; let mut buffer = Vec::new(); { + use crate::postings::serializer::PostingsSerializer; + let mut postings_serializer = PostingsSerializer::new(0.0, IndexRecordOption::Basic, None); postings_serializer.new_term(docs.len() as u32, false); @@ -81,7 +60,7 @@ impl SegmentPostings { } let block_segment_postings = BlockSegmentPostings::open( docs.len() as u32, - FileSlice::from(buffer), + OwnedBytes::new(buffer), IndexRecordOption::Basic, IndexRecordOption::Basic, ) @@ -95,7 +74,8 @@ impl SegmentPostings { doc_and_tfs: &[(u32, u32)], fieldnorms: Option<&[u32]>, ) -> SegmentPostings { - use crate::directory::FileSlice; + use common::OwnedBytes; + use crate::fieldnorm::FieldNormReader; use crate::postings::serializer::PostingsSerializer; use crate::schema::IndexRecordOption; @@ -128,7 +108,7 @@ impl SegmentPostings { .unwrap(); let block_segment_postings = BlockSegmentPostings::open( doc_and_tfs.len() as u32, - FileSlice::from(buffer), + OwnedBytes::new(buffer), IndexRecordOption::WithFreqs, IndexRecordOption::WithFreqs, ) @@ -158,7 +138,6 @@ impl DocSet for 
SegmentPostings { // next needs to be called a first time to point to the correct element. #[inline] fn advance(&mut self) -> DocId { - debug_assert!(self.block_cursor.block_is_loaded()); if self.cur == COMPRESSION_BLOCK_SIZE - 1 { self.cur = 0; self.block_cursor.advance(); @@ -197,13 +176,31 @@ impl DocSet for SegmentPostings { } fn size_hint(&self) -> u32 { - self.len() as u32 + self.doc_freq().into() } -} -impl HasLen for SegmentPostings { - fn len(&self) -> usize { - self.block_cursor.doc_freq() as usize + fn fill_bitset(&mut self, bitset: &mut BitSet) { + let bitset_max_value: DocId = bitset.max_value(); + loop { + let docs = self.block_cursor.docs(); + let Some(&last_doc) = docs.last() else { + break; + }; + if last_doc < bitset_max_value { + // All docs are within the range of the bitset + for &doc in docs { + bitset.insert(doc); + } + } else { + for &doc in docs { + if doc < bitset_max_value { + bitset.insert(doc); + } + } + break; + } + self.block_cursor.advance(); + } } } @@ -229,6 +226,13 @@ impl Postings for SegmentPostings { self.block_cursor.freq(self.cur) } + /// Returns the overall number of documents in the block postings. + /// It does not take in account whether documents are deleted or not. 
+ #[inline(always)] + fn doc_freq(&self) -> DocFreq { + DocFreq::Exact(self.block_cursor.doc_freq()) + } + fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec) { let term_freq = self.term_freq(); let prev_len = output.len(); @@ -252,24 +256,44 @@ impl Postings for SegmentPostings { } } } + + fn has_freq(&self) -> bool { + !self.block_cursor.freqs().is_empty() + } +} + +impl PostingsWithBlockMax for SegmentPostings { + #[inline] + fn seek_block_max( + &mut self, + target_doc: crate::DocId, + fieldnorm_reader: &FieldNormReader, + similarity_weight: &Bm25Weight, + ) -> Score { + self.block_cursor.seek_block_without_loading(target_doc); + self.block_cursor + .block_max_score(fieldnorm_reader, similarity_weight) + } + + #[inline] + fn last_doc_in_block(&self) -> crate::DocId { + self.block_cursor.skip_reader().last_doc_in_block() + } } #[cfg(test)] mod tests { - - use common::HasLen; - use super::SegmentPostings; use crate::docset::{DocSet, TERMINATED}; - use crate::fastfield::AliveBitSet; - use crate::postings::postings::Postings; + use crate::postings::Postings; #[test] fn test_empty_segment_postings() { let mut postings = SegmentPostings::empty(); + assert_eq!(postings.doc(), TERMINATED); assert_eq!(postings.advance(), TERMINATED); assert_eq!(postings.advance(), TERMINATED); - assert_eq!(postings.len(), 0); + assert_eq!(postings.doc_freq(), crate::postings::DocFreq::Exact(0)); } #[test] @@ -284,15 +308,4 @@ mod tests { let postings = SegmentPostings::empty(); assert_eq!(postings.term_freq(), 1); } - - #[test] - fn test_doc_freq() { - let docs = SegmentPostings::create_from_docs(&[0, 2, 10]); - assert_eq!(docs.doc_freq(), 3); - let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[2], 12); - assert_eq!(docs.doc_freq_given_deletes(&alive_bitset), 2); - let all_deleted = - AliveBitSet::for_test_from_deleted_docs(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 12); - assert_eq!(docs.doc_freq_given_deletes(&all_deleted), 0); - } } diff --git 
a/src/postings/serializer.rs b/src/postings/serializer.rs index 726cce03b..722712c33 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -8,7 +8,7 @@ use crate::directory::{CompositeWrite, WritePtr}; use crate::fieldnorm::FieldNormReader; use crate::index::Segment; use crate::positions::PositionSerializer; -use crate::postings::compression::{BlockEncoder, VIntEncoder, COMPRESSION_BLOCK_SIZE}; +use crate::postings::compression::{BlockEncoder, VIntEncoder as _, COMPRESSION_BLOCK_SIZE}; use crate::postings::skip::SkipSerializer; use crate::query::Bm25Weight; use crate::schema::{Field, FieldEntry, IndexRecordOption, Schema}; diff --git a/src/postings/skip.rs b/src/postings/skip.rs index 3900fd40e..e4a3584bd 100644 --- a/src/postings/skip.rs +++ b/src/postings/skip.rs @@ -146,23 +146,6 @@ impl SkipReader { skip_reader } - pub fn reset(&mut self, data: OwnedBytes, doc_freq: u32) { - self.last_doc_in_block = if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 { - 0 - } else { - TERMINATED - }; - self.last_doc_in_previous_block = 0u32; - self.owned_read = data; - self.block_info = BlockInfo::VInt { num_docs: doc_freq }; - self.byte_offset = 0; - self.remaining_docs = doc_freq; - self.position_offset = 0u64; - if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 { - self.read_block_info(); - } - } - // Returns the block max score for this block if available. 
// // The block max score is available for all full bitpacked block, diff --git a/src/query/all_query.rs b/src/query/all_query.rs index 5431a3a1b..aad3fbb2a 100644 --- a/src/query/all_query.rs +++ b/src/query/all_query.rs @@ -2,7 +2,7 @@ use crate::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN, TERMINATED}; use crate::index::SegmentReader; use crate::query::boost_query::BoostScorer; use crate::query::explanation::does_not_match; -use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight}; +use crate::query::{box_scorer, EnableScoring, Explanation, Query, Scorer, Weight}; use crate::{DocId, Score}; /// Query that matches all of the documents. @@ -21,16 +21,16 @@ impl Query for AllQuery { pub struct AllWeight; impl Weight for AllWeight { - fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { + fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result> { let all_scorer = AllScorer::new(reader.max_doc()); if boost != 1.0 { - Ok(Box::new(BoostScorer::new(all_scorer, boost))) + Ok(box_scorer(BoostScorer::new(all_scorer, boost))) } else { - Ok(Box::new(all_scorer)) + Ok(box_scorer(all_scorer)) } } - fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result { + fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result { if doc >= reader.max_doc() { return Err(does_not_match(doc)); } diff --git a/src/query/automaton_weight.rs b/src/query/automaton_weight.rs index 5f1053fb6..a5e9e1529 100644 --- a/src/query/automaton_weight.rs +++ b/src/query/automaton_weight.rs @@ -5,12 +5,14 @@ use common::BitSet; use tantivy_fst::Automaton; use super::phrase_prefix_query::prefix_end; -use crate::index::SegmentReader; +use crate::index::{ + try_downcast_and_call, InvertedIndexReader, SegmentReader, TypedInvertedIndexReaderCb, +}; use crate::postings::TermInfo; use crate::query::{BitSetDocSet, ConstScorer, Explanation, Scorer, Weight}; -use crate::schema::{Field, IndexRecordOption}; +use crate::schema::Field; use 
crate::termdict::{TermDictionary, TermStreamer}; -use crate::{DocId, Score, TantivyError}; +use crate::{DocId, DocSet, Score, TantivyError}; /// A weight struct for Fuzzy Term and Regex Queries pub struct AutomatonWeight { @@ -67,7 +69,7 @@ where } /// Returns the term infos that match the automaton - pub fn get_match_term_infos(&self, reader: &SegmentReader) -> crate::Result> { + pub fn get_match_term_infos(&self, reader: &dyn SegmentReader) -> crate::Result> { let inverted_index = reader.inverted_index(self.field)?; let term_dict = inverted_index.terms(); let mut term_stream = self.automaton_stream(term_dict)?; @@ -84,33 +86,42 @@ where A: Automaton + Send + Sync + 'static, A::State: Clone, { - fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { + fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result> { let max_doc = reader.max_doc(); let mut doc_bitset = BitSet::with_max_value(max_doc); let inverted_index = reader.inverted_index(self.field)?; let term_dict = inverted_index.terms(); let mut term_stream = self.automaton_stream(term_dict)?; - while term_stream.advance() { - let term_info = term_stream.value(); - let mut block_segment_postings = inverted_index - .read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic)?; - loop { - let docs = block_segment_postings.docs(); - if docs.is_empty() { - break; + struct FillBitsetLoop<'a, 'b, A: Automaton> + where A::State: Clone + { + term_stream: &'a mut TermStreamer<'b, &'b A>, + bitset: &'a mut BitSet, + } + impl TypedInvertedIndexReaderCb> for FillBitsetLoop<'_, '_, A> + where A::State: Clone + { + fn call(&mut self, reader: &I) -> io::Result<()> { + while self.term_stream.advance() { + let term_info = self.term_stream.value(); + reader.fill_bitset_from_terminfo(term_info, self.bitset)?; } - for &doc in docs { - doc_bitset.insert(doc); - } - block_segment_postings.advance(); + Ok(()) } } + try_downcast_and_call( + inverted_index.as_ref(), + &mut FillBitsetLoop 
{ + term_stream: &mut term_stream, + bitset: &mut doc_bitset, + }, + )?; let doc_bitset = BitSetDocSet::from(doc_bitset); let const_scorer = ConstScorer::new(doc_bitset, boost); Ok(Box::new(const_scorer)) } - fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result { + fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result { let mut scorer = self.scorer(reader, 1.0)?; if scorer.seek(doc) == doc { Ok(Explanation::new("AutomatonScorer", 1.0)) diff --git a/src/query/bitset/mod.rs b/src/query/bitset/mod.rs index d25034c8e..dee61b7ee 100644 --- a/src/query/bitset/mod.rs +++ b/src/query/bitset/mod.rs @@ -24,6 +24,13 @@ impl BitSetDocSet { self.cursor_bucket = bucket_addr; self.cursor_tinybitset = self.docs.tinyset(bucket_addr); } + + /// Returns the number of documents in the bitset. + /// + /// This call is not free: it will bitcount the number of bits in the bitset. + pub fn doc_freq(&self) -> u32 { + self.docs.len() as u32 + } } impl From for BitSetDocSet { diff --git a/src/query/boolean_query/block_wand.rs b/src/query/boolean_query/block_wand.rs index 6b2f2d6e3..5f239dc9e 100644 --- a/src/query/boolean_query/block_wand.rs +++ b/src/query/boolean_query/block_wand.rs @@ -1,5 +1,6 @@ use std::ops::{Deref, DerefMut}; +use crate::postings::PostingsWithBlockMax; use crate::query::term_query::TermScorer; use crate::query::Scorer; use crate::{DocId, DocSet, Score, TERMINATED}; @@ -13,8 +14,8 @@ use crate::{DocId, DocSet, Score, TERMINATED}; /// We always have `before_pivot_len` < `pivot_len`. /// /// `None` is returned if we establish that no document can exceed the threshold. 
-fn find_pivot_doc( - term_scorers: &[TermScorerWithMaxScore], +fn find_pivot_doc( + term_scorers: &[TermScorerWithMaxScore], threshold: Score, ) -> Option<(usize, usize, DocId)> { let mut max_score = 0.0; @@ -46,8 +47,8 @@ fn find_pivot_doc( /// the next doc candidate defined by the min of `last_doc_in_block + 1` for /// scorer in scorers[..pivot_len] and `scorer.doc()` for scorer in scorers[pivot_len..]. /// Note: before and after calling this method, scorers need to be sorted by their `.doc()`. -fn block_max_was_too_low_advance_one_scorer( - scorers: &mut [TermScorerWithMaxScore], +fn block_max_was_too_low_advance_one_scorer( + scorers: &mut [TermScorerWithMaxScore], pivot_len: usize, ) { debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc()))); @@ -82,7 +83,10 @@ fn block_max_was_too_low_advance_one_scorer( // Given a list of term_scorers and a `ord` and assuming that `term_scorers[ord]` is sorted // except term_scorers[ord] that might be in advance compared to its ranks, // bubble up term_scorers[ord] in order to restore the ordering. -fn restore_ordering(term_scorers: &mut [TermScorerWithMaxScore], ord: usize) { +fn restore_ordering( + term_scorers: &mut [TermScorerWithMaxScore], + ord: usize, +) { let doc = term_scorers[ord].doc(); for i in ord + 1..term_scorers.len() { if term_scorers[i].doc() >= doc { @@ -97,9 +101,10 @@ fn restore_ordering(term_scorers: &mut [TermScorerWithMaxScore], ord: usize) { // If this works, return true. // If this fails (ie: one of the term_scorer does not contain `pivot_doc` and seek goes past the // pivot), reorder the term_scorers to ensure the list is still sorted and returns `false`. -// If a term_scorer reach TERMINATED in the process return false remove the term_scorer and return. -fn align_scorers( - term_scorers: &mut Vec, +// If a term_scorer reach TERMINATED in the process return false remove the term_scorer and +// return. 
+fn align_scorers( + term_scorers: &mut Vec>, pivot_doc: DocId, before_pivot_len: usize, ) -> bool { @@ -126,7 +131,10 @@ fn align_scorers( // Assumes terms_scorers[..pivot_len] are positioned on the same doc (pivot_doc). // Advance term_scorers[..pivot_len] and out of these removes the terminated scores. // Restores the ordering of term_scorers. -fn advance_all_scorers_on_pivot(term_scorers: &mut Vec, pivot_len: usize) { +fn advance_all_scorers_on_pivot( + term_scorers: &mut Vec>, + pivot_len: usize, +) { for term_scorer in &mut term_scorers[..pivot_len] { term_scorer.advance(); } @@ -145,12 +153,12 @@ fn advance_all_scorers_on_pivot(term_scorers: &mut Vec, /// Implements the WAND (Weak AND) algorithm for dynamic pruning /// described in the paper "Faster Top-k Document Retrieval Using Block-Max Indexes". /// Link: -pub fn block_wand( - mut scorers: Vec, +pub fn block_wand( + mut scorers: Vec>, mut threshold: Score, callback: &mut dyn FnMut(u32, Score) -> Score, ) { - let mut scorers: Vec = scorers + let mut scorers: Vec> = scorers .iter_mut() .map(TermScorerWithMaxScore::from) .collect(); @@ -166,10 +174,7 @@ pub fn block_wand( let block_max_score_upperbound: Score = scorers[..pivot_len] .iter_mut() - .map(|scorer| { - scorer.seek_block(pivot_doc); - scorer.block_max_score() - }) + .map(|scorer| scorer.seek_block_max(pivot_doc)) .sum(); // Beware after shallow advance, skip readers can be in advance compared to @@ -220,21 +225,22 @@ pub fn block_wand( /// - On a block, advance until the end and execute `callback` when the doc score is greater or /// equal to the `threshold`. pub fn block_wand_single_scorer( - mut scorer: TermScorer, + mut scorer: TermScorer, mut threshold: Score, callback: &mut dyn FnMut(u32, Score) -> Score, ) { let mut doc = scorer.doc(); + let mut block_max_score = scorer.seek_block_max(doc); loop { // We position the scorer on a block that can reach // the threshold. 
- while scorer.block_max_score() < threshold { + while block_max_score < threshold { let last_doc_in_block = scorer.last_doc_in_block(); if last_doc_in_block == TERMINATED { return; } doc = last_doc_in_block + 1; - scorer.seek_block(doc); + block_max_score = scorer.seek_block_max(doc); } // Seek will effectively load that block. doc = scorer.seek(doc); @@ -256,31 +262,33 @@ pub fn block_wand_single_scorer( } } doc += 1; - scorer.seek_block(doc); + block_max_score = scorer.seek_block_max(doc); } } -struct TermScorerWithMaxScore<'a> { - scorer: &'a mut TermScorer, +struct TermScorerWithMaxScore<'a, TPostings: PostingsWithBlockMax> { + scorer: &'a mut TermScorer, max_score: Score, } -impl<'a> From<&'a mut TermScorer> for TermScorerWithMaxScore<'a> { - fn from(scorer: &'a mut TermScorer) -> Self { +impl<'a, TPostings: PostingsWithBlockMax> From<&'a mut TermScorer> + for TermScorerWithMaxScore<'a, TPostings> +{ + fn from(scorer: &'a mut TermScorer) -> Self { let max_score = scorer.max_score(); TermScorerWithMaxScore { scorer, max_score } } } -impl Deref for TermScorerWithMaxScore<'_> { - type Target = TermScorer; +impl Deref for TermScorerWithMaxScore<'_, TPostings> { + type Target = TermScorer; fn deref(&self) -> &Self::Target { self.scorer } } -impl DerefMut for TermScorerWithMaxScore<'_> { +impl DerefMut for TermScorerWithMaxScore<'_, TPostings> { fn deref_mut(&mut self) -> &mut Self::Target { self.scorer } diff --git a/src/query/boolean_query/boolean_weight.rs b/src/query/boolean_query/boolean_weight.rs index 062449b8a..fdb4be201 100644 --- a/src/query/boolean_query/boolean_weight.rs +++ b/src/query/boolean_query/boolean_weight.rs @@ -2,21 +2,21 @@ use std::collections::HashMap; use crate::docset::COLLECT_BLOCK_BUFFER_LEN; use crate::index::SegmentReader; -use crate::postings::FreqReadingOption; use crate::query::disjunction::Disjunction; use crate::query::explanation::does_not_match; use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner}; use 
crate::query::term_query::TermScorer; -use crate::query::weight::{for_each_docset_buffered, for_each_pruning_scorer, for_each_scorer}; +use crate::query::weight::for_each_docset_buffered; use crate::query::{ - intersect_scorers, AllScorer, BufferedUnionScorer, EmptyScorer, Exclude, Explanation, Occur, - RequiredOptionalScorer, Scorer, Weight, + box_scorer, intersect_scorers, AllScorer, BufferedUnionScorer, EmptyScorer, Exclude, + Explanation, Occur, RequiredOptionalScorer, Scorer, SumCombiner, Weight, }; use crate::{DocId, Score}; -enum SpecializedScorer { - TermUnion(Vec), - Other(Box), +#[derive(Copy, Clone)] +enum SumOrDoNothingCombiner { + Sum, + DoNothing, } fn scorer_disjunction( @@ -32,7 +32,7 @@ where if scorers.len() == 1 { return scorers.into_iter().next().unwrap(); // Safe unwrap. } - Box::new(Disjunction::new( + box_scorer(Disjunction::new( scorers, score_combiner, minimum_match_required, @@ -44,57 +44,60 @@ fn scorer_union( scorers: Vec>, score_combiner_fn: impl Fn() -> TScoreCombiner, num_docs: u32, -) -> SpecializedScorer +) -> Box where TScoreCombiner: ScoreCombiner, { - assert!(!scorers.is_empty()); - if scorers.len() == 1 { - return SpecializedScorer::Other(scorers.into_iter().next().unwrap()); //< we checked the size beforehand - } - - { - let is_all_term_queries = scorers.iter().all(|scorer| scorer.is::()); - if is_all_term_queries { - let scorers: Vec = scorers - .into_iter() - .map(|scorer| *(scorer.downcast::().map_err(|_| ()).unwrap())) - .collect(); - if scorers - .iter() - .all(|scorer| scorer.freq_reading_option() == FreqReadingOption::ReadFreq) + match scorers.len() { + 0 => box_scorer(EmptyScorer), + 1 => scorers.into_iter().next().unwrap(), + _ => { + let combiner_opt: Option = if std::any::TypeId::of::< + TScoreCombiner, + >() == std::any::TypeId::of::< + SumCombiner, + >() { + Some(SumOrDoNothingCombiner::Sum) + } else if std::any::TypeId::of::() + == std::any::TypeId::of::() { - // Block wand is only available if we read 
frequencies. - return SpecializedScorer::TermUnion(scorers); + Some(SumOrDoNothingCombiner::DoNothing) } else { - return SpecializedScorer::Other(Box::new(BufferedUnionScorer::build( - scorers, - score_combiner_fn, - num_docs, - ))); + None + }; + if let Some(combiner) = combiner_opt { + if scorers.iter().all(|scorer| scorer.is::()) { + let scorers: Vec = scorers + .into_iter() + .map(|scorer| { + *scorer.downcast::().ok().expect( + "downcast failed despite the fact we already checked the type", + ) + }) + .collect(); + return match combiner { + SumOrDoNothingCombiner::Sum => box_scorer(BufferedUnionScorer::build( + scorers, + SumCombiner::default, + num_docs, + )), + SumOrDoNothingCombiner::DoNothing => { + box_scorer(BufferedUnionScorer::build( + scorers, + DoNothingCombiner::default, + num_docs, + )) + } + }; + } } + box_scorer(BufferedUnionScorer::build( + scorers, + score_combiner_fn, + num_docs, + )) } } - SpecializedScorer::Other(Box::new(BufferedUnionScorer::build( - scorers, - score_combiner_fn, - num_docs, - ))) -} - -fn into_box_scorer( - scorer: SpecializedScorer, - score_combiner_fn: impl Fn() -> TScoreCombiner, - num_docs: u32, -) -> Box { - match scorer { - SpecializedScorer::TermUnion(term_scorers) => { - let union_scorer = - BufferedUnionScorer::build(term_scorers, score_combiner_fn, num_docs); - Box::new(union_scorer) - } - SpecializedScorer::Other(scorer) => scorer, - } } /// Returns the effective MUST scorer, accounting for removed AllScorers. @@ -110,7 +113,7 @@ fn effective_must_scorer( if must_scorers.is_empty() { if removed_all_scorer_count > 0 { // Had AllScorer(s) only - all docs match - Some(Box::new(AllScorer::new(max_doc))) + Some(box_scorer(AllScorer::new(max_doc))) } else { // No MUST constraint at all None @@ -128,28 +131,26 @@ fn effective_must_scorer( /// When `scoring_enabled` is false, we can just return AllScorer alone since /// we don't need score contributions from the should_scorer. 
fn effective_should_scorer_for_union( - should_scorer: SpecializedScorer, + should_scorer: Box, removed_all_scorer_count: usize, max_doc: DocId, num_docs: u32, score_combiner_fn: impl Fn() -> TScoreCombiner, scoring_enabled: bool, -) -> SpecializedScorer { +) -> Box { if removed_all_scorer_count > 0 { if scoring_enabled { // Need to union to get score contributions from both - let all_scorers: Vec> = vec![ - into_box_scorer(should_scorer, &score_combiner_fn, num_docs), - Box::new(AllScorer::new(max_doc)), - ]; - SpecializedScorer::Other(Box::new(BufferedUnionScorer::build( + let all_scorers: Vec> = + vec![should_scorer, box_scorer(AllScorer::new(max_doc))]; + box_scorer(BufferedUnionScorer::build( all_scorers, score_combiner_fn, num_docs, - ))) + )) } else { // Scoring disabled - AllScorer alone is sufficient - SpecializedScorer::Other(Box::new(AllScorer::new(max_doc))) + box_scorer(AllScorer::new(max_doc)) } } else { should_scorer @@ -160,9 +161,9 @@ enum ShouldScorersCombinationMethod { // Should scorers are irrelevant. Ignored, // Only contributes to final score. - Optional(SpecializedScorer), + Optional(Box), // Regardless of score, the should scorers may impact whether a document is matching or not. - Required(SpecializedScorer), + Required(Box), } /// Weight associated to the `BoolQuery`. 
@@ -205,7 +206,7 @@ impl BooleanWeight { fn per_occur_scorers( &self, - reader: &SegmentReader, + reader: &dyn SegmentReader, boost: Score, ) -> crate::Result>>> { let mut per_occur_scorers: HashMap>> = HashMap::new(); @@ -221,10 +222,10 @@ impl BooleanWeight { fn complex_scorer( &self, - reader: &SegmentReader, + reader: &dyn SegmentReader, boost: Score, score_combiner_fn: impl Fn() -> TComplexScoreCombiner, - ) -> crate::Result { + ) -> crate::Result> { let num_docs = reader.num_docs(); let mut per_occur_scorers = self.per_occur_scorers(reader, boost)?; @@ -234,7 +235,7 @@ impl BooleanWeight { let must_special_scorer_counts = remove_and_count_all_and_empty_scorers(&mut must_scorers); if must_special_scorer_counts.num_empty_scorers > 0 { - return Ok(SpecializedScorer::Other(Box::new(EmptyScorer))); + return Ok(box_scorer(EmptyScorer)); } let mut should_scorers = per_occur_scorers.remove(&Occur::Should).unwrap_or_default(); @@ -249,7 +250,7 @@ impl BooleanWeight { if exclude_special_scorer_counts.num_all_scorers > 0 { // We exclude all documents at one point. - return Ok(SpecializedScorer::Other(Box::new(EmptyScorer))); + return Ok(box_scorer(EmptyScorer)); } let effective_minimum_number_should_match = self @@ -261,7 +262,7 @@ impl BooleanWeight { if effective_minimum_number_should_match > num_of_should_scorers { // We don't have enough scorers to satisfy the minimum number of should matches. // The request will match no documents. 
- return Ok(SpecializedScorer::Other(Box::new(EmptyScorer))); + return Ok(box_scorer(EmptyScorer)); } match effective_minimum_number_should_match { 0 if num_of_should_scorers == 0 => ShouldScorersCombinationMethod::Ignored, @@ -281,12 +282,10 @@ impl BooleanWeight { must_scorers.append(&mut should_scorers); ShouldScorersCombinationMethod::Ignored } - _ => ShouldScorersCombinationMethod::Required(SpecializedScorer::Other( - scorer_disjunction( - should_scorers, - score_combiner_fn(), - effective_minimum_number_should_match, - ), + _ => ShouldScorersCombinationMethod::Required(scorer_disjunction( + should_scorers, + score_combiner_fn(), + effective_minimum_number_should_match, )), } }; @@ -303,8 +302,8 @@ impl BooleanWeight { reader.max_doc(), num_docs, ) - .unwrap_or_else(|| Box::new(EmptyScorer)); - SpecializedScorer::Other(boxed_scorer) + .unwrap_or_else(|| box_scorer(EmptyScorer)); + boxed_scorer } (ShouldScorersCombinationMethod::Optional(should_scorer), must_scorers) => { // Optional SHOULD: contributes to scoring but not required for matching. @@ -329,16 +328,12 @@ impl BooleanWeight { Some(must_scorer) => { // Has MUST constraint: SHOULD only affects scoring. if self.scoring_enabled { - SpecializedScorer::Other(Box::new(RequiredOptionalScorer::< - _, - _, - TScoreCombiner, - >::new( + box_scorer(RequiredOptionalScorer::<_, _, TScoreCombiner>::new( must_scorer, - into_box_scorer(should_scorer, &score_combiner_fn, num_docs), - ))) + should_scorer, + )) } else { - SpecializedScorer::Other(must_scorer) + must_scorer } } } @@ -358,12 +353,7 @@ impl BooleanWeight { } Some(must_scorer) => { // Has MUST constraint: intersect MUST with SHOULD. 
- let should_boxed = - into_box_scorer(should_scorer, &score_combiner_fn, num_docs); - SpecializedScorer::Other(intersect_scorers( - vec![must_scorer, should_boxed], - num_docs, - )) + intersect_scorers(vec![must_scorer, should_scorer], num_docs) } } } @@ -372,19 +362,18 @@ impl BooleanWeight { return Ok(include_scorer); } - let include_scorer_boxed = into_box_scorer(include_scorer, &score_combiner_fn, num_docs); let scorer: Box = if exclude_scorers.len() == 1 { let exclude_scorer = exclude_scorers.pop().unwrap(); match exclude_scorer.downcast::() { // Cast to TermScorer succeeded - Ok(exclude_scorer) => Box::new(Exclude::new(include_scorer_boxed, *exclude_scorer)), + Ok(exclude_scorer) => Box::new(Exclude::new(include_scorer, *exclude_scorer)), // We get back the original Box - Err(exclude_scorer) => Box::new(Exclude::new(include_scorer_boxed, exclude_scorer)), + Err(exclude_scorer) => Box::new(Exclude::new(include_scorer, exclude_scorer)), } } else { - Box::new(Exclude::new(include_scorer_boxed, exclude_scorers)) + Box::new(Exclude::new(include_scorer, exclude_scorers)) }; - Ok(SpecializedScorer::Other(scorer)) + Ok(scorer) } } @@ -413,8 +402,7 @@ fn remove_and_count_all_and_empty_scorers( } impl Weight for BooleanWeight { - fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { - let num_docs = reader.num_docs(); + fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result> { if self.weights.is_empty() { Ok(Box::new(EmptyScorer)) } else if self.weights.len() == 1 { @@ -426,18 +414,12 @@ impl Weight for BooleanWeight crate::Result { + fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result { let mut scorer = self.scorer(reader, 1.0)?; if scorer.seek(doc) != doc { return Err(does_not_match(doc)); @@ -459,47 +441,22 @@ impl Weight for BooleanWeight crate::Result<()> { - let scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn)?; - match scorer { - SpecializedScorer::TermUnion(term_scorers) => { - 
let mut union_scorer = BufferedUnionScorer::build( - term_scorers, - &self.score_combiner_fn, - reader.num_docs(), - ); - for_each_scorer(&mut union_scorer, callback); - } - SpecializedScorer::Other(mut scorer) => { - for_each_scorer(scorer.as_mut(), callback); - } - } + let mut scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn)?; + scorer.for_each(callback); Ok(()) } fn for_each_no_score( &self, - reader: &SegmentReader, + reader: &dyn SegmentReader, callback: &mut dyn FnMut(&[DocId]), ) -> crate::Result<()> { - let scorer = self.complex_scorer(reader, 1.0, || DoNothingCombiner)?; + let mut scorer = self.complex_scorer(reader, 1.0, || DoNothingCombiner)?; let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN]; - - match scorer { - SpecializedScorer::TermUnion(term_scorers) => { - let mut union_scorer = BufferedUnionScorer::build( - term_scorers, - &self.score_combiner_fn, - reader.num_docs(), - ); - for_each_docset_buffered(&mut union_scorer, &mut buffer, callback); - } - SpecializedScorer::Other(mut scorer) => { - for_each_docset_buffered(scorer.as_mut(), &mut buffer, callback); - } - } + for_each_docset_buffered(scorer.as_mut(), &mut buffer, callback); Ok(()) } @@ -516,18 +473,11 @@ impl Weight for BooleanWeight Score, ) -> crate::Result<()> { let scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn)?; - match scorer { - SpecializedScorer::TermUnion(term_scorers) => { - super::block_wand(term_scorers, threshold, callback); - } - SpecializedScorer::Other(mut scorer) => { - for_each_pruning_scorer(scorer.as_mut(), threshold, callback); - } - } + reader.for_each_pruning(threshold, scorer, callback); Ok(()) } } diff --git a/src/query/boolean_query/mod.rs b/src/query/boolean_query/mod.rs index 681881c11..f8d9297bd 100644 --- a/src/query/boolean_query/mod.rs +++ b/src/query/boolean_query/mod.rs @@ -1,8 +1,7 @@ -mod block_wand; +pub(crate) mod block_wand; mod boolean_query; mod boolean_weight; -pub(crate) use self::block_wand::{block_wand, 
block_wand_single_scorer}; pub use self::boolean_query::BooleanQuery; pub use self::boolean_weight::BooleanWeight; @@ -16,8 +15,8 @@ mod tests { use crate::collector::{Count, TopDocs}; use crate::query::term_query::TermScorer; use crate::query::{ - AllScorer, EmptyScorer, EnableScoring, Intersection, Occur, Query, QueryParser, RangeQuery, - RequiredOptionalScorer, Scorer, SumCombiner, TermQuery, + AllScorer, BufferedUnionScorer, EmptyScorer, EnableScoring, Intersection, Occur, Query, + QueryParser, RangeQuery, RequiredOptionalScorer, Scorer, SumCombiner, TermQuery, }; use crate::schema::*; use crate::{assert_nearly_equals, DocAddress, DocId, Index, IndexWriter, Score}; @@ -62,6 +61,19 @@ mod tests { Ok(()) } + #[test] + pub fn test_boolean_termonly_union_specialization() -> crate::Result<()> { + let (index, text_field) = aux_test_helper()?; + let query_parser = QueryParser::for_index(&index, vec![text_field]); + let query = query_parser.parse_query("a b")?; + let searcher = index.reader()?.searcher(); + let weight = query.weight(EnableScoring::enabled_from_searcher(&searcher))?; + let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?; + assert!(scorer.is::>()); + assert_eq!(query.count(&searcher)?, 4); + Ok(()) + } + #[test] pub fn test_boolean_termonly_intersection() -> crate::Result<()> { let (index, text_field) = aux_test_helper()?; diff --git a/src/query/boost_query.rs b/src/query/boost_query.rs index 69847d750..40d8d7bd4 100644 --- a/src/query/boost_query.rs +++ b/src/query/boost_query.rs @@ -67,11 +67,11 @@ impl BoostWeight { } impl Weight for BoostWeight { - fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { + fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result> { self.weight.scorer(reader, boost * self.boost) } - fn explain(&self, reader: &SegmentReader, doc: u32) -> crate::Result { + fn explain(&self, reader: &dyn SegmentReader, doc: u32) -> crate::Result { let underlying_explanation = 
self.weight.explain(reader, doc)?; let score = underlying_explanation.value() * self.boost; let mut explanation = @@ -80,7 +80,7 @@ impl Weight for BoostWeight { Ok(explanation) } - fn count(&self, reader: &SegmentReader) -> crate::Result { + fn count(&self, reader: &dyn SegmentReader) -> crate::Result { self.weight.count(reader) } } diff --git a/src/query/const_score_query.rs b/src/query/const_score_query.rs index d07e6a96f..6ecba4a45 100644 --- a/src/query/const_score_query.rs +++ b/src/query/const_score_query.rs @@ -1,7 +1,7 @@ use std::fmt; use crate::docset::COLLECT_BLOCK_BUFFER_LEN; -use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight}; +use crate::query::{box_scorer, EnableScoring, Explanation, Query, Scorer, Weight}; use crate::{DocId, DocSet, Score, SegmentReader, TantivyError, Term}; /// `ConstScoreQuery` is a wrapper over a query to provide a constant score. @@ -63,12 +63,15 @@ impl ConstWeight { } impl Weight for ConstWeight { - fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { + fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result> { let inner_scorer = self.weight.scorer(reader, boost)?; - Ok(Box::new(ConstScorer::new(inner_scorer, boost * self.score))) + Ok(box_scorer(ConstScorer::new( + inner_scorer, + boost * self.score, + ))) } - fn explain(&self, reader: &SegmentReader, doc: u32) -> crate::Result { + fn explain(&self, reader: &dyn SegmentReader, doc: u32) -> crate::Result { let mut scorer = self.scorer(reader, 1.0)?; if scorer.seek(doc) != doc { return Err(TantivyError::InvalidArgument(format!( @@ -81,7 +84,7 @@ impl Weight for ConstWeight { Ok(explanation) } - fn count(&self, reader: &SegmentReader) -> crate::Result { + fn count(&self, reader: &dyn SegmentReader) -> crate::Result { self.weight.count(reader) } } diff --git a/src/query/empty_query.rs b/src/query/empty_query.rs index 2fa1772bd..7728aa411 100644 --- a/src/query/empty_query.rs +++ b/src/query/empty_query.rs @@ -2,7 +2,7 
@@ use super::Scorer; use crate::docset::TERMINATED; use crate::index::SegmentReader; use crate::query::explanation::does_not_match; -use crate::query::{EnableScoring, Explanation, Query, Weight}; +use crate::query::{box_scorer, EnableScoring, Explanation, Query, Weight}; use crate::{DocId, DocSet, Score, Searcher}; /// `EmptyQuery` is a dummy `Query` in which no document matches. @@ -26,11 +26,11 @@ impl Query for EmptyQuery { /// It is useful for tests and handling edge cases. pub struct EmptyWeight; impl Weight for EmptyWeight { - fn scorer(&self, _reader: &SegmentReader, _boost: Score) -> crate::Result> { - Ok(Box::new(EmptyScorer)) + fn scorer(&self, _reader: &dyn SegmentReader, _boost: Score) -> crate::Result> { + Ok(box_scorer(EmptyScorer)) } - fn explain(&self, _reader: &SegmentReader, doc: DocId) -> crate::Result { + fn explain(&self, _reader: &dyn SegmentReader, doc: DocId) -> crate::Result { Err(does_not_match(doc)) } } diff --git a/src/query/exist_query.rs b/src/query/exist_query.rs index 7eb09722c..3b369df46 100644 --- a/src/query/exist_query.rs +++ b/src/query/exist_query.rs @@ -3,7 +3,7 @@ use core::fmt::Debug; use columnar::{ColumnIndex, DynamicColumn}; use common::BitSet; -use super::{ConstScorer, EmptyScorer}; +use super::{box_scorer, ConstScorer, EmptyScorer}; use crate::docset::{DocSet, TERMINATED}; use crate::index::SegmentReader; use crate::query::all_query::AllScorer; @@ -98,7 +98,7 @@ pub struct ExistsWeight { } impl Weight for ExistsWeight { - fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { + fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result> { let fast_field_reader = reader.fast_fields(); let mut column_handles = fast_field_reader.dynamic_column_handles(&self.field_name)?; if self.field_type == Type::Json && self.json_subpaths { @@ -117,7 +117,7 @@ impl Weight for ExistsWeight { } } if non_empty_columns.is_empty() { - return Ok(Box::new(EmptyScorer)); + return 
Ok(box_scorer(EmptyScorer)); } // If any column is full, all docs match. @@ -128,9 +128,9 @@ impl Weight for ExistsWeight { { let all_scorer = AllScorer::new(max_doc); if boost != 1.0f32 { - return Ok(Box::new(BoostScorer::new(all_scorer, boost))); + return Ok(box_scorer(BoostScorer::new(all_scorer, boost))); } else { - return Ok(Box::new(all_scorer)); + return Ok(box_scorer(all_scorer)); } } @@ -138,7 +138,7 @@ impl Weight for ExistsWeight { // NOTE: A lower number may be better for very sparse columns if non_empty_columns.len() < 4 { let docset = ExistsDocSet::new(non_empty_columns, reader.max_doc()); - return Ok(Box::new(ConstScorer::new(docset, boost))); + return Ok(box_scorer(ConstScorer::new(docset, boost))); } // If we have many dynamic columns, precompute a bitset of matching docs @@ -162,10 +162,10 @@ impl Weight for ExistsWeight { } } let docset = BitSetDocSet::from(doc_bitset); - Ok(Box::new(ConstScorer::new(docset, boost))) + Ok(box_scorer(ConstScorer::new(docset, boost))) } - fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result { + fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result { let mut scorer = self.scorer(reader, 1.0)?; if scorer.seek(doc) != doc { return Err(does_not_match(doc)); diff --git a/src/query/intersection.rs b/src/query/intersection.rs index 64fcf78dd..46547918a 100644 --- a/src/query/intersection.rs +++ b/src/query/intersection.rs @@ -1,7 +1,7 @@ use super::size_hint::estimate_intersection; use crate::docset::{DocSet, SeekDangerResult, TERMINATED}; use crate::query::term_query::TermScorer; -use crate::query::{EmptyScorer, Scorer}; +use crate::query::{box_scorer, EmptyScorer, Scorer}; use crate::{DocId, Score}; /// Returns the intersection scorer. 
@@ -20,7 +20,7 @@ pub fn intersect_scorers( num_docs_segment: u32, ) -> Box { if scorers.is_empty() { - return Box::new(EmptyScorer); + return box_scorer(EmptyScorer); } if scorers.len() == 1 { return scorers.pop().unwrap(); @@ -29,7 +29,7 @@ pub fn intersect_scorers( scorers.sort_by_key(|scorer| scorer.cost()); let doc = go_to_first_doc(&mut scorers[..]); if doc == TERMINATED { - return Box::new(EmptyScorer); + return box_scorer(EmptyScorer); } // We know that we have at least 2 elements. let left = scorers.remove(0); @@ -38,14 +38,14 @@ pub fn intersect_scorers( .iter() .all(|&scorer| scorer.is::()); if all_term_scorers { - return Box::new(Intersection { + return box_scorer(Intersection { left: *(left.downcast::().map_err(|_| ()).unwrap()), right: *(right.downcast::().map_err(|_| ()).unwrap()), others: scorers, num_docs: num_docs_segment, }); } - Box::new(Intersection { + box_scorer(Intersection { left, right, others: scorers, diff --git a/src/query/mod.rs b/src/query/mod.rs index e33768950..a0eb82b75 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -2,7 +2,7 @@ mod all_query; mod automaton_weight; mod bitset; mod bm25; -mod boolean_query; +pub(crate) mod boolean_query; mod boost_query; mod const_score_query; mod disjunction; @@ -24,7 +24,7 @@ mod reqopt_scorer; mod scorer; mod set_query; mod size_hint; -mod term_query; +pub(crate) mod term_query; mod union; mod weight; @@ -53,17 +53,17 @@ pub use self::intersection::{intersect_scorers, Intersection}; pub use self::more_like_this::{MoreLikeThisQuery, MoreLikeThisQueryBuilder}; pub use self::phrase_prefix_query::PhrasePrefixQuery; pub use self::phrase_query::regex_phrase_query::{wildcard_query_to_regex_str, RegexPhraseQuery}; -pub use self::phrase_query::PhraseQuery; +pub use self::phrase_query::{PhraseQuery, PhraseScorer}; pub use self::query::{EnableScoring, Query, QueryClone}; pub use self::query_parser::{QueryParser, QueryParserError}; pub use self::range_query::*; pub use self::regex_query::RegexQuery; 
pub use self::reqopt_scorer::RequiredOptionalScorer; pub use self::score_combiner::{DisjunctionMaxCombiner, ScoreCombiner, SumCombiner}; -pub use self::scorer::Scorer; +pub use self::scorer::{box_scorer, Scorer}; pub use self::set_query::TermSetQuery; -pub use self::term_query::TermQuery; -pub use self::union::BufferedUnionScorer; +pub use self::term_query::{BoxedTermScorer, TermQuery, TermScorer}; +pub use self::union::{BufferedUnionScorer, SimpleUnion}; #[cfg(test)] pub use self::vec_docset::VecDocSet; pub use self::weight::Weight; diff --git a/src/query/more_like_this/more_like_this.rs b/src/query/more_like_this/more_like_this.rs index 550abe5a3..680e0410c 100644 --- a/src/query/more_like_this/more_like_this.rs +++ b/src/query/more_like_this/more_like_this.rs @@ -8,7 +8,7 @@ use crate::query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery}; use crate::schema::document::{Document, Value}; use crate::schema::{Field, FieldType, IndexRecordOption, Term}; use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer}; -use crate::{DocAddress, Result, Searcher, TantivyDocument, TantivyError}; +use crate::{DocAddress, Result, Searcher, TantivyError}; #[derive(Debug, PartialEq)] struct ScoreTerm { @@ -129,7 +129,7 @@ impl MoreLikeThis { searcher: &Searcher, doc_address: DocAddress, ) -> Result> { - let doc = searcher.doc::(doc_address)?; + let doc = searcher.doc(doc_address)?; let field_to_values = doc.get_sorted_field_values(); self.retrieve_terms_from_doc_fields(searcher, &field_to_values) @@ -167,7 +167,7 @@ impl MoreLikeThis { term_frequencies: &mut HashMap, ) -> Result<()> { let schema = searcher.schema(); - let tokenizer_manager = searcher.index().tokenizers(); + let tokenizer_manager = searcher.tokenizers(); let field_entry = schema.get_field_entry(field); if !field_entry.is_indexed() { diff --git a/src/query/phrase_prefix_query/phrase_prefix_scorer.rs b/src/query/phrase_prefix_query/phrase_prefix_scorer.rs index f2df3433d..6499b124c 
100644 --- a/src/query/phrase_prefix_query/phrase_prefix_scorer.rs +++ b/src/query/phrase_prefix_query/phrase_prefix_scorer.rs @@ -2,7 +2,7 @@ use crate::docset::{DocSet, SeekDangerResult, TERMINATED}; use crate::fieldnorm::FieldNormReader; use crate::postings::Postings; use crate::query::bm25::Bm25Weight; -use crate::query::phrase_query::{intersection_count, PhraseScorer}; +use crate::query::phrase_query::{intersection_exists, PhraseScorer}; use crate::query::Scorer; use crate::{DocId, Score}; @@ -100,7 +100,6 @@ pub struct PhrasePrefixScorer { phrase_scorer: PhraseKind, suffixes: Vec, suffix_offset: u32, - phrase_count: u32, suffix_position_buffer: Vec, } @@ -144,7 +143,6 @@ impl PhrasePrefixScorer { phrase_scorer, suffixes, suffix_offset: (max_offset - suffix_pos) as u32, - phrase_count: 0, suffix_position_buffer: Vec::with_capacity(100), }; if phrase_prefix_scorer.doc() != TERMINATED && !phrase_prefix_scorer.matches_prefix() { @@ -153,12 +151,7 @@ impl PhrasePrefixScorer { phrase_prefix_scorer } - pub fn phrase_count(&self) -> u32 { - self.phrase_count - } - fn matches_prefix(&mut self) -> bool { - let mut count = 0; let current_doc = self.doc(); let pos_matching = self.phrase_scorer.get_intersection(); for suffix in &mut self.suffixes { @@ -168,11 +161,12 @@ impl PhrasePrefixScorer { let doc = suffix.seek(current_doc); if doc == current_doc { suffix.positions_with_offset(self.suffix_offset, &mut self.suffix_position_buffer); - count += intersection_count(pos_matching, &self.suffix_position_buffer); + if intersection_exists(pos_matching, &self.suffix_position_buffer) { + return true; + } } } - self.phrase_count = count as u32; - count != 0 + false } } diff --git a/src/query/phrase_prefix_query/phrase_prefix_weight.rs b/src/query/phrase_prefix_query/phrase_prefix_weight.rs index 546eb89e8..2128898d1 100644 --- a/src/query/phrase_prefix_query/phrase_prefix_weight.rs +++ b/src/query/phrase_prefix_query/phrase_prefix_weight.rs @@ -1,12 +1,11 @@ use 
super::{prefix_end, PhrasePrefixScorer}; use crate::fieldnorm::FieldNormReader; use crate::index::SegmentReader; -use crate::postings::SegmentPostings; +use crate::postings::Postings; use crate::query::bm25::Bm25Weight; -use crate::query::explanation::does_not_match; -use crate::query::{EmptyScorer, Explanation, Scorer, Weight}; +use crate::query::{box_scorer, EmptyScorer, Scorer, Weight}; use crate::schema::{IndexRecordOption, Term}; -use crate::{DocId, DocSet, Score}; +use crate::Score; pub struct PhrasePrefixWeight { phrase_terms: Vec<(usize, Term)>, @@ -32,10 +31,10 @@ impl PhrasePrefixWeight { } } - fn fieldnorm_reader(&self, reader: &SegmentReader) -> crate::Result { + fn fieldnorm_reader(&self, reader: &dyn SegmentReader) -> crate::Result { let field = self.phrase_terms[0].1.field(); if self.similarity_weight_opt.is_some() { - if let Some(fieldnorm_reader) = reader.fieldnorms_readers().get_field(field)? { + if let Ok(fieldnorm_reader) = reader.get_fieldnorms_reader(field) { return Ok(fieldnorm_reader); } } @@ -44,15 +43,15 @@ impl PhrasePrefixWeight { pub(crate) fn phrase_scorer( &self, - reader: &SegmentReader, + reader: &dyn SegmentReader, boost: Score, - ) -> crate::Result>> { + ) -> crate::Result>> { let similarity_weight_opt = self .similarity_weight_opt .as_ref() .map(|similarity_weight| similarity_weight.boost_by(boost)); let fieldnorm_reader = self.fieldnorm_reader(reader)?; - let mut term_postings_list = Vec::new(); + let mut term_postings_list: Vec<(usize, Box)> = Vec::new(); for &(offset, ref term) in &self.phrase_terms { if let Some(postings) = reader .inverted_index(term.field())? 
@@ -103,49 +102,32 @@ impl PhrasePrefixWeight { } } - Ok(Some(PhrasePrefixScorer::new( + Ok(Some(box_scorer(PhrasePrefixScorer::new( term_postings_list, similarity_weight_opt, fieldnorm_reader, suffixes, self.prefix.0, - ))) + )))) } } impl Weight for PhrasePrefixWeight { - fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { + fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result> { if let Some(scorer) = self.phrase_scorer(reader, boost)? { - Ok(Box::new(scorer)) + Ok(scorer) } else { - Ok(Box::new(EmptyScorer)) + Ok(box_scorer(EmptyScorer)) } } - - fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result { - let scorer_opt = self.phrase_scorer(reader, 1.0)?; - if scorer_opt.is_none() { - return Err(does_not_match(doc)); - } - let mut scorer = scorer_opt.unwrap(); - if scorer.seek(doc) != doc { - return Err(does_not_match(doc)); - } - let fieldnorm_reader = self.fieldnorm_reader(reader)?; - let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc); - let phrase_count = scorer.phrase_count(); - let mut explanation = Explanation::new("Phrase Prefix Scorer", scorer.score()); - if let Some(similarity_weight) = self.similarity_weight_opt.as_ref() { - explanation.add_detail(similarity_weight.explain(fieldnorm_id, phrase_count)); - } - Ok(explanation) - } } #[cfg(test)] mod tests { use crate::docset::TERMINATED; use crate::index::Index; + use crate::postings::Postings; + use crate::query::phrase_prefix_query::PhrasePrefixScorer; use crate::query::{EnableScoring, PhrasePrefixQuery, Query}; use crate::schema::{Schema, TEXT}; use crate::{DocSet, IndexWriter, Term}; @@ -186,14 +168,14 @@ mod tests { .phrase_prefix_query_weight(enable_scoring) .unwrap() .unwrap(); - let mut phrase_scorer = phrase_weight + let mut phrase_scorer_boxed = phrase_weight .phrase_scorer(searcher.segment_reader(0u32), 1.0)? 
.unwrap(); + let phrase_scorer: &mut PhrasePrefixScorer> = + phrase_scorer_boxed.as_any_mut().downcast_mut().unwrap(); assert_eq!(phrase_scorer.doc(), 1); - assert_eq!(phrase_scorer.phrase_count(), 2); assert_eq!(phrase_scorer.advance(), 2); assert_eq!(phrase_scorer.doc(), 2); - assert_eq!(phrase_scorer.phrase_count(), 1); assert_eq!(phrase_scorer.advance(), TERMINATED); Ok(()) } @@ -213,14 +195,15 @@ mod tests { .phrase_prefix_query_weight(enable_scoring) .unwrap() .unwrap(); - let mut phrase_scorer = phrase_weight + let mut phrase_scorer_boxed = phrase_weight .phrase_scorer(searcher.segment_reader(0u32), 1.0)? .unwrap(); + let phrase_scorer = phrase_scorer_boxed + .downcast_mut::>>() + .unwrap(); assert_eq!(phrase_scorer.doc(), 1); - assert_eq!(phrase_scorer.phrase_count(), 2); assert_eq!(phrase_scorer.advance(), 2); assert_eq!(phrase_scorer.doc(), 2); - assert_eq!(phrase_scorer.phrase_count(), 1); assert_eq!(phrase_scorer.advance(), TERMINATED); Ok(()) } diff --git a/src/query/phrase_query/mod.rs b/src/query/phrase_query/mod.rs index 938e34442..fe53e8887 100644 --- a/src/query/phrase_query/mod.rs +++ b/src/query/phrase_query/mod.rs @@ -5,7 +5,7 @@ pub mod regex_phrase_query; mod regex_phrase_weight; pub use self::phrase_query::PhraseQuery; -pub(crate) use self::phrase_scorer::intersection_count; +pub(crate) use self::phrase_scorer::intersection_exists; pub use self::phrase_scorer::PhraseScorer; pub use self::phrase_weight::PhraseWeight; diff --git a/src/query/phrase_query/phrase_query.rs b/src/query/phrase_query/phrase_query.rs index 11321173c..adc7ce53d 100644 --- a/src/query/phrase_query/phrase_query.rs +++ b/src/query/phrase_query/phrase_query.rs @@ -126,7 +126,7 @@ impl PhraseQuery { }; let mut weight = PhraseWeight::new(self.phrase_terms.clone(), bm25_weight_opt); if self.slop > 0 { - weight.slop(self.slop); + weight.set_slop(self.slop); } Ok(weight) } diff --git a/src/query/phrase_query/phrase_scorer.rs b/src/query/phrase_query/phrase_scorer.rs index 
ff7def917..d1f9f887e 100644 --- a/src/query/phrase_query/phrase_scorer.rs +++ b/src/query/phrase_query/phrase_scorer.rs @@ -2,9 +2,9 @@ use std::cmp::Ordering; use crate::docset::{DocSet, SeekDangerResult, TERMINATED}; use crate::fieldnorm::FieldNormReader; -use crate::postings::Postings; +use crate::postings::{Postings, SegmentPostings as StandardPostings}; use crate::query::bm25::Bm25Weight; -use crate::query::{Intersection, Scorer}; +use crate::query::{Explanation, Intersection, Scorer}; use crate::{DocId, Score}; struct PostingsWithOffset { @@ -43,7 +43,14 @@ impl DocSet for PostingsWithOffset { } } -pub struct PhraseScorer { +/// `PhraseScorer` is a `Scorer` that matches documents that match a phrase query, and scores them +/// based on the number of times the phrase appears in the document and the fieldnorm of the +/// document. +/// +/// It is implemented as an intersection of the postings of each term in the +/// phrase, where the intersection condition is that the positions of the terms are next to each +/// other (or within a certain slop). +pub struct PhraseScorer { intersection_docset: Intersection, PostingsWithOffset>, num_terms: usize, left_positions: Vec, @@ -58,7 +65,7 @@ pub struct PhraseScorer { } /// Returns true if and only if the two sorted arrays contain a common element -fn intersection_exists(left: &[u32], right: &[u32]) -> bool { +pub(crate) fn intersection_exists(left: &[u32], right: &[u32]) -> bool { let mut left_index = 0; let mut right_index = 0; while left_index < left.len() && right_index < right.len() { @@ -79,7 +86,7 @@ fn intersection_exists(left: &[u32], right: &[u32]) -> bool { false } -pub(crate) fn intersection_count(left: &[u32], right: &[u32]) -> usize { +fn intersection_count(left: &[u32], right: &[u32]) -> usize { let mut left_index = 0; let mut right_index = 0; let mut count = 0; @@ -346,6 +353,9 @@ fn intersection_count_with_carrying_slop( impl PhraseScorer { // If similarity_weight is None, then scoring is disabled. 
+ /// Creates a phrase scorer from term postings and phrase matching options. + /// + /// `slop` controls the maximum positional distance allowed between terms. pub fn new( term_postings: Vec<(usize, TPostings)>, similarity_weight_opt: Option, @@ -402,6 +412,7 @@ impl PhraseScorer { scorer } + /// Returns the number of phrases identified in the current matching doc. pub fn phrase_count(&self) -> u32 { self.phrase_count } @@ -584,6 +595,17 @@ impl Scorer for PhraseScorer { 1.0f32 } } + + fn explain(&mut self) -> Explanation { + let doc = self.doc(); + let phrase_count = self.phrase_count(); + let fieldnorm_id = self.fieldnorm_reader.fieldnorm_id(doc); + let mut explanation = Explanation::new("Phrase Scorer", self.score()); + if let Some(similarity_weight) = self.similarity_weight_opt.as_ref() { + explanation.add_detail(similarity_weight.explain(fieldnorm_id, phrase_count)); + } + explanation + } } #[cfg(test)] diff --git a/src/query/phrase_query/phrase_weight.rs b/src/query/phrase_query/phrase_weight.rs index 4118f79f6..cf9326bc9 100644 --- a/src/query/phrase_query/phrase_weight.rs +++ b/src/query/phrase_query/phrase_weight.rs @@ -1,13 +1,43 @@ -use super::PhraseScorer; +use std::io; + use crate::fieldnorm::FieldNormReader; -use crate::index::SegmentReader; -use crate::postings::SegmentPostings; +use crate::index::{ + try_downcast_and_call, InvertedIndexReader, SegmentReader, TypedInvertedIndexReaderCb, +}; +use crate::postings::TermInfo; use crate::query::bm25::Bm25Weight; use crate::query::explanation::does_not_match; -use crate::query::{EmptyScorer, Explanation, Scorer, Weight}; -use crate::schema::{IndexRecordOption, Term}; +use crate::query::{box_scorer, EmptyScorer, Explanation, Scorer, Weight}; +use crate::schema::Term; use crate::{DocId, DocSet, Score}; +struct BuildPhraseScorer<'a> { + term_infos: &'a [(usize, TermInfo)], + similarity_weight_opt: Option, + fieldnorm_reader: FieldNormReader, + slop: u32, +} + +impl TypedInvertedIndexReaderCb>> for 
BuildPhraseScorer<'_> { + fn call(&mut self, reader: &I) -> io::Result> { + let mut offset_and_term_postings = Vec::with_capacity(self.term_infos.len()); + for (offset, term_info) in self.term_infos { + let postings = reader.read_postings_from_terminfo( + term_info, + crate::schema::IndexRecordOption::WithFreqsAndPositions, + )?; + offset_and_term_postings.push((*offset, postings)); + } + let scorer = super::PhraseScorer::new( + offset_and_term_postings, + self.similarity_weight_opt.clone(), + self.fieldnorm_reader.clone(), + self.slop, + ); + Ok(box_scorer(scorer)) + } +} + pub struct PhraseWeight { phrase_terms: Vec<(usize, Term)>, similarity_weight_opt: Option, @@ -21,18 +51,17 @@ impl PhraseWeight { phrase_terms: Vec<(usize, Term)>, similarity_weight_opt: Option, ) -> PhraseWeight { - let slop = 0; PhraseWeight { phrase_terms, similarity_weight_opt, - slop, + slop: 0, } } - fn fieldnorm_reader(&self, reader: &SegmentReader) -> crate::Result { + fn fieldnorm_reader(&self, reader: &dyn SegmentReader) -> crate::Result { let field = self.phrase_terms[0].1.field(); if self.similarity_weight_opt.is_some() { - if let Some(fieldnorm_reader) = reader.fieldnorms_readers().get_field(field)? { + if let Ok(fieldnorm_reader) = reader.get_fieldnorms_reader(field) { return Ok(fieldnorm_reader); } } @@ -41,48 +70,69 @@ impl PhraseWeight { pub(crate) fn phrase_scorer( &self, - reader: &SegmentReader, + reader: &dyn SegmentReader, boost: Score, - ) -> crate::Result>> { + ) -> crate::Result>> { let similarity_weight_opt = self .similarity_weight_opt .as_ref() .map(|similarity_weight| similarity_weight.boost_by(boost)); let fieldnorm_reader = self.fieldnorm_reader(reader)?; - let mut term_postings_list = Vec::new(); - for &(offset, ref term) in &self.phrase_terms { - if let Some(postings) = reader - .inverted_index(term.field())? - .read_postings(term, IndexRecordOption::WithFreqsAndPositions)? 
- { - term_postings_list.push((offset, postings)); - } else { - return Ok(None); - } + + if self.phrase_terms.is_empty() { + return Ok(None); } - Ok(Some(PhraseScorer::new( - term_postings_list, + let field = self.phrase_terms[0].1.field(); + + if !self + .phrase_terms + .iter() + .all(|(_offset, term)| term.field() == field) + { + return Err(crate::TantivyError::InvalidArgument( + "All terms in a phrase query must belong to the same field".to_string(), + )); + } + + let inverted_index_reader = reader.inverted_index(field)?; + + let mut term_infos: Vec<(usize, TermInfo)> = Vec::with_capacity(self.phrase_terms.len()); + + for &(offset, ref term) in &self.phrase_terms { + let Some(term_info) = inverted_index_reader.get_term_info(term)? else { + return Ok(None); + }; + term_infos.push((offset, term_info)); + } + + let mut phrase_scorer_builder = BuildPhraseScorer { + term_infos: &term_infos, similarity_weight_opt, fieldnorm_reader, - self.slop, - ))) + slop: self.slop, + }; + let scorer = + try_downcast_and_call(inverted_index_reader.as_ref(), &mut phrase_scorer_builder)?; + + Ok(Some(scorer)) } - pub fn slop(&mut self, slop: u32) { + /// Sets the slop for the given PhraseWeight. + pub fn set_slop(&mut self, slop: u32) { self.slop = slop; } } impl Weight for PhraseWeight { - fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { + fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result> { if let Some(scorer) = self.phrase_scorer(reader, boost)? 
{ - Ok(Box::new(scorer)) + Ok(scorer) } else { - Ok(Box::new(EmptyScorer)) + Ok(box_scorer(EmptyScorer)) } } - fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result { + fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result { let scorer_opt = self.phrase_scorer(reader, 1.0)?; if scorer_opt.is_none() { return Err(does_not_match(doc)); @@ -91,14 +141,7 @@ impl Weight for PhraseWeight { if scorer.seek(doc) != doc { return Err(does_not_match(doc)); } - let fieldnorm_reader = self.fieldnorm_reader(reader)?; - let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc); - let phrase_count = scorer.phrase_count(); - let mut explanation = Explanation::new("Phrase Scorer", scorer.score()); - if let Some(similarity_weight) = self.similarity_weight_opt.as_ref() { - explanation.add_detail(similarity_weight.explain(fieldnorm_id, phrase_count)); - } - Ok(explanation) + Ok(scorer.explain()) } } @@ -106,7 +149,8 @@ impl Weight for PhraseWeight { mod tests { use super::super::tests::create_index; use crate::docset::TERMINATED; - use crate::query::{EnableScoring, PhraseQuery}; + use crate::query::phrase_query::PhraseScorer; + use crate::query::{EnableScoring, PhraseQuery, Scorer}; use crate::{DocSet, Term}; #[test] @@ -121,9 +165,11 @@ mod tests { ]); let enable_scoring = EnableScoring::enabled_from_searcher(&searcher); let phrase_weight = phrase_query.phrase_weight(enable_scoring).unwrap(); - let mut phrase_scorer = phrase_weight + let phrase_scorer_boxed: Box = phrase_weight .phrase_scorer(searcher.segment_reader(0u32), 1.0)? 
.unwrap(); + let mut phrase_scorer: Box = + phrase_scorer_boxed.downcast::().ok().unwrap(); assert_eq!(phrase_scorer.doc(), 1); assert_eq!(phrase_scorer.phrase_count(), 2); assert_eq!(phrase_scorer.advance(), 2); diff --git a/src/query/phrase_query/regex_phrase_weight.rs b/src/query/phrase_query/regex_phrase_weight.rs index 9cefc555a..9facb22a5 100644 --- a/src/query/phrase_query/regex_phrase_weight.rs +++ b/src/query/phrase_query/regex_phrase_weight.rs @@ -5,14 +5,16 @@ use tantivy_fst::Regex; use super::PhraseScorer; use crate::fieldnorm::FieldNormReader; -use crate::index::SegmentReader; -use crate::postings::{LoadedPostings, Postings, SegmentPostings, TermInfo}; +use crate::index::{InvertedIndexReader, SegmentReader}; +use crate::postings::{LoadedPostings, Postings, TermInfo}; use crate::query::bm25::Bm25Weight; use crate::query::explanation::does_not_match; use crate::query::union::{BitSetPostingUnion, SimpleUnion}; -use crate::query::{AutomatonWeight, BitSetDocSet, EmptyScorer, Explanation, Scorer, Weight}; +use crate::query::{ + box_scorer, AutomatonWeight, BitSetDocSet, EmptyScorer, Explanation, Scorer, Weight, +}; use crate::schema::{Field, IndexRecordOption}; -use crate::{DocId, DocSet, InvertedIndexReader, Score}; +use crate::{DocId, DocSet, DynInvertedIndexReader, Score}; type UnionType = SimpleUnion>; @@ -45,9 +47,9 @@ impl RegexPhraseWeight { } } - fn fieldnorm_reader(&self, reader: &SegmentReader) -> crate::Result { + fn fieldnorm_reader(&self, reader: &dyn SegmentReader) -> crate::Result { if self.similarity_weight_opt.is_some() { - if let Some(fieldnorm_reader) = reader.fieldnorms_readers().get_field(self.field)? 
{ + if let Ok(fieldnorm_reader) = reader.get_fieldnorms_reader(self.field) { return Ok(fieldnorm_reader); } } @@ -56,7 +58,7 @@ impl RegexPhraseWeight { pub(crate) fn phrase_scorer( &self, - reader: &SegmentReader, + reader: &dyn SegmentReader, boost: Score, ) -> crate::Result>> { let similarity_weight_opt = self @@ -84,7 +86,8 @@ impl RegexPhraseWeight { "Phrase query exceeded max expansions {num_terms}" ))); } - let union = Self::get_union_from_term_infos(&term_infos, reader, &inverted_index)?; + let union = + Self::get_union_from_term_infos(&term_infos, reader, inverted_index.as_ref())?; posting_lists.push((offset, union)); } @@ -99,22 +102,11 @@ impl RegexPhraseWeight { /// Add all docs of the term to the docset fn add_to_bitset( - inverted_index: &InvertedIndexReader, + inverted_index: &(impl InvertedIndexReader + ?Sized), term_info: &TermInfo, doc_bitset: &mut BitSet, ) -> crate::Result<()> { - let mut block_segment_postings = inverted_index - .read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic)?; - loop { - let docs = block_segment_postings.docs(); - if docs.is_empty() { - break; - } - for &doc in docs { - doc_bitset.insert(doc); - } - block_segment_postings.advance(); - } + inverted_index.fill_bitset_from_terminfo(term_info, doc_bitset)?; Ok(()) } @@ -174,8 +166,8 @@ impl RegexPhraseWeight { /// Use Roaring Bitmaps for sparse terms. The full bitvec is main memory consumer currently. 
pub(crate) fn get_union_from_term_infos( term_infos: &[TermInfo], - reader: &SegmentReader, - inverted_index: &InvertedIndexReader, + reader: &dyn SegmentReader, + inverted_index: &dyn DynInvertedIndexReader, ) -> crate::Result { let max_doc = reader.max_doc(); @@ -188,16 +180,19 @@ impl RegexPhraseWeight { // - Bucket 1: Terms appearing in 0.1% to 1% of documents // - Bucket 2: Terms appearing in 1% to 10% of documents // - Bucket 3: Terms appearing in more than 10% of documents - let mut buckets: Vec<(BitSet, Vec)> = (0..4) + let mut buckets: Vec<(BitSet, Vec>)> = (0..4) .map(|_| (BitSet::with_max_value(max_doc), Vec::new())) .collect(); const SPARSE_TERM_DOC_THRESHOLD: u32 = 100; for term_info in term_infos { - let mut term_posting = inverted_index - .read_postings_from_terminfo(term_info, IndexRecordOption::WithFreqsAndPositions)?; - let num_docs = term_posting.doc_freq(); + let mut term_posting = crate::index::load_postings_from_terminfo( + inverted_index, + term_info, + IndexRecordOption::WithFreqsAndPositions, + )?; + let num_docs = u32::from(term_posting.doc_freq()); if num_docs < SPARSE_TERM_DOC_THRESHOLD { let current_bucket = &mut sparse_buckets[0]; @@ -269,15 +264,15 @@ impl RegexPhraseWeight { } impl Weight for RegexPhraseWeight { - fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { + fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result> { if let Some(scorer) = self.phrase_scorer(reader, boost)? 
{ - Ok(Box::new(scorer)) + Ok(box_scorer(scorer)) } else { - Ok(Box::new(EmptyScorer)) + Ok(box_scorer(EmptyScorer)) } } - fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result { + fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result { let scorer_opt = self.phrase_scorer(reader, 1.0)?; if scorer_opt.is_none() { return Err(does_not_match(doc)); diff --git a/src/query/query.rs b/src/query/query.rs index 32f74536f..476887d24 100644 --- a/src/query/query.rs +++ b/src/query/query.rs @@ -146,7 +146,7 @@ pub trait Query: QueryClone + Send + Sync + downcast_rs::Downcast + fmt::Debug { let weight = self.weight(EnableScoring::disabled_from_searcher(searcher))?; let mut result = 0; for reader in searcher.segment_readers() { - result += weight.count(reader)? as usize; + result += weight.count(reader.as_ref())? as usize; } Ok(result) } diff --git a/src/query/range_query/range_query.rs b/src/query/range_query/range_query.rs index a597c8dca..ffdec13d8 100644 --- a/src/query/range_query/range_query.rs +++ b/src/query/range_query/range_query.rs @@ -5,13 +5,15 @@ use common::bounds::{map_bound, BoundsRange}; use common::BitSet; use super::range_query_fastfield::FastFieldRangeWeight; -use crate::index::SegmentReader; +use crate::index::{InvertedIndexReader as _, SegmentReader}; use crate::query::explanation::does_not_match; use crate::query::range_query::is_type_valid_for_fastfield_range_query; -use crate::query::{BitSetDocSet, ConstScorer, EnableScoring, Explanation, Query, Scorer, Weight}; -use crate::schema::{Field, IndexRecordOption, Term, Type}; +use crate::query::{ + box_scorer, BitSetDocSet, ConstScorer, EnableScoring, Explanation, Query, Scorer, Weight, +}; +use crate::schema::{Field, Term, Type}; use crate::termdict::{TermDictionary, TermStreamer}; -use crate::{DocId, Score}; +use crate::{DocId, DocSet, Score}; /// `RangeQuery` matches all documents that have at least one term within a defined range. 
/// @@ -212,7 +214,7 @@ impl InvertedIndexRangeWeight { } impl Weight for InvertedIndexRangeWeight { - fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { + fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result> { let max_doc = reader.max_doc(); let mut doc_bitset = BitSet::with_max_value(max_doc); @@ -228,24 +230,13 @@ impl Weight for InvertedIndexRangeWeight { } processed_count += 1; let term_info = term_range.value(); - let mut block_segment_postings = inverted_index - .read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic)?; - loop { - let docs = block_segment_postings.docs(); - if docs.is_empty() { - break; - } - for &doc in block_segment_postings.docs() { - doc_bitset.insert(doc); - } - block_segment_postings.advance(); - } + inverted_index.fill_bitset_from_terminfo(term_info, &mut doc_bitset)?; } let doc_bitset = BitSetDocSet::from(doc_bitset); - Ok(Box::new(ConstScorer::new(doc_bitset, boost))) + Ok(box_scorer(ConstScorer::new(doc_bitset, boost))) } - fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result { + fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result { let mut scorer = self.scorer(reader, 1.0)?; if scorer.seek(doc) != doc { return Err(does_not_match(doc)); @@ -686,7 +677,7 @@ mod tests { .weight(EnableScoring::disabled_from_schema(&schema)) .unwrap(); let range_scorer = range_weight - .scorer(&searcher.segment_readers()[0], 1.0f32) + .scorer(searcher.segment_readers()[0].as_ref(), 1.0f32) .unwrap(); range_scorer }; diff --git a/src/query/range_query/range_query_fastfield.rs b/src/query/range_query/range_query_fastfield.rs index 5ac715277..62d744a45 100644 --- a/src/query/range_query/range_query_fastfield.rs +++ b/src/query/range_query/range_query_fastfield.rs @@ -13,7 +13,8 @@ use common::bounds::{BoundsRange, TransformBound}; use super::fast_field_range_doc_set::RangeDocSet; use crate::query::{ - AllScorer, ConstScorer, EmptyScorer, EnableScoring, 
Explanation, Query, Scorer, Weight, + box_scorer, AllScorer, ConstScorer, EmptyScorer, EnableScoring, Explanation, Query, Scorer, + Weight, }; use crate::schema::{Type, ValueBytes}; use crate::{DocId, DocSet, Score, SegmentReader, TantivyError, Term}; @@ -52,10 +53,10 @@ impl FastFieldRangeWeight { } impl Weight for FastFieldRangeWeight { - fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { + fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result> { // Check if both bounds are Bound::Unbounded if self.bounds.is_unbounded() { - return Ok(Box::new(AllScorer::new(reader.max_doc()))); + return Ok(box_scorer(AllScorer::new(reader.max_doc()))); } let term = self @@ -95,7 +96,7 @@ impl Weight for FastFieldRangeWeight { let Some(str_dict_column): Option = reader.fast_fields().str(&field_name)? else { - return Ok(Box::new(EmptyScorer)); + return Ok(box_scorer(EmptyScorer)); }; let dict = str_dict_column.dictionary(); @@ -107,7 +108,7 @@ impl Weight for FastFieldRangeWeight { let Some((column, _col_type)) = fast_field_reader .u64_lenient_for_type(Some(&[ColumnType::Str]), &field_name)? else { - return Ok(Box::new(EmptyScorer)); + return Ok(box_scorer(EmptyScorer)); }; search_on_u64_ff(column, boost, BoundsRange::new(lower_bound, upper_bound)) } @@ -119,7 +120,7 @@ impl Weight for FastFieldRangeWeight { let Some((column, _col_type)) = fast_field_reader .u64_lenient_for_type(Some(&[ColumnType::DateTime]), &field_name)? else { - return Ok(Box::new(EmptyScorer)); + return Ok(box_scorer(EmptyScorer)); }; let bounds = bounds.map_bound(|term| term.as_date().unwrap().to_u64()); search_on_u64_ff( @@ -146,7 +147,7 @@ impl Weight for FastFieldRangeWeight { let Some(ip_addr_column): Option> = reader.fast_fields().column_opt(&field_name)? 
else { - return Ok(Box::new(EmptyScorer)); + return Ok(box_scorer(EmptyScorer)); }; let value_range = bound_range_inclusive_ip( &bounds.lower_bound, @@ -155,11 +156,11 @@ impl Weight for FastFieldRangeWeight { ip_addr_column.max_value(), ); let docset = RangeDocSet::new(value_range, ip_addr_column); - Ok(Box::new(ConstScorer::new(docset, boost))) + Ok(box_scorer(ConstScorer::new(docset, boost))) } else if field_type.is_str() { let Some(str_dict_column): Option = reader.fast_fields().str(&field_name)? else { - return Ok(Box::new(EmptyScorer)); + return Ok(box_scorer(EmptyScorer)); }; let dict = str_dict_column.dictionary(); @@ -171,7 +172,7 @@ impl Weight for FastFieldRangeWeight { let Some((column, _col_type)) = fast_field_reader.u64_lenient_for_type(None, &field_name)? else { - return Ok(Box::new(EmptyScorer)); + return Ok(box_scorer(EmptyScorer)); }; search_on_u64_ff(column, boost, BoundsRange::new(lower_bound, upper_bound)) } else if field_type.is_bytes() { @@ -228,7 +229,7 @@ impl Weight for FastFieldRangeWeight { &field_name, )? else { - return Ok(Box::new(EmptyScorer)); + return Ok(box_scorer(EmptyScorer)); }; search_on_u64_ff( column, @@ -238,7 +239,7 @@ impl Weight for FastFieldRangeWeight { } } - fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result { + fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result { let mut scorer = self.scorer(reader, 1.0)?; if scorer.seek(doc) != doc { return Err(TantivyError::InvalidArgument(format!( @@ -255,7 +256,7 @@ impl Weight for FastFieldRangeWeight { /// /// Convert into fast field value space and search. fn search_on_json_numerical_field( - reader: &SegmentReader, + reader: &dyn SegmentReader, field_name: &str, typ: Type, bounds: BoundsRange>>, @@ -269,7 +270,7 @@ fn search_on_json_numerical_field( let Some((column, col_type)) = fast_field_reader.u64_lenient_for_type(allowed_column_types, field_name)? 
else { - return Ok(Box::new(EmptyScorer)); + return Ok(box_scorer(EmptyScorer)); }; let actual_column_type: NumericalType = col_type .numerical_type() @@ -427,18 +428,18 @@ fn search_on_u64_ff( ) .unwrap_or(1..=0); // empty range if value_range.is_empty() { - return Ok(Box::new(EmptyScorer)); + return Ok(box_scorer(EmptyScorer)); } if col_min_value >= *value_range.start() && col_max_value <= *value_range.end() { // all values in the column are within the range. if column.index.get_cardinality() == Cardinality::Full { if boost != 1.0f32 { - return Ok(Box::new(ConstScorer::new( + return Ok(box_scorer(ConstScorer::new( AllScorer::new(column.num_docs()), boost, ))); } else { - return Ok(Box::new(AllScorer::new(column.num_docs()))); + return Ok(box_scorer(AllScorer::new(column.num_docs()))); } } else { // TODO Make it a field presence request for that specific column @@ -446,7 +447,7 @@ fn search_on_u64_ff( } let docset = RangeDocSet::new(value_range, column); - Ok(Box::new(ConstScorer::new(docset, boost))) + Ok(box_scorer(ConstScorer::new(docset, boost))) } /// Returns true if the type maps to a u64 fast field diff --git a/src/query/scorer.rs b/src/query/scorer.rs index e91fc2fbc..b4fcdfa47 100644 --- a/src/query/scorer.rs +++ b/src/query/scorer.rs @@ -1,9 +1,11 @@ +use std::mem::{transmute_copy, ManuallyDrop}; use std::ops::DerefMut; use downcast_rs::impl_downcast; use crate::docset::DocSet; -use crate::Score; +use crate::query::Explanation; +use crate::{DocId, Score, TERMINATED}; /// Scored set of documents matching a query within a specific segment. /// @@ -13,6 +15,53 @@ pub trait Scorer: downcast_rs::Downcast + DocSet + 'static { /// /// This method will perform a bit of computation and is not cached. fn score(&mut self) -> Score; + + /// Calls `callback` with all of the `(doc, score)` for which score + /// is exceeding a given threshold. + /// + /// This method is useful for the TopDocs collector. 
+ /// For all docsets, the blanket implementation has the benefit + /// of prefiltering (doc, score) pairs, avoiding the + /// virtual dispatch cost. + /// + /// More importantly, it makes it possible for scorers to implement + /// important optimization (e.g. BlockWAND for union). + fn for_each_pruning( + &mut self, + threshold: Score, + callback: &mut dyn FnMut(DocId, Score) -> Score, + ) { + for_each_pruning_scorer_default_impl(self, threshold, callback); + } + + /// Calls `callback` with all of the `(doc, score)` in the scorer. + fn for_each(&mut self, callback: &mut dyn FnMut(DocId, Score)) { + let mut doc = self.doc(); + while doc != TERMINATED { + callback(doc, self.score()); + doc = self.advance(); + } + } + + /// Returns an explanation for the score of the current document. + fn explain(&mut self) -> Explanation { + let score = self.score(); + let name = std::any::type_name_of_val(self); + Explanation::new(name, score) + } +} + +/// Boxes a scorer. Prefer this to Box::new as it avoids double boxing +/// when TScorer is already a Box. +pub fn box_scorer(scorer: TScorer) -> Box { + if std::any::TypeId::of::() == std::any::TypeId::of::>() { + unsafe { + let forget_me = ManuallyDrop::new(scorer); + transmute_copy::>(&forget_me) + } + } else { + Box::new(scorer) + } } impl_downcast!(Scorer); @@ -22,4 +71,46 @@ impl Scorer for Box { fn score(&mut self) -> Score { self.deref_mut().score() } + + fn for_each_pruning( + &mut self, + threshold: Score, + callback: &mut dyn FnMut(DocId, Score) -> Score, + ) { + self.deref_mut().for_each_pruning(threshold, callback); + } + + fn for_each(&mut self, callback: &mut dyn FnMut(DocId, Score)) { + self.deref_mut().for_each(callback); + } + + // Forward to the inner scorer instead of falling back to the trait's default explanation. + fn explain(&mut self) -> Explanation { + self.deref_mut().explain() + } +} + +/// Calls `callback` with all of the `(doc, score)` for which score +/// is exceeding a given threshold. +/// +/// This method is useful for the [`TopDocs`](crate::collector::TopDocs) collector.
+/// For all docsets, the blanket implementation has the benefit +/// of prefiltering (doc, score) pairs, avoiding the +/// virtual dispatch cost. +/// +/// More importantly, it makes it possible for scorers to implement +/// important optimization (e.g. BlockWAND for union). +pub(crate) fn for_each_pruning_scorer_default_impl( + scorer: &mut TScorer, + mut threshold: Score, + callback: &mut dyn FnMut(DocId, Score) -> Score, +) { + let mut doc = scorer.doc(); + while doc != TERMINATED { + let score = scorer.score(); + if score > threshold { + threshold = callback(doc, score); + } + doc = scorer.advance(); + } } diff --git a/src/query/term_query/mod.rs b/src/query/term_query/mod.rs index 0811725be..0f9978b62 100644 --- a/src/query/term_query/mod.rs +++ b/src/query/term_query/mod.rs @@ -3,10 +3,10 @@ mod term_scorer; mod term_weight; pub use self::term_query::TermQuery; -pub use self::term_scorer::TermScorer; +pub use self::term_scorer::{BoxedTermScorer, TermScorer}; + #[cfg(test)] mod tests { - use crate::collector::TopDocs; use crate::docset::DocSet; use crate::postings::compression::COMPRESSION_BLOCK_SIZE; diff --git a/src/query/term_query/term_scorer.rs b/src/query/term_query/term_scorer.rs index a75648348..8da548662 100644 --- a/src/query/term_query/term_scorer.rs +++ b/src/query/term_query/term_scorer.rs @@ -1,23 +1,44 @@ use crate::docset::DocSet; use crate::fieldnorm::FieldNormReader; -use crate::postings::{FreqReadingOption, Postings, SegmentPostings}; +use crate::postings::{Postings, PostingsWithBlockMax, SegmentPostings}; use crate::query::bm25::Bm25Weight; -use crate::query::{Explanation, Scorer}; +use crate::query::{box_scorer, Explanation, Scorer}; use crate::{DocId, Score}; +/// Type-erased term scorer guaranteed to wrap a Tantivy [`TermScorer`]. +pub struct BoxedTermScorer(Box); + +impl BoxedTermScorer { + /// Creates a boxed term scorer from a concrete Tantivy [`TermScorer`]. 
+ pub fn new(term_scorer: TermScorer) -> BoxedTermScorer { + BoxedTermScorer(box_scorer(term_scorer)) + } + + /// Converts this boxed term scorer into a generic boxed scorer. + pub fn into_boxed_scorer(self) -> Box { + self.0 + } +} + #[derive(Clone)] -pub struct TermScorer { - postings: SegmentPostings, +/// Scorer for a single term over a postings list. +/// +/// `TermScorer` combines postings data, fieldnorms, and BM25 term weight to +/// produce per-document scores. +pub struct TermScorer { + postings: TPostings, fieldnorm_reader: FieldNormReader, similarity_weight: Bm25Weight, } -impl TermScorer { +impl TermScorer { + /// Creates a new term scorer from postings, fieldnorm reader, and BM25 + /// term weight. pub fn new( - postings: SegmentPostings, + postings: TPostings, fieldnorm_reader: FieldNormReader, similarity_weight: Bm25Weight, - ) -> TermScorer { + ) -> TermScorer { TermScorer { postings, fieldnorm_reader, @@ -25,10 +46,38 @@ impl TermScorer { } } - pub(crate) fn seek_block(&mut self, target_doc: DocId) { - self.postings.block_cursor.seek_block(target_doc); + /// Returns the term frequency for the current document. + pub fn term_freq(&self) -> u32 { + self.postings.term_freq() } + /// Returns the fieldnorm id for the current document. + pub fn fieldnorm_id(&self) -> u8 { + self.fieldnorm_reader.fieldnorm_id(self.doc()) + } + + /// Returns the maximum score upper bound for this scorer. + pub fn max_score(&self) -> Score { + self.similarity_weight.max_score() + } +} + +impl TermScorer { + pub(crate) fn last_doc_in_block(&self) -> DocId { + self.postings.last_doc_in_block() + } + + /// Advances the term scorer to the block containing target_doc and returns + /// an upperbound for the score all of the documents in the block. + /// (BlockMax). This score is not guaranteed to be the + /// effective maximum score of the block. 
+ pub(crate) fn seek_block_max(&mut self, target_doc: DocId) -> Score { + self.postings + .seek_block_max(target_doc, &self.fieldnorm_reader, &self.similarity_weight) + } +} + +impl TermScorer { #[cfg(test)] pub fn create_for_test( doc_and_tfs: &[(DocId, u32)], @@ -49,55 +98,9 @@ impl TermScorer { let fieldnorm_reader = FieldNormReader::for_test(fieldnorms); TermScorer::new(segment_postings, fieldnorm_reader, similarity_weight) } - - /// See `FreqReadingOption`. - pub(crate) fn freq_reading_option(&self) -> FreqReadingOption { - self.postings.block_cursor.freq_reading_option() - } - - /// Returns the maximum score for the current block. - /// - /// In some rare case, the result may not be exact. In this case a lower value is returned, - /// (and may lead us to return a lesser document). - /// - /// At index time, we store the (fieldnorm_id, term frequency) pair that maximizes the - /// score assuming the average fieldnorm computed on this segment. - /// - /// Though extremely rare, it is theoretically possible that the actual average fieldnorm - /// is different enough from the current segment average fieldnorm that the maximum over a - /// specific is achieved on a different document. - /// - /// (The result is on the other hand guaranteed to be correct if there is only one segment). 
- pub fn block_max_score(&mut self) -> Score { - self.postings - .block_cursor - .block_max_score(&self.fieldnorm_reader, &self.similarity_weight) - } - - pub fn term_freq(&self) -> u32 { - self.postings.term_freq() - } - - pub fn fieldnorm_id(&self) -> u8 { - self.fieldnorm_reader.fieldnorm_id(self.doc()) - } - - pub fn explain(&self) -> Explanation { - let fieldnorm_id = self.fieldnorm_id(); - let term_freq = self.term_freq(); - self.similarity_weight.explain(fieldnorm_id, term_freq) - } - - pub fn max_score(&self) -> Score { - self.similarity_weight.max_score() - } - - pub fn last_doc_in_block(&self) -> DocId { - self.postings.block_cursor.skip_reader().last_doc_in_block() - } } -impl DocSet for TermScorer { +impl DocSet for TermScorer { #[inline] fn advance(&mut self) -> DocId { self.postings.advance() @@ -119,13 +122,19 @@ impl DocSet for TermScorer { } } -impl Scorer for TermScorer { +impl Scorer for TermScorer { #[inline] fn score(&mut self) -> Score { let fieldnorm_id = self.fieldnorm_id(); let term_freq = self.term_freq(); self.similarity_weight.score(fieldnorm_id, term_freq) } + + fn explain(&mut self) -> Explanation { + let fieldnorm_id = self.fieldnorm_id(); + let term_freq = self.term_freq(); + self.similarity_weight.explain(fieldnorm_id, term_freq) + } } #[cfg(test)] @@ -134,7 +143,7 @@ mod tests { use crate::index::SegmentId; use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN; - use crate::merge_policy::NoMergePolicy; + use crate::indexer::NoMergePolicy; use crate::postings::compression::COMPRESSION_BLOCK_SIZE; use crate::query::term_query::TermScorer; use crate::query::{Bm25Weight, EnableScoring, Scorer, TermQuery}; @@ -155,7 +164,7 @@ mod tests { crate::assert_nearly_equals!(max_scorer, 1.3990127); assert_eq!(term_scorer.doc(), 2); assert_eq!(term_scorer.term_freq(), 3); - assert_nearly_equals!(term_scorer.block_max_score(), 1.3676447); + assert_nearly_equals!(term_scorer.seek_block_max(2), 1.3676447); 
assert_nearly_equals!(term_scorer.score(), 1.0892314); assert_eq!(term_scorer.advance(), 3); assert_eq!(term_scorer.doc(), 3); @@ -170,9 +179,9 @@ mod tests { } #[test] - fn test_term_scorer_shallow_advance() -> crate::Result<()> { + fn test_term_scorer_shallow_advance() { let bm25_weight = Bm25Weight::for_one_term(300, 1024, 10.0); - let mut doc_and_tfs = vec![]; + let mut doc_and_tfs = Vec::new(); for i in 0u32..300u32 { let doc = i * 10; doc_and_tfs.push((doc, 1u32 + doc % 3u32)); @@ -180,11 +189,10 @@ mod tests { let fieldnorms: Vec = std::iter::repeat_n(10u32, 3_000).collect(); let mut term_scorer = TermScorer::create_for_test(&doc_and_tfs, &fieldnorms, bm25_weight); assert_eq!(term_scorer.doc(), 0u32); - term_scorer.seek_block(1289); + term_scorer.seek_block_max(1289); assert_eq!(term_scorer.doc(), 0u32); term_scorer.seek(1289); assert_eq!(term_scorer.doc(), 1290); - Ok(()) } proptest! { @@ -218,7 +226,7 @@ mod tests { let docs: Vec = (0..term_doc_freq).map(|doc| doc as DocId).collect(); for block in docs.chunks(COMPRESSION_BLOCK_SIZE) { - let block_max_score: Score = term_scorer.block_max_score(); + let block_max_score: Score = term_scorer.seek_block_max(0); let mut block_max_score_computed: Score = 0.0; for &doc in block { assert_eq!(term_scorer.doc(), doc); @@ -246,25 +254,26 @@ mod tests { let fieldnorms: Vec = std::iter::repeat_n(20u32, 300).collect(); let bm25_weight = Bm25Weight::for_one_term(10, 129, 20.0); let mut docs = TermScorer::create_for_test(&doc_tfs[..], &fieldnorms[..], bm25_weight); - assert_nearly_equals!(docs.block_max_score(), 2.5161593); - docs.seek_block(135); - assert_nearly_equals!(docs.block_max_score(), 3.4597192); - docs.seek_block(256); + assert_nearly_equals!(docs.seek_block_max(0), 2.5161593); + assert_nearly_equals!(docs.seek_block_max(135), 3.4597192); // the block is not loaded yet. 
- assert_nearly_equals!(docs.block_max_score(), 5.2971773); + assert_nearly_equals!(docs.seek_block_max(256), 5.2971773); assert_eq!(256, docs.seek(256)); - assert_nearly_equals!(docs.block_max_score(), 3.9539647); + assert_nearly_equals!(docs.seek_block_max(256), 3.9539647); } - fn test_block_wand_aux(term_query: &TermQuery, searcher: &Searcher) -> crate::Result<()> { - let term_weight = - term_query.specialized_weight(EnableScoring::enabled_from_searcher(searcher))?; + fn test_block_wand_aux(term_query: &TermQuery, searcher: &Searcher) { + let term_weight = term_query + .specialized_weight(EnableScoring::enabled_from_searcher(searcher)) + .unwrap(); for reader in searcher.segment_readers() { let mut block_max_scores = vec![]; let mut block_max_scores_b = vec![]; let mut docs = vec![]; { - let mut term_scorer = term_weight.term_scorer_for_test(reader, 1.0)?.unwrap(); + let mut term_scorer = term_weight + .term_scorer_for_test(reader.as_ref(), 1.0) + .unwrap(); while term_scorer.doc() != TERMINATED { let mut score = term_scorer.score(); docs.push(term_scorer.doc()); @@ -278,10 +287,12 @@ mod tests { } } { - let mut term_scorer = term_weight.term_scorer_for_test(reader, 1.0)?.unwrap(); + let mut term_scorer = term_weight + .term_scorer_for_test(reader.as_ref(), 1.0) + .unwrap(); for d in docs { - term_scorer.seek_block(d); - block_max_scores_b.push(term_scorer.block_max_score()); + let block_max_score = term_scorer.seek_block_max(d); + block_max_scores_b.push(block_max_score); } } for (l, r) in block_max_scores @@ -292,18 +303,18 @@ mod tests { assert_nearly_equals!(l, r); } } - Ok(()) } #[ignore] #[test] - fn test_block_wand_long_test() -> crate::Result<()> { + fn test_block_wand_long_test() { let mut schema_builder = Schema::builder(); let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); - let mut writer: IndexWriter = - index.writer_with_num_threads(3, 3 * 
MEMORY_BUDGET_NUM_BYTES_MIN)?; + let mut writer: IndexWriter = index + .writer_with_num_threads(3, 3 * MEMORY_BUDGET_NUM_BYTES_MIN) + .unwrap(); use rand::Rng; let mut rng = rand::rng(); writer.set_merge_policy(Box::new(NoMergePolicy)); @@ -311,15 +322,15 @@ mod tests { let term_freq = rng.random_range(1..10000); let words: Vec<&str> = std::iter::repeat_n("bbbb", term_freq).collect(); let text = words.join(" "); - writer.add_document(doc!(text_field=>text))?; + writer.add_document(doc!(text_field=>text)).unwrap(); } - writer.commit()?; + writer.commit().unwrap(); let term_query = TermQuery::new( Term::from_field_text(text_field, "bbbb"), IndexRecordOption::WithFreqs, ); let segment_ids: Vec; - let reader = index.reader()?; + let reader = index.reader().unwrap(); { let searcher = reader.searcher(); segment_ids = searcher @@ -327,15 +338,14 @@ mod tests { .iter() .map(|segment| segment.segment_id()) .collect(); - test_block_wand_aux(&term_query, &searcher)?; + test_block_wand_aux(&term_query, &searcher); } writer.merge(&segment_ids[..]).wait().unwrap(); { - reader.reload()?; + reader.reload().unwrap(); let searcher = reader.searcher(); assert_eq!(searcher.segment_readers().len(), 1); - test_block_wand_aux(&term_query, &searcher)?; + test_block_wand_aux(&term_query, &searcher); } - Ok(()) } } diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index 89b527cca..3dccdb7a2 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -1,12 +1,17 @@ -use super::term_scorer::TermScorer; +use std::io; + use crate::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN}; use crate::fieldnorm::FieldNormReader; -use crate::index::SegmentReader; -use crate::postings::SegmentPostings; +use crate::index::{ + try_downcast_and_call, InvertedIndexReader, SegmentReader, TypedInvertedIndexReaderCb, +}; +use crate::postings::TermInfo; use crate::query::bm25::Bm25Weight; use crate::query::explanation::does_not_match; -use 
crate::query::weight::{for_each_docset_buffered, for_each_scorer}; -use crate::query::{AllScorer, AllWeight, EmptyScorer, Explanation, Scorer, Weight}; +use crate::query::weight::for_each_docset_buffered; +use crate::query::{ + box_scorer, AllScorer, AllWeight, BoxedTermScorer, EmptyScorer, Explanation, Scorer, Weight, +}; use crate::schema::IndexRecordOption; use crate::{DocId, Score, TantivyError, Term}; @@ -18,29 +23,58 @@ pub struct TermWeight { } enum TermOrEmptyOrAllScorer { - TermScorer(Box), + TermScorer(BoxedTermScorer), Empty, AllMatch(AllScorer), } +struct BuildTermScorer<'a> { + term_info: &'a TermInfo, + option: IndexRecordOption, + fieldnorm_reader: FieldNormReader, + similarity_weight: Bm25Weight, +} + +impl TypedInvertedIndexReaderCb> for BuildTermScorer<'_> { + fn call(&mut self, reader: &I) -> io::Result { + let postings = reader.read_postings_from_terminfo(self.term_info, self.option)?; + self.build_scorer(postings) + } +} + +impl BuildTermScorer<'_> { + fn build_scorer( + &self, + postings: TPostings, + ) -> io::Result { + let term_scorer = super::TermScorer::new( + postings, + self.fieldnorm_reader.clone(), + self.similarity_weight.clone(), + ); + Ok(BoxedTermScorer::new(term_scorer)) + } +} + impl TermOrEmptyOrAllScorer { pub fn into_boxed_scorer(self) -> Box { match self { - TermOrEmptyOrAllScorer::TermScorer(scorer) => scorer, - TermOrEmptyOrAllScorer::Empty => Box::new(EmptyScorer), - TermOrEmptyOrAllScorer::AllMatch(scorer) => Box::new(scorer), + TermOrEmptyOrAllScorer::TermScorer(scorer) => scorer.into_boxed_scorer(), + TermOrEmptyOrAllScorer::Empty => box_scorer(EmptyScorer), + TermOrEmptyOrAllScorer::AllMatch(scorer) => box_scorer(scorer), } } } impl Weight for TermWeight { - fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { + fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result> { Ok(self.specialized_scorer(reader, boost)?.into_boxed_scorer()) } - fn explain(&self, reader: &SegmentReader, 
doc: DocId) -> crate::Result { + fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result { match self.specialized_scorer(reader, 1.0)? { - TermOrEmptyOrAllScorer::TermScorer(mut term_scorer) => { + TermOrEmptyOrAllScorer::TermScorer(term_scorer) => { + let mut term_scorer = term_scorer.into_boxed_scorer(); if term_scorer.doc() > doc || term_scorer.seek(doc) != doc { return Err(does_not_match(doc)); } @@ -53,7 +87,7 @@ impl Weight for TermWeight { } } - fn count(&self, reader: &SegmentReader) -> crate::Result { + fn count(&self, reader: &dyn SegmentReader) -> crate::Result { if let Some(alive_bitset) = reader.alive_bitset() { Ok(self.scorer(reader, 1.0)?.count(alive_bitset)) } else { @@ -68,16 +102,17 @@ impl Weight for TermWeight { /// `DocSet` and push the scored documents to the collector. fn for_each( &self, - reader: &SegmentReader, + reader: &dyn SegmentReader, callback: &mut dyn FnMut(DocId, Score), ) -> crate::Result<()> { match self.specialized_scorer(reader, 1.0)? { - TermOrEmptyOrAllScorer::TermScorer(mut term_scorer) => { - for_each_scorer(&mut *term_scorer, callback); + TermOrEmptyOrAllScorer::TermScorer(term_scorer) => { + let mut term_scorer = term_scorer.into_boxed_scorer(); + term_scorer.for_each(callback); } TermOrEmptyOrAllScorer::Empty => {} TermOrEmptyOrAllScorer::AllMatch(mut all_scorer) => { - for_each_scorer(&mut all_scorer, callback); + all_scorer.for_each(callback); } } Ok(()) @@ -87,11 +122,12 @@ impl Weight for TermWeight { /// `DocSet` and push the scored documents to the collector. fn for_each_no_score( &self, - reader: &SegmentReader, + reader: &dyn SegmentReader, callback: &mut dyn FnMut(&[DocId]), ) -> crate::Result<()> { match self.specialized_scorer(reader, 1.0)? 
{ - TermOrEmptyOrAllScorer::TermScorer(mut term_scorer) => { + TermOrEmptyOrAllScorer::TermScorer(term_scorer) => { + let mut term_scorer = term_scorer.into_boxed_scorer(); let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN]; for_each_docset_buffered(&mut term_scorer, &mut buffer, callback); } @@ -118,17 +154,13 @@ impl Weight for TermWeight { fn for_each_pruning( &self, threshold: Score, - reader: &SegmentReader, + reader: &dyn SegmentReader, callback: &mut dyn FnMut(DocId, Score) -> Score, ) -> crate::Result<()> { let specialized_scorer = self.specialized_scorer(reader, 1.0)?; match specialized_scorer { TermOrEmptyOrAllScorer::TermScorer(term_scorer) => { - crate::query::boolean_query::block_wand_single_scorer( - *term_scorer, - threshold, - callback, - ); + reader.for_each_pruning(threshold, term_scorer.into_boxed_scorer(), callback); } TermOrEmptyOrAllScorer::Empty => {} TermOrEmptyOrAllScorer::AllMatch(_) => { @@ -166,19 +198,25 @@ impl TermWeight { #[cfg(test)] pub(crate) fn term_scorer_for_test( &self, - reader: &SegmentReader, + reader: &dyn SegmentReader, boost: Score, - ) -> crate::Result> { - let scorer = self.specialized_scorer(reader, boost)?; - Ok(match scorer { - TermOrEmptyOrAllScorer::TermScorer(scorer) => Some(*scorer), + ) -> Option { + let scorer = self.specialized_scorer(reader, boost).unwrap(); + match scorer { + TermOrEmptyOrAllScorer::TermScorer(term_scorer) => { + let term_scorer = term_scorer + .into_boxed_scorer() + .downcast::() + .ok()?; + Some(*term_scorer) + } _ => None, - }) + } } fn specialized_scorer( &self, - reader: &SegmentReader, + reader: &dyn SegmentReader, boost: Score, ) -> crate::Result { let field = self.term.field(); @@ -196,22 +234,25 @@ impl TermWeight { ))); } - let segment_postings: SegmentPostings = - inverted_index.read_postings_from_terminfo(&term_info, self.index_record_option)?; - let fieldnorm_reader = self.fieldnorm_reader(reader)?; let similarity_weight = self.similarity_weight.boost_by(boost); - 
Ok(TermOrEmptyOrAllScorer::TermScorer(Box::new( - TermScorer::new(segment_postings, fieldnorm_reader, similarity_weight), - ))) + let mut term_scorer_builder = BuildTermScorer { + term_info: &term_info, + option: self.index_record_option, + fieldnorm_reader, + similarity_weight, + }; + let term_scorer = try_downcast_and_call(inverted_index.as_ref(), &mut term_scorer_builder)?; + + Ok(TermOrEmptyOrAllScorer::TermScorer(term_scorer)) } - fn fieldnorm_reader(&self, segment_reader: &SegmentReader) -> crate::Result { + fn fieldnorm_reader( + &self, + segment_reader: &dyn SegmentReader, + ) -> crate::Result { if self.scoring_enabled { - if let Some(field_norm_reader) = segment_reader - .fieldnorms_readers() - .get_field(self.term.field())? - { + if let Ok(field_norm_reader) = segment_reader.get_fieldnorms_reader(self.term.field()) { return Ok(field_norm_reader); } } diff --git a/src/query/union/bitset_union.rs b/src/query/union/bitset_union.rs index 8af1703ee..a7a326a98 100644 --- a/src/query/union/bitset_union.rs +++ b/src/query/union/bitset_union.rs @@ -1,7 +1,7 @@ use std::cell::RefCell; use crate::docset::DocSet; -use crate::postings::Postings; +use crate::postings::{DocFreq, Postings}; use crate::query::BitSetDocSet; use crate::DocId; @@ -16,6 +16,9 @@ pub struct BitSetPostingUnion { docsets: RefCell>, /// The already unionized BitSet of the docsets bitset: BitSetDocSet, + /// The total number of documents in the union (regardless of the position we are in the + /// bitset). 
+ doc_freq: u32, } impl BitSetPostingUnion { @@ -23,9 +26,11 @@ impl BitSetPostingUnion { docsets: Vec, bitset: BitSetDocSet, ) -> BitSetPostingUnion { + let doc_freq = bitset.doc_freq(); BitSetPostingUnion { docsets: RefCell::new(docsets), bitset, + doc_freq, } } } @@ -46,6 +51,10 @@ impl Postings for BitSetPostingUnion { term_freq } + fn has_freq(&self) -> bool { + true + } + fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec) { let curr_doc = self.bitset.doc(); let mut docsets = self.docsets.borrow_mut(); @@ -64,6 +73,10 @@ impl Postings for BitSetPostingUnion { output.sort_unstable(); output.dedup(); } + + fn doc_freq(&self) -> DocFreq { + DocFreq::Exact(self.doc_freq) + } } impl DocSet for BitSetPostingUnion { diff --git a/src/query/union/buffered_union.rs b/src/query/union/buffered_union.rs index e4cfe0ba3..b64612e94 100644 --- a/src/query/union/buffered_union.rs +++ b/src/query/union/buffered_union.rs @@ -31,7 +31,7 @@ where P: FnMut(&mut T) -> bool { /// Creates a `DocSet` that iterate through the union of two or more `DocSet`s. pub struct BufferedUnionScorer { /// Active scorers (already filtered of `TERMINATED`). - docsets: Vec, + scorers: Vec, /// Sliding window presence map for upcoming docs. /// /// There are `HORIZON_NUM_TINYBITSETS` buckets, each covering @@ -46,6 +46,8 @@ pub struct BufferedUnionScorer { /// hit the same doc within the buffered window. scores: Box<[TScoreCombiner; HORIZON as usize]>, /// Start doc ID (inclusive) of the current sliding window. + /// Holds an out-of-range sentinel while no window is loaded, which is the case for a + /// freshly created BufferedUnionScorer. window_start_doc: DocId, /// Current doc ID of the union. doc: DocId, @@ -81,51 +83,81 @@ fn refill( } impl BufferedUnionScorer { + /// Returns the underlying scorers in the union. + pub fn into_scorers(self) -> Vec { + self.scorers + } + + /// Accessor for the underlying scorers in the union. + pub fn scorers(&self) -> &[TScorer] { + &self.scorers[..]
+ } + /// num_docs is the number of documents in the segment. pub(crate) fn build( docsets: Vec, score_combiner_fn: impl FnOnce() -> TScoreCombiner, num_docs: u32, ) -> BufferedUnionScorer { - let non_empty_docsets: Vec = docsets + let score_combiner = score_combiner_fn(); + let mut non_empty_docsets: Vec = docsets .into_iter() .filter(|docset| docset.doc() != TERMINATED) .collect(); - let mut union = BufferedUnionScorer { - docsets: non_empty_docsets, - bitsets: Box::new([TinySet::empty(); HORIZON_NUM_TINYBITSETS]), - scores: Box::new([score_combiner_fn(); HORIZON as usize]), - bucket_idx: HORIZON_NUM_TINYBITSETS, - window_start_doc: 0, - doc: 0, - score: 0.0, - num_docs, - }; - if union.refill() { - union.advance(); - } else { - union.doc = TERMINATED; + + let first_doc: DocId = non_empty_docsets + .iter() + .map(|docset| docset.doc()) + .min() + .unwrap_or(TERMINATED); + let mut score_combiner_cloned = score_combiner; + let mut i = 0; + while i < non_empty_docsets.len() { + let should_remove_docset: bool = { + let non_empty_docset = &mut non_empty_docsets[i]; + if non_empty_docset.doc() != first_doc { + false + } else { + score_combiner_cloned.update(non_empty_docset); + non_empty_docsets[i].advance() == TERMINATED + } + }; + if should_remove_docset { + non_empty_docsets.swap_remove(i); + } else { + i += 1; + } + } + let first_score: Score = score_combiner_cloned.score(); + BufferedUnionScorer { + scorers: non_empty_docsets, + bitsets: Box::new([TinySet::empty(); HORIZON_NUM_TINYBITSETS]), + scores: Box::new([score_combiner; HORIZON as usize]), + bucket_idx: HORIZON_NUM_TINYBITSETS, + // That way we will be detected as outside the window, + window_start_doc: u32::MAX - HORIZON, + doc: first_doc, + score: first_score, + num_docs, } - union } fn refill(&mut self) -> bool { - if let Some(min_doc) = self.docsets.iter().map(DocSet::doc).min() { - // Reset the sliding window to start at the smallest doc - // across all scorers and prebuffer within the horizon. 
- self.window_start_doc = min_doc; - self.bucket_idx = 0; - self.doc = min_doc; - refill( - &mut self.docsets, - &mut self.bitsets, - &mut self.scores, - min_doc, - ); - true - } else { - false - } + let Some(min_doc) = self.scorers.iter().map(DocSet::doc).min() else { + return false; + }; + // Reset the sliding window to start at the smallest doc + // across all scorers and prebuffer within the horizon. + self.window_start_doc = min_doc; + self.bucket_idx = 0; + self.doc = min_doc; + refill( + &mut self.scorers, + &mut self.bitsets, + &mut self.scores, + min_doc, + ); + true } #[inline] @@ -147,6 +179,7 @@ impl BufferedUnionScorer bool { // wrapping_sub, because target may be < window_start_doc + // in particular during initialization. let gap = target.wrapping_sub(self.window_start_doc); gap < HORIZON } @@ -216,11 +249,10 @@ where if self.doc >= target { return self.doc; } - let gap = target - self.window_start_doc; - if gap < HORIZON { + if self.is_in_horizon(target) { // Our value is within the buffered horizon. - // Skipping to corresponding bucket. + let gap = target.wrapping_sub(self.window_start_doc); let new_bucket_idx = gap as usize / 64; for obsolete_tinyset in &mut self.bitsets[self.bucket_idx..new_bucket_idx] { obsolete_tinyset.clear(); @@ -239,16 +271,14 @@ where doc } else { // clear the buffered info. - for obsolete_tinyset in self.bitsets.iter_mut() { - *obsolete_tinyset = TinySet::empty(); - } + self.bitsets.fill(TinySet::empty()); for score_combiner in self.scores.iter_mut() { score_combiner.clear(); } // The target is outside of the buffered horizon. // advance all docsets to a doc >= to the target. 
- unordered_drain_filter(&mut self.docsets, |docset| { + unordered_drain_filter(&mut self.scorers, |docset| { if docset.doc() < target { docset.seek(target); } @@ -285,7 +315,7 @@ where let mut is_hit = false; let mut min_new_target = TERMINATED; - for docset in self.docsets.iter_mut() { + for docset in self.scorers.iter_mut() { match docset.seek_danger(target) { SeekDangerResult::Found => { is_hit = true; @@ -315,11 +345,11 @@ where } fn size_hint(&self) -> u32 { - estimate_union(self.docsets.iter().map(DocSet::size_hint), self.num_docs) + estimate_union(self.scorers.iter().map(DocSet::size_hint), self.num_docs) } fn cost(&self) -> u64 { - self.docsets.iter().map(|docset| docset.cost()).sum() + self.scorers.iter().map(|docset| docset.cost()).sum() } // TODO Also implement `count` with deletes efficiently. @@ -327,21 +357,17 @@ where if self.doc == TERMINATED { return 0; } - let mut count = self.bitsets[self.bucket_idx..HORIZON_NUM_TINYBITSETS] + let mut count = 1 + self.bitsets[self.bucket_idx..HORIZON_NUM_TINYBITSETS] .iter() - .map(|bitset| bitset.len()) - .sum::() - + 1; - for bitset in self.bitsets.iter_mut() { - bitset.clear(); - } + .copied() + .map(TinySet::len) + .sum::(); while self.refill() { - count += self.bitsets.iter().map(|bitset| bitset.len()).sum::(); - for bitset in self.bitsets.iter_mut() { - bitset.clear(); - } + count += self.bitsets.iter().copied().map(TinySet::len).sum::(); + self.bitsets.fill(TinySet::empty()); } self.bucket_idx = HORIZON_NUM_TINYBITSETS; + self.doc = TERMINATED; count } } diff --git a/src/query/union/simple_union.rs b/src/query/union/simple_union.rs index b153a7f22..0457cae43 100644 --- a/src/query/union/simple_union.rs +++ b/src/query/union/simple_union.rs @@ -1,5 +1,5 @@ use crate::docset::{DocSet, TERMINATED}; -use crate::postings::Postings; +use crate::postings::{DocFreq, Postings}; use crate::DocId; /// A `SimpleUnion` is a `DocSet` that is the union of multiple `DocSet`. 
@@ -12,7 +12,11 @@ pub struct SimpleUnion { } impl SimpleUnion { - pub(crate) fn build(mut docsets: Vec) -> SimpleUnion { + /// Builds a `SimpleUnion` from multiple docsets. + /// + /// Exhausted docsets are filtered out, and the union is initialized at the + /// smallest current doc id across remaining docsets. + pub fn build(mut docsets: Vec) -> SimpleUnion { docsets.retain(|docset| docset.doc() != TERMINATED); let mut docset = SimpleUnion { docsets, doc: 0 }; @@ -56,6 +60,22 @@ impl Postings for SimpleUnion { term_freq } + fn has_freq(&self) -> bool { + true + } + + /// We do not know the actual document frequency, so we return + /// the maximum document frequency of the docsets. + fn doc_freq(&self) -> DocFreq { + let approximate_doc_freq = self + .docsets + .iter() + .map(|docset| u32::from(docset.doc_freq())) + .max() + .unwrap_or(0u32); + DocFreq::Approximate(approximate_doc_freq) + } + fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec) { for docset in &mut self.docsets { let doc = docset.doc(); diff --git a/src/query/weight.rs b/src/query/weight.rs index 23ff55c04..2ad2d822e 100644 --- a/src/query/weight.rs +++ b/src/query/weight.rs @@ -1,21 +1,9 @@ use super::Scorer; use crate::docset::COLLECT_BLOCK_BUFFER_LEN; use crate::index::SegmentReader; +use crate::query::explanation::does_not_match; use crate::query::Explanation; -use crate::{DocId, DocSet, Score, TERMINATED}; - -/// Iterates through all of the documents and scores matched by the DocSet -/// `DocSet`. -pub(crate) fn for_each_scorer( - scorer: &mut TScorer, - callback: &mut dyn FnMut(DocId, Score), -) { - let mut doc = scorer.doc(); - while doc != TERMINATED { - callback(doc, scorer.score()); - doc = scorer.advance(); - } -} +use crate::{DocId, DocSet, Score}; /// Iterates through all of the documents matched by the DocSet /// `DocSet`. 
@@ -34,31 +22,6 @@ pub(crate) fn for_each_docset_buffered( } } -/// Calls `callback` with all of the `(doc, score)` for which score -/// is exceeding a given threshold. -/// -/// This method is useful for the [`TopDocs`](crate::collector::TopDocs) collector. -/// For all docsets, the blanket implementation has the benefit -/// of prefiltering (doc, score) pairs, avoiding the -/// virtual dispatch cost. -/// -/// More importantly, it makes it possible for scorers to implement -/// important optimization (e.g. BlockWAND for union). -pub(crate) fn for_each_pruning_scorer( - scorer: &mut TScorer, - mut threshold: Score, - callback: &mut dyn FnMut(DocId, Score) -> Score, -) { - let mut doc = scorer.doc(); - while doc != TERMINATED { - let score = scorer.score(); - if score > threshold { - threshold = callback(doc, score); - } - doc = scorer.advance(); - } -} - /// A Weight is the specialization of a `Query` /// for a given set of segments. /// @@ -69,13 +32,19 @@ pub trait Weight: Send + Sync + 'static { /// `boost` is a multiplier to apply to the score. /// /// See [`Query`](crate::query::Query). - fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result>; + fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result>; /// Returns an [`Explanation`] for the given document. - fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result; + fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result { + let mut scorer = self.scorer(reader, 1.0)?; + if scorer.doc() > doc || scorer.seek(doc) != doc { + return Err(does_not_match(doc)); + } + Ok(scorer.explain()) + } /// Returns the number documents within the given [`SegmentReader`]. 
- fn count(&self, reader: &SegmentReader) -> crate::Result { + fn count(&self, reader: &dyn SegmentReader) -> crate::Result { let mut scorer = self.scorer(reader, 1.0)?; if let Some(alive_bitset) = reader.alive_bitset() { Ok(scorer.count(alive_bitset)) @@ -88,11 +57,11 @@ pub trait Weight: Send + Sync + 'static { /// `DocSet` and push the scored documents to the collector. fn for_each( &self, - reader: &SegmentReader, + reader: &dyn SegmentReader, callback: &mut dyn FnMut(DocId, Score), ) -> crate::Result<()> { let mut scorer = self.scorer(reader, 1.0)?; - for_each_scorer(scorer.as_mut(), callback); + scorer.for_each(callback); Ok(()) } @@ -100,7 +69,7 @@ pub trait Weight: Send + Sync + 'static { /// `DocSet` and push the scored documents to the collector. fn for_each_no_score( &self, - reader: &SegmentReader, + reader: &dyn SegmentReader, callback: &mut dyn FnMut(&[DocId]), ) -> crate::Result<()> { let mut docset = self.scorer(reader, 1.0)?; @@ -123,11 +92,11 @@ pub trait Weight: Send + Sync + 'static { fn for_each_pruning( &self, threshold: Score, - reader: &SegmentReader, + reader: &dyn SegmentReader, callback: &mut dyn FnMut(DocId, Score) -> Score, ) -> crate::Result<()> { let mut scorer = self.scorer(reader, 1.0)?; - for_each_pruning_scorer(scorer.as_mut(), threshold, callback); + scorer.for_each_pruning(threshold, callback); Ok(()) } } diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 157e237d8..170653724 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -7,7 +7,7 @@ use arc_swap::ArcSwap; pub use warming::Warmer; use self::warming::WarmingState; -use crate::core::searcher::{SearcherGeneration, SearcherInner}; +use crate::core::searcher::{SearcherContext, SearcherGeneration, SearcherInner}; use crate::directory::{Directory, WatchCallback, WatchHandle, META_LOCK}; use crate::store::DOCSTORE_CACHE_CAPACITY; use crate::{Index, Inventory, Searcher, SegmentReader, TrackedObject}; @@ -189,19 +189,28 @@ impl InnerIndexReader { /// /// This function 
acquires a lock to prevent GC from removing files /// as we are opening our index. - fn open_segment_readers(index: &Index) -> crate::Result> { + fn open_segment_readers(index: &Index) -> crate::Result>> { // Prevents segment files from getting deleted while we are in the process of opening them let _meta_lock = index.directory().acquire_lock(&META_LOCK)?; let searchable_segments = index.searchable_segments()?; let segment_readers = searchable_segments .iter() - .map(SegmentReader::open) + .map(|segment| { + let reader = + crate::TantivySegmentReader::open_with_custom_alive_set_from_directory( + segment.index().directory(), + segment.meta(), + segment.schema(), + None, + )?; + Ok(Arc::new(reader) as Arc) + }) .collect::>()?; Ok(segment_readers) } fn track_segment_readers_in_inventory( - segment_readers: &[SegmentReader], + segment_readers: &[Arc], searcher_generation_counter: &Arc, searcher_generation_inventory: &Inventory, ) -> TrackedObject { @@ -225,10 +234,9 @@ impl InnerIndexReader { searcher_generation_inventory, ); - let schema = index.schema(); + let context = SearcherContext::from_index(index); let searcher = Arc::new(SearcherInner::new( - schema, - index.clone(), + context, segment_readers, searcher_generation, doc_store_cache_num_blocks, diff --git a/src/schema/document/default_document.rs b/src/schema/document/default_document.rs index 915b685aa..c66dafe24 100644 --- a/src/schema/document/default_document.rs +++ b/src/schema/document/default_document.rs @@ -755,10 +755,9 @@ mod tests { doc.add_object(json_field, json_val); let schema = schema_builder.build(); - let json = doc.to_json(&schema); - let actual_json: serde_json::Value = serde_json::from_str(&json).unwrap(); + let actual_json = doc.to_json(&schema); let expected_json: serde_json::Value = serde_json::from_str(json_str).unwrap(); - assert_eq!(actual_json["json"][0], expected_json); + assert_eq!(actual_json["json"], expected_json); } // TODO: Should this be re-added with the serialize method 
diff --git a/src/schema/document/mod.rs b/src/schema/document/mod.rs index 8168ee811..eeab563ac 100644 --- a/src/schema/document/mod.rs +++ b/src/schema/document/mod.rs @@ -247,10 +247,35 @@ pub trait Document: Send + Sync + 'static { /// Encode the doc in JSON. /// /// Encoding a document cannot fail. - fn to_json(&self, schema: &Schema) -> String { + fn to_serialized_json(&self, schema: &Schema) -> String { serde_json::to_string(&self.to_named_doc(schema)) .expect("doc encoding failed. This is a bug") } + + /// Encode the doc in JSON. + /// + /// Encoding a document cannot fail. + /// + /// A field with exactly one value is encoded as that value directly; a field with several + /// values is encoded as a JSON array of those values. + fn to_json(&self, schema: &Schema) -> serde_json::Value { + let mut json_value = serde_json::Value::Object(serde_json::Map::new()); + for (field, field_values) in self.get_sorted_field_values() { + let field_name = schema.get_field_name(field); + let values: Vec = field_values + .into_iter() + .map(|val| OwnedValue::from(val.as_value())) + .collect(); + if values.len() == 1 { + json_value[field_name] = + serde_json::to_value(&values[0]).expect("doc encoding failed. This is a bug"); + } else { + json_value[field_name] = + serde_json::to_value(&values).expect("doc encoding failed.
This is a bug"); + } + } + json_value + } } pub(crate) mod type_codes { diff --git a/src/schema/document/owned_value.rs b/src/schema/document/owned_value.rs index 49a6b1ac7..f4f95bafa 100644 --- a/src/schema/document/owned_value.rs +++ b/src/schema/document/owned_value.rs @@ -475,8 +475,11 @@ mod tests { let schema = schema_builder.build(); let mut doc = TantivyDocument::default(); doc.add_bytes(bytes_field, "this is a test".as_bytes()); - let json_string = doc.to_json(&schema); - assert_eq!(json_string, r#"{"my_bytes":["dGhpcyBpcyBhIHRlc3Q="]}"#); + let json_value = doc.to_json(&schema); + assert_eq!( + json_value, + serde_json::json!({"my_bytes": "dGhpcyBpcyBhIHRlc3Q="}) + ); } #[test] @@ -487,9 +490,8 @@ mod tests { let schema = schema_builder.build(); let mut doc = TantivyDocument::default(); doc.add_bytes(bytes_field, "".as_bytes()); - let json_string = doc.to_json(&schema); - - assert_eq!(json_string, r#"{"my_bytes":[""]}"#); + let json_value = doc.to_json(&schema); + assert_eq!(json_value, serde_json::json!({"my_bytes": ""})); } #[test] @@ -503,10 +505,12 @@ mod tests { bytes_field, "A bigger test I guess\nspanning on multiple lines\nhoping this will work".as_bytes(), ); - let json_string = doc.to_json(&schema); + let json_value = doc.to_json(&schema); assert_eq!( - json_string, - r#"{"my_bytes":["QSBiaWdnZXIgdGVzdCBJIGd1ZXNzCnNwYW5uaW5nIG9uIG11bHRpcGxlIGxpbmVzCmhvcGluZyB0aGlzIHdpbGwgd29yaw=="]}"# + json_value, + serde_json::json!({ + "my_bytes": "QSBiaWdnZXIgdGVzdCBJIGd1ZXNzCnNwYW5uaW5nIG9uIG11bHRpcGxlIGxpbmVzCmhvcGluZyB0aGlzIHdpbGwgd29yaw==" + }) ); } diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index 5c54c956c..8e1251b0f 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -702,7 +702,10 @@ mod tests { let date_time = PrimitiveDateTime::new(naive_date, naive_time); doc.add_date(date_field, DateTime::from_primitive(date_time)); let doc_json = doc.to_json(&schema); - assert_eq!(doc_json, 
r#"{"date":["1982-09-17T13:20:00Z"]}"#); + assert_eq!( + doc_json, + serde_json::json!({"date": "1982-09-17T13:20:00Z"}) + ); } #[test] diff --git a/src/schema/index_record_option.rs b/src/schema/index_record_option.rs index d3adf85a3..8b39dde64 100644 --- a/src/schema/index_record_option.rs +++ b/src/schema/index_record_option.rs @@ -8,7 +8,7 @@ use serde::{Deserialize, Serialize}; /// * describe in the schema the amount of information that should be retained during indexing (See /// [`TextFieldIndexing::set_index_option()`](crate::schema::TextFieldIndexing::set_index_option)) /// * request that a given amount of information to be decoded as one goes through a posting list. -/// (See [`InvertedIndexReader::read_postings()`](crate::InvertedIndexReader::read_postings)) +/// (See [`DynInvertedIndexReader::read_postings()`](crate::DynInvertedIndexReader::read_postings)) #[derive( Clone, Copy, Debug, PartialEq, PartialOrd, Ord, Eq, Hash, Serialize, Deserialize, Default, )] diff --git a/src/schema/schema.rs b/src/schema/schema.rs index c1d22c0ba..c9468f075 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -590,7 +590,8 @@ mod tests { }"#; let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap(); - let doc_serdeser = TantivyDocument::parse_json(&schema, &doc.to_json(&schema)).unwrap(); + let doc_serdeser = + TantivyDocument::parse_json(&schema, &doc.to_serialized_json(&schema)).unwrap(); assert_eq!(doc, doc_serdeser); } @@ -605,8 +606,8 @@ mod tests { "ip": "127.0.0.1" }"#; let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap(); - let value: serde_json::Value = serde_json::from_str(&doc.to_json(&schema)).unwrap(); - assert_eq!(value["ip"][0], "127.0.0.1"); + let value = doc.to_json(&schema); + assert_eq!(value["ip"], "127.0.0.1"); // Special case IpV6 loopback. 
We don't want to map that to IPv4 let doc_json = r#"{ @@ -614,8 +615,8 @@ mod tests { }"#; let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap(); - let value: serde_json::Value = serde_json::from_str(&doc.to_json(&schema)).unwrap(); - assert_eq!(value["ip"][0], "::1"); + let value = doc.to_json(&schema); + assert_eq!(value["ip"], "::1"); // testing ip address of every router in the world let doc_json = r#"{ @@ -623,8 +624,8 @@ mod tests { }"#; let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap(); - let value: serde_json::Value = serde_json::from_str(&doc.to_json(&schema)).unwrap(); - assert_eq!(value["ip"][0], "192.168.0.1"); + let value = doc.to_json(&schema); + assert_eq!(value["ip"], "192.168.0.1"); } #[test] diff --git a/src/schema/term.rs b/src/schema/term.rs index e1e4f02e4..24a53844b 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -135,7 +135,7 @@ impl Term { /// Use `clear_with_field_and_type` in that case. /// /// Sets field and the type. - pub(crate) fn set_field_and_type(&mut self, field: Field, typ: Type) { + pub fn set_field_and_type(&mut self, field: Field, typ: Type) { assert!(self.is_empty()); self.field = field; self.serialized_value_bytes[0] = typ.to_code(); diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index ee61b534a..4b5867bc9 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -422,7 +422,7 @@ impl SnippetGenerator { terms_text.insert(term_str.to_string(), score); } } - let tokenizer = searcher.index().tokenizer_for_field(field)?; + let tokenizer = searcher.tokenizer_for_field(field)?; Ok(SnippetGenerator { terms_text, tokenizer, diff --git a/src/space_usage/mod.rs b/src/space_usage/mod.rs index 4c1f9c76a..3bd3e70c4 100644 --- a/src/space_usage/mod.rs +++ b/src/space_usage/mod.rs @@ -34,7 +34,8 @@ pub struct SearcherSpaceUsage { } impl SearcherSpaceUsage { - pub(crate) fn new() -> SearcherSpaceUsage { + /// Creates an empty searcher space-usage accumulator. 
+ pub fn new() -> SearcherSpaceUsage { SearcherSpaceUsage { segments: Vec::new(), total: Default::default(), @@ -80,7 +81,8 @@ pub struct SegmentSpaceUsage { impl SegmentSpaceUsage { #[expect(clippy::too_many_arguments)] - pub(crate) fn new( + /// Creates a segment space-usage summary from all major segment components. + pub fn new( num_docs: u32, termdict: PerFieldSpaceUsage, postings: PerFieldSpaceUsage, @@ -210,7 +212,7 @@ impl StoreSpaceUsage { /// /// A field can appear with a single index (typically 0) or with multiple indexes. /// Multiple indexes are used to handle variable length things, where -#[derive(Clone, Debug, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize, Default)] pub struct PerFieldSpaceUsage { fields: BTreeMap, total: ByteCount, diff --git a/src/store/index/mod.rs b/src/store/index/mod.rs index 13c252e92..80e15b8e9 100644 --- a/src/store/index/mod.rs +++ b/src/store/index/mod.rs @@ -48,7 +48,7 @@ mod tests { use crate::indexer::NoMergePolicy; use crate::schema::{SchemaBuilder, STORED, TEXT}; use crate::store::index::Checkpoint; - use crate::{DocAddress, DocId, Index, IndexWriter, TantivyDocument, Term}; + use crate::{DocAddress, DocId, Index, IndexWriter, Term}; #[test] fn test_skip_index_empty() -> io::Result<()> { @@ -149,7 +149,7 @@ mod tests { let searcher = reader.searcher(); assert_eq!(searcher.num_docs(), 30); for i in 0..searcher.num_docs() as u32 { - let _doc = searcher.doc::(DocAddress::new(0u32, i))?; + let _doc = searcher.doc(DocAddress::new(0u32, i))?; } Ok(()) } diff --git a/src/store/mod.rs b/src/store/mod.rs index cccf4d8f9..15dafb926 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -22,7 +22,7 @@ //! //! # Usage //! -//! Most users should not access the `StoreReader` directly +//! Most users should not access the `TantivyStoreReader` directly //! and should rely on either //! //! 
- at the segment level, the [`SegmentReader`'s `doc` @@ -38,7 +38,7 @@ mod writer; pub use self::compressors::{Compressor, ZstdCompressor}; pub use self::decompressors::Decompressor; -pub use self::reader::{CacheStats, StoreReader}; +pub use self::reader::{CacheStats, StoreReader, TantivyStoreReader}; pub(crate) use self::reader::{DocStoreVersion, DOCSTORE_CACHE_CAPACITY}; pub use self::writer::StoreWriter; mod store_compressor; @@ -117,11 +117,11 @@ pub(crate) mod tests { write_lorem_ipsum_store(store_wrt, NUM_DOCS, Compressor::default(), BLOCK_SIZE, true); let field_title = schema.get_field("title").unwrap(); let store_file = directory.open_read(path)?; - let store = StoreReader::open(store_file, 10)?; + let store = TantivyStoreReader::open(store_file, 10)?; for i in 0..NUM_DOCS as u32 { assert_eq!( store - .get::(i)? + .get(i)? .get_first(field_title) .unwrap() .as_value() @@ -169,11 +169,11 @@ pub(crate) mod tests { write_lorem_ipsum_store(store_wrt, NUM_DOCS, compressor, blocksize, separate_thread); let field_title = schema.get_field("title").unwrap(); let store_file = directory.open_read(path)?; - let store = StoreReader::open(store_file, 10)?; + let store = TantivyStoreReader::open(store_file, 10)?; for i in 0..NUM_DOCS as u32 { assert_eq!( *store - .get::(i)? + .get(i)? 
.get_first(field_title) .unwrap() .as_str() @@ -247,9 +247,10 @@ pub(crate) mod tests { let searcher = index.reader()?.searcher(); let reader = searcher.segment_reader(0); let store = reader.get_store_reader(10)?; - for doc in store.iter::(reader.alive_bitset()) { + for doc_id in reader.doc_ids_alive() { + let doc = store.get(doc_id)?; assert_eq!( - *doc?.get_first(text_field).unwrap().as_str().unwrap(), + *doc.get_first(text_field).unwrap().as_str().unwrap(), "deletemenot".to_string() ); } @@ -280,13 +281,6 @@ pub(crate) mod tests { } assert!(index_writer.commit().is_ok()); } - assert_eq!( - index.reader().unwrap().searcher().segment_readers()[0] - .get_store_reader(10) - .unwrap() - .decompressor(), - Decompressor::Lz4 - ); // Change compressor, this disables stacking on merging let index_settings = index.settings_mut(); index_settings.docstore_compression = Compressor::Zstd(Default::default()); @@ -305,17 +299,13 @@ pub(crate) mod tests { let reader = searcher.segment_readers().iter().last().unwrap(); let store = reader.get_store_reader(10).unwrap(); - for doc in store - .iter::(reader.alive_bitset()) - .take(50) - { + for doc_id in reader.doc_ids_alive().take(50) { + let doc = store.get(doc_id)?; assert_eq!( - *doc?.get_first(text_field).and_then(|v| v.as_str()).unwrap(), + *doc.get_first(text_field).and_then(|v| v.as_str()).unwrap(), LOREM.to_string() ); } - assert_eq!(store.decompressor(), Decompressor::Zstd); - Ok(()) } @@ -354,7 +344,12 @@ pub(crate) mod tests { assert_eq!(searcher.segment_readers().len(), 1); let reader = searcher.segment_readers().iter().last().unwrap(); let store = reader.get_store_reader(10)?; - assert_eq!(store.block_checkpoints().count(), 1); + let mut num_docs = 0; + for doc_id in reader.doc_ids_alive() { + store.get(doc_id)?; + num_docs += 1; + } + assert_eq!(num_docs, 5); Ok(()) } } @@ -368,7 +363,7 @@ mod bench { use super::tests::write_lorem_ipsum_store; use crate::directory::{Directory, RamDirectory}; - use 
crate::store::{Compressor, StoreReader}; + use crate::store::{Compressor, TantivyStoreReader}; use crate::TantivyDocument; #[bench] @@ -400,7 +395,7 @@ mod bench { true, ); let store_file = directory.open_read(path).unwrap(); - let store = StoreReader::open(store_file, 10).unwrap(); + let store = TantivyStoreReader::open(store_file, 10).unwrap(); b.iter(|| store.iter::(None).collect::>()); } } diff --git a/src/store/reader.rs b/src/store/reader.rs index a4105abec..75824d980 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -5,17 +5,20 @@ use std::num::NonZeroUsize; use std::ops::{AddAssign, Range}; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, Mutex}; +#[cfg(feature = "quickwit")] +use std::{future::Future, pin::Pin}; use common::{BinarySerializable, OwnedBytes}; use lru::LruCache; use super::footer::DocStoreFooter; use super::index::SkipIndex; -use super::Decompressor; +use super::{Compressor, Decompressor, StoreWriter}; use crate::directory::FileSlice; use crate::error::DataCorruption; use crate::fastfield::AliveBitSet; use crate::schema::document::{BinaryDocumentDeserializer, DocumentDeserialize}; +use crate::schema::TantivyDocument; use crate::space_usage::StoreSpaceUsage; use crate::store::index::Checkpoint; use crate::DocId; @@ -26,9 +29,33 @@ pub(crate) const DOCSTORE_CACHE_CAPACITY: usize = 100; type Block = OwnedBytes; +/// Object-safe API for reading documents from the store. +pub trait StoreReader: Send + Sync { + /// Reads and deserializes a given document. + fn get(&self, doc_id: DocId) -> crate::Result; + + /// Returns the cache hit and miss statistics of this reader. + fn cache_stats(&self) -> CacheStats; + + /// Merges this store into `store_writer`, filtering deletes via `alive_bitset`. + fn merge_into( + &self, + store_writer: &mut StoreWriter, + alive_bitset: Option<&AliveBitSet>, + ) -> crate::Result<()>; + + /// Fetches a document asynchronously. 
+ #[cfg(feature = "quickwit")] + fn get_async<'a>( + &'a self, + doc_id: DocId, + executor: &'a Executor, + ) -> Pin> + 'a>>; +} + /// The format version of the document store. #[derive(Clone, Copy, Debug, PartialEq, PartialOrd)] -pub(crate) enum DocStoreVersion { +pub enum DocStoreVersion { V1 = 1, V2 = 2, } @@ -60,7 +87,7 @@ impl BinarySerializable for DocStoreVersion { } /// Reads document off tantivy's [`Store`](./index.html) -pub struct StoreReader { +pub struct TantivyStoreReader { decompressor: Decompressor, doc_store_version: DocStoreVersion, data: FileSlice, @@ -119,7 +146,7 @@ impl BlockCache { } #[derive(Debug, Default)] -/// CacheStats for the `StoreReader`. +/// CacheStats for the `TantivyStoreReader`. pub struct CacheStats { /// The number of entries in the cache pub num_entries: usize, @@ -149,12 +176,12 @@ impl Sum for CacheStats { } } -impl StoreReader { +impl TantivyStoreReader { /// Opens a store reader /// /// `cache_num_blocks` sets the number of decompressed blocks to be cached in an LRU. 
/// The size of blocks is configurable, this should be reflexted in the - pub fn open(store_file: FileSlice, cache_num_blocks: usize) -> io::Result { + pub fn open(store_file: FileSlice, cache_num_blocks: usize) -> io::Result { let (footer, data_and_offset) = DocStoreFooter::extract_footer(store_file)?; let (data_file, offset_index_file) = data_and_offset.split(footer.offset as usize); @@ -162,7 +189,7 @@ impl StoreReader { let space_usage = StoreSpaceUsage::new(data_file.num_bytes(), offset_index_file.num_bytes()); let skip_index = SkipIndex::open(index_data); - Ok(StoreReader { + Ok(TantivyStoreReader { decompressor: footer.decompressor, doc_store_version: footer.doc_store_version, data: data_file, @@ -177,14 +204,10 @@ impl StoreReader { }) } - pub(crate) fn block_checkpoints(&self) -> impl Iterator + '_ { + fn block_checkpoints(&self) -> impl Iterator + '_ { self.skip_index.checkpoints() } - pub(crate) fn decompressor(&self) -> Decompressor { - self.decompressor - } - /// Returns the cache hit and miss statistics of the store reader. pub(crate) fn cache_stats(&self) -> CacheStats { self.cache.stats() @@ -204,6 +227,26 @@ impl StoreReader { self.data.read_bytes() } + fn can_stack_for_merge(&self, target_compressor: Compressor) -> bool { + // If there is not enough data in the store, we avoid stacking in order to + // avoid creating many small blocks in the doc store. 
+ // https://github.com/quickwit-oss/tantivy/issues/1053 + const MIN_BLOCKS_REQUIRED: usize = 6; + self.decompressor == target_compressor.into() + && self + .block_checkpoints() + // to not count all blocks in the store + .take(MIN_BLOCKS_REQUIRED + 1) + .count() + >= MIN_BLOCKS_REQUIRED + } + + fn block_ranges(&self) -> Vec<(Range, Range)> { + self.block_checkpoints() + .map(|checkpoint| (checkpoint.doc_range, checkpoint.byte_range)) + .collect() + } + fn get_compressed_block(&self, checkpoint: &Checkpoint) -> io::Result { self.data.slice(checkpoint.byte_range.clone()).read_bytes() } @@ -236,25 +279,15 @@ impl StoreReader { /// /// It should not be called to score documents /// for instance. - pub fn get(&self, doc_id: DocId) -> crate::Result { - let mut doc_bytes = self.get_document_bytes(doc_id)?; + pub fn get(&self, doc_id: DocId) -> crate::Result { + let checkpoint = self.block_checkpoint(doc_id)?; + let block = self.read_block(&checkpoint)?; + let mut doc_bytes = Self::get_document_bytes_from_block(block, doc_id, &checkpoint)?; let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes, self.doc_store_version) .map_err(crate::TantivyError::from)?; - D::deserialize(deserializer).map_err(crate::TantivyError::from) - } - - /// Returns raw bytes of a given document. - /// - /// Calling `.get(doc)` is relatively costly as it requires - /// decompressing a compressed block. The store utilizes a LRU cache, - /// so accessing docs from the same compressed block should be faster. - /// For that reason a store reader should be kept and reused. - pub fn get_document_bytes(&self, doc_id: DocId) -> crate::Result { - let checkpoint = self.block_checkpoint(doc_id)?; - let block = self.read_block(&checkpoint)?; - Self::get_document_bytes_from_block(block, doc_id, &checkpoint) + TantivyDocument::deserialize(deserializer).map_err(crate::TantivyError::from) } /// Advanced API. 
@@ -354,6 +387,44 @@ impl StoreReader { } } +impl StoreReader for TantivyStoreReader { + fn get(&self, doc_id: DocId) -> crate::Result { + TantivyStoreReader::get(self, doc_id) + } + + fn cache_stats(&self) -> CacheStats { + TantivyStoreReader::cache_stats(self) + } + + fn merge_into( + &self, + store_writer: &mut StoreWriter, + alive_bitset: Option<&AliveBitSet>, + ) -> crate::Result<()> { + if alive_bitset.is_some() || !self.can_stack_for_merge(store_writer.compressor()) { + for doc_bytes_res in self.iter_raw(alive_bitset) { + let doc_bytes = doc_bytes_res?; + store_writer.store_bytes(&doc_bytes)?; + } + Ok(()) + } else { + let block_data = self.block_data()?; + let block_ranges = self.block_ranges(); + store_writer.stack_parts(block_data, block_ranges)?; + Ok(()) + } + } + + #[cfg(feature = "quickwit")] + fn get_async<'a>( + &'a self, + doc_id: DocId, + executor: &'a Executor, + ) -> Pin> + 'a>> { + Box::pin(TantivyStoreReader::get_async(self, doc_id, executor)) + } +} + fn block_read_index(block: &[u8], doc_pos: u32) -> crate::Result> { let doc_pos = doc_pos as usize; let size_of_u32 = std::mem::size_of::(); @@ -377,7 +448,7 @@ fn block_read_index(block: &[u8], doc_pos: u32) -> crate::Result> { } #[cfg(feature = "quickwit")] -impl StoreReader { +impl TantivyStoreReader { /// Advanced API. /// /// In most cases use [`get_async`](Self::get_async) @@ -413,7 +484,7 @@ impl StoreReader { } /// Reads raw bytes of a given document asynchronously. - pub async fn get_document_bytes_async( + async fn get_document_bytes_async( &self, doc_id: DocId, executor: &Executor, @@ -424,17 +495,17 @@ impl StoreReader { } /// Fetches a document asynchronously. Async version of [`get`](Self::get). 
- pub async fn get_async( + pub async fn get_async( &self, doc_id: DocId, executor: &Executor, - ) -> crate::Result { + ) -> crate::Result { let mut doc_bytes = self.get_document_bytes_async(doc_id, executor).await?; let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes, self.doc_store_version) .map_err(crate::TantivyError::from)?; - D::deserialize(deserializer).map_err(crate::TantivyError::from) + TantivyDocument::deserialize(deserializer).map_err(crate::TantivyError::from) } } @@ -468,7 +539,7 @@ mod tests { let schema = write_lorem_ipsum_store(writer, 500, Compressor::None, BLOCK_SIZE, true); let title = schema.get_field("title").unwrap(); let store_file = directory.open_read(path)?; - let store = StoreReader::open(store_file, DOCSTORE_CACHE_CAPACITY)?; + let store = TantivyStoreReader::open(store_file, DOCSTORE_CACHE_CAPACITY)?; assert_eq!(store.cache.len(), 0); assert_eq!(store.cache_stats().cache_hits, 0); diff --git a/src/store/store_compressor.rs b/src/store/store_compressor.rs index 20211b25a..10f78f1ea 100644 --- a/src/store/store_compressor.rs +++ b/src/store/store_compressor.rs @@ -1,15 +1,16 @@ use std::io::Write; +use std::ops::Range; use std::sync::mpsc::{sync_channel, Receiver, SyncSender}; use std::thread::JoinHandle; use std::{io, thread}; -use common::{BinarySerializable, CountingWriter, TerminatingWrite}; +use common::{BinarySerializable, CountingWriter, OwnedBytes, TerminatingWrite}; use super::DOC_STORE_VERSION; use crate::directory::WritePtr; use crate::store::footer::DocStoreFooter; use crate::store::index::{Checkpoint, SkipIndexBuilder}; -use crate::store::{Compressor, Decompressor, StoreReader}; +use crate::store::{Compressor, Decompressor}; use crate::DocId; pub struct BlockCompressor(BlockCompressorVariants); @@ -54,16 +55,19 @@ impl BlockCompressor { Ok(()) } - pub fn stack_reader(&mut self, store_reader: StoreReader) -> io::Result<()> { + pub fn stack_parts( + &mut self, + block_data: OwnedBytes, + block_ranges: 
Vec<(Range, Range)>, + ) -> io::Result<()> { match &mut self.0 { BlockCompressorVariants::SameThread(block_compressor) => { - block_compressor.stack(store_reader)?; + block_compressor.stack_parts(block_data, block_ranges) } BlockCompressorVariants::DedicatedThread(different_thread_block_compressor) => { - different_thread_block_compressor.stack_reader(store_reader)?; + different_thread_block_compressor.stack_parts(block_data, block_ranges) } } - Ok(()) } pub fn close(self) -> io::Result<()> { @@ -122,22 +126,24 @@ impl BlockCompressorImpl { /// This method is an optimization compared to iterating over the documents /// in the store and adding them one by one, as the store's data will /// not be decompressed and then recompressed. - fn stack(&mut self, store_reader: StoreReader) -> io::Result<()> { + fn stack_parts( + &mut self, + block_data: OwnedBytes, + block_ranges: Vec<(Range, Range)>, + ) -> io::Result<()> { let doc_shift = self.first_doc_in_block; let start_shift = self.writer.written_bytes() as usize; // just bulk write all of the block of the given reader. - self.writer - .write_all(store_reader.block_data()?.as_slice())?; + self.writer.write_all(block_data.as_slice())?; // concatenate the index of the `store_reader`, after translating // its start doc id and its start file offset. 
- for mut checkpoint in store_reader.block_checkpoints() { - checkpoint.doc_range.start += doc_shift; - checkpoint.doc_range.end += doc_shift; - checkpoint.byte_range.start += start_shift; - checkpoint.byte_range.end += start_shift; - self.register_checkpoint(checkpoint); + for (doc_range, byte_range) in block_ranges { + self.register_checkpoint(Checkpoint { + doc_range: (doc_range.start + doc_shift)..(doc_range.end + doc_shift), + byte_range: (byte_range.start + start_shift)..(byte_range.end + start_shift), + }); } Ok(()) } @@ -161,7 +167,10 @@ enum BlockCompressorMessage { block_data: Vec, num_docs_in_block: u32, }, - Stack(StoreReader), + Stack { + block_data: OwnedBytes, + block_ranges: Vec<(Range, Range)>, + }, } struct DedicatedThreadBlockCompressorImpl { @@ -187,8 +196,11 @@ impl DedicatedThreadBlockCompressorImpl { block_compressor .compress_block_and_write(&block_data[..], num_docs_in_block)?; } - BlockCompressorMessage::Stack(store_reader) => { - block_compressor.stack(store_reader)?; + BlockCompressorMessage::Stack { + block_data, + block_ranges, + } => { + block_compressor.stack_parts(block_data, block_ranges)?; } } } @@ -208,8 +220,15 @@ impl DedicatedThreadBlockCompressorImpl { }) } - fn stack_reader(&mut self, store_reader: StoreReader) -> io::Result<()> { - self.send(BlockCompressorMessage::Stack(store_reader)) + fn stack_parts( + &mut self, + block_data: OwnedBytes, + block_ranges: Vec<(Range, Range)>, + ) -> io::Result<()> { + self.send(BlockCompressorMessage::Stack { + block_data, + block_ranges, + }) } fn send(&mut self, msg: BlockCompressorMessage) -> io::Result<()> { diff --git a/src/store/writer.rs b/src/store/writer.rs index ef514accc..bb4be2484 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -1,10 +1,12 @@ use std::io; +use std::ops::Range; +use std::sync::Arc; -use common::BinarySerializable; +use common::{BinarySerializable, OwnedBytes}; use super::compressors::Compressor; -use super::StoreReader; use 
crate::directory::WritePtr; +use crate::index::SegmentReader; use crate::schema::document::{BinaryDocumentSerializer, Document}; use crate::schema::Schema; use crate::store::store_compressor::BlockCompressor; @@ -119,14 +121,24 @@ impl StoreWriter { Ok(()) } - /// Stacks a store reader on top of the documents written so far. - /// This method is an optimization compared to iterating over the documents - /// in the store and adding them one by one, as the store's data will - /// not be decompressed and then recompressed. - pub fn stack(&mut self, store_reader: StoreReader) -> io::Result<()> { - // We flush the current block first before stacking + pub(crate) fn stack_parts( + &mut self, + block_data: OwnedBytes, + block_ranges: Vec<(Range, Range)>, + ) -> io::Result<()> { self.send_current_block_to_compressor()?; - self.block_compressor.stack_reader(store_reader)?; + self.block_compressor.stack_parts(block_data, block_ranges) + } + + pub(crate) fn merge_segment_readers( + &mut self, + segment_readers: &[Arc], + ) -> crate::Result<()> { + const MERGE_DOCSTORE_CACHE_NUM_BLOCKS: usize = 1; + for segment_reader in segment_readers { + let store_reader = segment_reader.get_store_reader(MERGE_DOCSTORE_CACHE_NUM_BLOCKS)?; + store_reader.merge_into(self, segment_reader.alive_bitset())?; + } Ok(()) }