diff --git a/Cargo.toml b/Cargo.toml index ee308b842..1815edacb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -201,3 +201,8 @@ harness = false [[bench]] name = "regex_all_terms" harness = false + +[[bench]] +name = "fill_bitset" +harness = false + diff --git a/benches/fill_bitset.rs b/benches/fill_bitset.rs new file mode 100644 index 000000000..0f2c7ea53 --- /dev/null +++ b/benches/fill_bitset.rs @@ -0,0 +1,106 @@ +use binggan::{black_box, BenchRunner, PeakMemAlloc, INSTRUMENTED_SYSTEM}; +use common::BitSet; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; +use tantivy::postings::BlockSegmentPostings; +use tantivy::schema::*; +use tantivy::{doc, DocSet as _, Index, InvertedIndexReader as _, TantivyDocument}; + +#[global_allocator] +pub static GLOBAL: &PeakMemAlloc = &INSTRUMENTED_SYSTEM; + +fn main() { + let index = build_test_index(); + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + let segment_reader = &searcher.segment_readers()[0]; + let text_field = index.schema().get_field("text").unwrap(); + let inverted_index = segment_reader.inverted_index(text_field).unwrap(); + let max_doc = segment_reader.max_doc(); + + let term = Term::from_field_text(text_field, "hello"); + let term_info = inverted_index.get_term_info(&term).unwrap().unwrap(); + + let mut runner = BenchRunner::new(); + runner.set_name("fill_bitset"); + + let mut group = runner.new_group(); + { + let inverted_index = &inverted_index; + let term_info = &term_info; + // This is the path used by queries (AutomatonWeight, RangeQuery, etc.) + // It dispatches via DynInvertedIndexReader::fill_bitset_from_terminfo. 
+ group.register("fill_bitset_from_terminfo (via trait)", move |_| { + let mut bitset = BitSet::with_max_value(max_doc); + inverted_index + .fill_bitset_from_terminfo(term_info, &mut bitset) + .unwrap(); + black_box(bitset); + }); + } + { + let inverted_index = &inverted_index; + let term_info = &term_info; + // This constructs a SegmentPostings via read_docset_from_terminfo and calls fill_bitset. + group.register("read_docset + fill_bitset", move |_| { + let mut postings = inverted_index.read_docset_from_terminfo(term_info).unwrap(); + let mut bitset = BitSet::with_max_value(max_doc); + postings.fill_bitset(&mut bitset); + black_box(bitset); + }); + } + { + let inverted_index = &inverted_index; + let term_info = &term_info; + // This uses BlockSegmentPostings directly, bypassing SegmentPostings entirely. + group.register("BlockSegmentPostings direct", move |_| { + let raw = inverted_index + .read_raw_postings_data(term_info, IndexRecordOption::Basic) + .unwrap(); + let mut block_postings = BlockSegmentPostings::open( + term_info.doc_freq, + raw.postings_data, + raw.record_option, + raw.effective_option, + ) + .unwrap(); + let mut bitset = BitSet::with_max_value(max_doc); + loop { + let docs = block_postings.docs(); + if docs.is_empty() { + break; + } + for &doc in docs { + bitset.insert(doc); + } + block_postings.advance(); + } + black_box(bitset); + }); + } + group.run(); +} + +fn build_test_index() -> Index { + let mut schema_builder = Schema::builder(); + schema_builder.add_text_field("text", TEXT); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema.clone()); + let text_field = schema.get_field("text").unwrap(); + + let mut writer = index.writer::(250_000_000).unwrap(); + let mut rng = StdRng::from_seed([42u8; 32]); + for _ in 0..100_000 { + if rng.random_bool(0.5) { + writer + .add_document(doc!(text_field => "hello world")) + .unwrap(); + } else { + writer + .add_document(doc!(text_field => "goodbye world")) + .unwrap(); + } + } 
+ writer.commit().unwrap(); + index +} diff --git a/benches/str_search_and_get.rs b/benches/str_search_and_get.rs index ffb9768cd..d6daa7852 100644 --- a/benches/str_search_and_get.rs +++ b/benches/str_search_and_get.rs @@ -17,7 +17,6 @@ use rand::rngs::StdRng; use rand::SeedableRng; use tantivy::collector::{Count, DocSetCollector}; use tantivy::query::RangeQuery; -use tantivy::schema::document::TantivyDocument; use tantivy::schema::{Schema, Value, FAST, STORED, STRING}; use tantivy::{doc, Index, ReloadPolicy, Searcher, Term}; @@ -406,7 +405,7 @@ impl FetchAllStringsFromDocTask { for doc_address in docs { // Get the document from the doc store (row store access) - if let Ok(doc) = self.searcher.doc::(doc_address) { + if let Ok(doc) = self.searcher.doc(doc_address) { // Extract string values from the stored field if let Some(field_value) = doc.get_first(str_stored_field) { if let Some(text) = field_value.as_value().as_str() { diff --git a/common/src/bitset.rs b/common/src/bitset.rs index cf719e53a..aa2021165 100644 --- a/common/src/bitset.rs +++ b/common/src/bitset.rs @@ -193,6 +193,8 @@ impl TinySet { #[derive(Clone)] pub struct BitSet { tinysets: Box<[TinySet]>, + // Tracking `len` on every insert/remove adds overhead even when `len()` is never called. + // Consider removing if `len()` usage is rare or not on a hot path. len: u64, max_value: u32, } @@ -252,6 +254,7 @@ impl BitSet { /// Removes all elements from the `BitSet`. pub fn clear(&mut self) { + self.len = 0; for tinyset in self.tinysets.iter_mut() { *tinyset = TinySet::empty(); } @@ -271,6 +274,11 @@ impl BitSet { } } + /// Estimate the heap memory consumption of this `BitSet` in bytes. + pub fn get_memory_consumption(&self) -> usize { + self.tinysets.len() * std::mem::size_of::() + } + /// Returns the number of elements in the `BitSet`. 
#[inline] pub fn len(&self) -> usize { @@ -314,6 +322,9 @@ impl BitSet { .map(|delta_bucket| bucket + delta_bucket as u32) } + /// Returns the maximum number of elements in the bitset. + /// + /// Warning: The largest element the bitset can contain is `max_value - 1`. #[inline] pub fn max_value(&self) -> u32 { self.max_value diff --git a/examples/custom_collector.rs b/examples/custom_collector.rs index 29b606930..355e134ba 100644 --- a/examples/custom_collector.rs +++ b/examples/custom_collector.rs @@ -70,7 +70,7 @@ impl Collector for StatsCollector { fn for_segment( &self, _segment_local_id: u32, - segment_reader: &SegmentReader, + segment_reader: &dyn SegmentReader, ) -> tantivy::Result { let fast_field_reader = segment_reader.fast_fields().u64(&self.field)?; Ok(StatsSegmentCollector { diff --git a/examples/date_time_field.rs b/examples/date_time_field.rs index a5da06c9c..530e5ddf1 100644 --- a/examples/date_time_field.rs +++ b/examples/date_time_field.rs @@ -60,7 +60,7 @@ fn main() -> tantivy::Result<()> { let count_docs = searcher.search(&*query, &TopDocs::with_limit(4).order_by_score())?; assert_eq!(count_docs.len(), 1); for (_score, doc_address) in count_docs { - let retrieved_doc = searcher.doc::(doc_address)?; + let retrieved_doc = searcher.doc(doc_address)?; assert!(retrieved_doc .get_first(occurred_at) .unwrap() diff --git a/examples/faceted_search_with_tweaked_score.rs b/examples/faceted_search_with_tweaked_score.rs index d21a1c3d4..84eeb060b 100644 --- a/examples/faceted_search_with_tweaked_score.rs +++ b/examples/faceted_search_with_tweaked_score.rs @@ -65,7 +65,7 @@ fn main() -> tantivy::Result<()> { ); let top_docs_by_custom_score = // Call TopDocs with a custom tweak score - TopDocs::with_limit(2).tweak_score(move |segment_reader: &SegmentReader| { + TopDocs::with_limit(2).tweak_score(move |segment_reader: &dyn SegmentReader| { let ingredient_reader = segment_reader.facet_reader("ingredient").unwrap(); let facet_dict = 
ingredient_reader.facet_dict(); @@ -91,7 +91,7 @@ fn main() -> tantivy::Result<()> { .iter() .map(|(_, doc_id)| { searcher - .doc::(*doc_id) + .doc(*doc_id) .unwrap() .get_first(title) .and_then(|v| v.as_str().map(|el| el.to_string())) diff --git a/examples/iterating_docs_and_positions.rs b/examples/iterating_docs_and_positions.rs index 36bc4371c..abee516cd 100644 --- a/examples/iterating_docs_and_positions.rs +++ b/examples/iterating_docs_and_positions.rs @@ -91,46 +91,10 @@ fn main() -> tantivy::Result<()> { } } - // A `Term` is a text token associated with a field. - // Let's go through all docs containing the term `title:the` and access their position - let term_the = Term::from_field_text(title, "the"); - - // Some other powerful operations (especially `.skip_to`) may be useful to consume these + // Some other powerful operations (especially `.seek`) may be useful to consume these // posting lists rapidly. // You can check for them in the [`DocSet`](https://docs.rs/tantivy/~0/tantivy/trait.DocSet.html) trait // and the [`Postings`](https://docs.rs/tantivy/~0/tantivy/trait.Postings.html) trait - // Also, for some VERY specific high performance use case like an OLAP analysis of logs, - // you can get better performance by accessing directly the blocks of doc ids. - for segment_reader in searcher.segment_readers() { - // A segment contains different data structure. - // Inverted index stands for the combination of - // - the term dictionary - // - the inverted lists associated with each terms and their positions - let inverted_index = segment_reader.inverted_index(title)?; - - // This segment posting object is like a cursor over the documents matching the term. - // The `IndexRecordOption` arguments tells tantivy we will be interested in both term - // frequencies and positions. - // - // If you don't need all this information, you may get better performance by decompressing - // less information. 
- if let Some(mut block_segment_postings) = - inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic)? - { - loop { - let docs = block_segment_postings.docs(); - if docs.is_empty() { - break; - } - // Once again these docs MAY contains deleted documents as well. - let docs = block_segment_postings.docs(); - // Prints `Docs [0, 2].` - println!("Docs {docs:?}"); - block_segment_postings.advance(); - } - } - } - Ok(()) } diff --git a/examples/phrase_prefix_search.rs b/examples/phrase_prefix_search.rs index e2e1922cb..81b754fa5 100644 --- a/examples/phrase_prefix_search.rs +++ b/examples/phrase_prefix_search.rs @@ -67,7 +67,7 @@ fn main() -> Result<()> { let mut titles = top_docs .into_iter() .map(|(_score, doc_address)| { - let doc = searcher.doc::(doc_address)?; + let doc = searcher.doc(doc_address)?; let title = doc .get_first(title) .and_then(|v| v.as_str()) diff --git a/examples/snippet.rs b/examples/snippet.rs index 04edee82f..dd5e55d57 100644 --- a/examples/snippet.rs +++ b/examples/snippet.rs @@ -55,7 +55,7 @@ fn main() -> tantivy::Result<()> { let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?; for (score, doc_address) in top_docs { - let doc = searcher.doc::(doc_address)?; + let doc = searcher.doc(doc_address)?; let snippet = snippet_generator.snippet_from_doc(&doc); println!("Document score {score}:"); println!("title: {}", doc.get_first(title).unwrap().as_str().unwrap()); diff --git a/examples/warmer.rs b/examples/warmer.rs index c7543114a..53f2f5ffa 100644 --- a/examples/warmer.rs +++ b/examples/warmer.rs @@ -43,7 +43,7 @@ impl DynamicPriceColumn { } } - pub fn price_for_segment(&self, segment_reader: &SegmentReader) -> Option>> { + pub fn price_for_segment(&self, segment_reader: &dyn SegmentReader) -> Option>> { let segment_key = (segment_reader.segment_id(), segment_reader.delete_opstamp()); self.price_cache.read().unwrap().get(&segment_key).cloned() } @@ -157,7 +157,7 @@ fn main() -> tantivy::Result<()> { let 
query = query_parser.parse_query("cooking")?; let searcher = reader.searcher(); - let score_by_price = move |segment_reader: &SegmentReader| { + let score_by_price = move |segment_reader: &dyn SegmentReader| { let price = price_dynamic_column .price_for_segment(segment_reader) .unwrap(); diff --git a/src/aggregation/accessor_helpers.rs b/src/aggregation/accessor_helpers.rs index fa51041e4..22e13ac16 100644 --- a/src/aggregation/accessor_helpers.rs +++ b/src/aggregation/accessor_helpers.rs @@ -57,7 +57,7 @@ pub(crate) fn get_numeric_or_date_column_types() -> &'static [ColumnType] { /// Get fast field reader or empty as default. pub(crate) fn get_ff_reader( - reader: &SegmentReader, + reader: &dyn SegmentReader, field_name: &str, allowed_column_types: Option<&[ColumnType]>, ) -> crate::Result<(columnar::Column, ColumnType)> { @@ -74,7 +74,7 @@ pub(crate) fn get_ff_reader( } pub(crate) fn get_dynamic_columns( - reader: &SegmentReader, + reader: &dyn SegmentReader, field_name: &str, ) -> crate::Result> { let ff_fields = reader.fast_fields().dynamic_column_handles(field_name)?; @@ -90,7 +90,7 @@ pub(crate) fn get_dynamic_columns( /// /// Is guaranteed to return at least one column. pub(crate) fn get_all_ff_reader_or_empty( - reader: &SegmentReader, + reader: &dyn SegmentReader, field_name: &str, allowed_column_types: Option<&[ColumnType]>, fallback_type: ColumnType, diff --git a/src/aggregation/agg_data.rs b/src/aggregation/agg_data.rs index ffe812129..15db902ab 100644 --- a/src/aggregation/agg_data.rs +++ b/src/aggregation/agg_data.rs @@ -520,7 +520,7 @@ impl AggKind { /// Build AggregationsData by walking the request tree. 
pub(crate) fn build_aggregations_data_from_req( aggs: &Aggregations, - reader: &SegmentReader, + reader: &dyn SegmentReader, segment_ordinal: SegmentOrdinal, context: AggContextParams, ) -> crate::Result { @@ -540,7 +540,7 @@ pub(crate) fn build_aggregations_data_from_req( fn build_nodes( agg_name: &str, req: &Aggregation, - reader: &SegmentReader, + reader: &dyn SegmentReader, segment_ordinal: SegmentOrdinal, data: &mut AggregationsSegmentCtx, is_top_level: bool, @@ -787,7 +787,7 @@ fn build_nodes( let idx_in_req_data = data.push_filter_req_data(FilterAggReqData { name: agg_name.to_string(), req: filter_req.clone(), - segment_reader: reader.clone(), + segment_reader: reader.clone_arc(), evaluator, matching_docs_buffer, is_top_level, @@ -804,7 +804,7 @@ fn build_nodes( fn build_composite_node( agg_name: &str, - reader: &SegmentReader, + reader: &dyn SegmentReader, _segment_ordinal: SegmentOrdinal, data: &mut AggregationsSegmentCtx, sub_aggs: &Aggregations, @@ -833,7 +833,7 @@ fn build_composite_node( fn build_children( aggs: &Aggregations, - reader: &SegmentReader, + reader: &dyn SegmentReader, segment_ordinal: SegmentOrdinal, data: &mut AggregationsSegmentCtx, ) -> crate::Result> { @@ -852,7 +852,7 @@ fn build_children( } fn get_term_agg_accessors( - reader: &SegmentReader, + reader: &dyn SegmentReader, field_name: &str, missing: &Option, ) -> crate::Result, ColumnType)>> { @@ -905,7 +905,7 @@ fn build_terms_or_cardinality_nodes( agg_name: &str, field_name: &str, missing: &Option, - reader: &SegmentReader, + reader: &dyn SegmentReader, segment_ordinal: SegmentOrdinal, data: &mut AggregationsSegmentCtx, sub_aggs: &Aggregations, diff --git a/src/aggregation/bucket/composite/accessors.rs b/src/aggregation/bucket/composite/accessors.rs index 4bcfbed6a..b7ad75136 100644 --- a/src/aggregation/bucket/composite/accessors.rs +++ b/src/aggregation/bucket/composite/accessors.rs @@ -75,7 +75,7 @@ impl CompositeSourceAccessors { /// /// Precomputes some values to make 
collection faster. pub fn build_for_source( - reader: &SegmentReader, + reader: &dyn SegmentReader, source: &CompositeAggregationSource, // First option is None when no after key was set in the query, the // second option is None when the after key was set but its value for diff --git a/src/aggregation/bucket/filter.rs b/src/aggregation/bucket/filter.rs index 73518238a..2698b8711 100644 --- a/src/aggregation/bucket/filter.rs +++ b/src/aggregation/bucket/filter.rs @@ -1,4 +1,5 @@ use std::fmt::Debug; +use std::sync::Arc; use common::BitSet; use serde::{Deserialize, Deserializer, Serialize, Serializer}; @@ -402,7 +403,7 @@ pub struct FilterAggReqData { /// The filter aggregation pub req: FilterAggregation, /// The segment reader - pub segment_reader: SegmentReader, + pub segment_reader: Arc, /// Document evaluator for the filter query (precomputed BitSet) /// This is built once when the request data is created pub evaluator: DocumentQueryEvaluator, @@ -416,10 +417,9 @@ impl FilterAggReqData { pub(crate) fn get_memory_consumption(&self) -> usize { // Estimate: name + segment reader reference + bitset + buffer capacity self.name.len() - + std::mem::size_of::() - + self.evaluator.bitset.len() / 8 // BitSet memory (bits to bytes) - + self.matching_docs_buffer.capacity() * std::mem::size_of::() - + std::mem::size_of::() + + self.evaluator.bitset.get_memory_consumption() + + self.matching_docs_buffer.capacity() * std::mem::size_of::() + + std::mem::size_of::() } } @@ -438,7 +438,7 @@ impl DocumentQueryEvaluator { pub(crate) fn new( query: Box, schema: Schema, - segment_reader: &SegmentReader, + segment_reader: &dyn SegmentReader, ) -> crate::Result { let max_doc = segment_reader.max_doc(); diff --git a/src/aggregation/bucket/term_agg.rs b/src/aggregation/bucket/term_agg.rs index b254b79ee..16e2cbeb3 100644 --- a/src/aggregation/bucket/term_agg.rs +++ b/src/aggregation/bucket/term_agg.rs @@ -61,7 +61,7 @@ impl TermsAggReqData { + self .allowed_term_ids .as_ref() - .map(|bs| 
bs.len() / 8) + .map(|bs| bs.get_memory_consumption()) .unwrap_or(0) } } diff --git a/src/aggregation/collector.rs b/src/aggregation/collector.rs index 59e9c677d..2449b493c 100644 --- a/src/aggregation/collector.rs +++ b/src/aggregation/collector.rs @@ -66,7 +66,7 @@ impl Collector for DistributedAggregationCollector { fn for_segment( &self, segment_local_id: crate::SegmentOrdinal, - reader: &crate::SegmentReader, + reader: &dyn SegmentReader, ) -> crate::Result { AggregationSegmentCollector::from_agg_req_and_reader( &self.agg, @@ -96,7 +96,7 @@ impl Collector for AggregationCollector { fn for_segment( &self, segment_local_id: crate::SegmentOrdinal, - reader: &crate::SegmentReader, + reader: &dyn SegmentReader, ) -> crate::Result { AggregationSegmentCollector::from_agg_req_and_reader( &self.agg, @@ -145,7 +145,7 @@ impl AggregationSegmentCollector { /// reader. Also includes validation, e.g. checking field types and existence. pub fn from_agg_req_and_reader( agg: &Aggregations, - reader: &SegmentReader, + reader: &dyn SegmentReader, segment_ordinal: SegmentOrdinal, context: &AggContextParams, ) -> crate::Result { diff --git a/src/collector/count_collector.rs b/src/collector/count_collector.rs index dcd102249..419f7a5d0 100644 --- a/src/collector/count_collector.rs +++ b/src/collector/count_collector.rs @@ -43,7 +43,7 @@ impl Collector for Count { fn for_segment( &self, _: SegmentOrdinal, - _: &SegmentReader, + _: &dyn SegmentReader, ) -> crate::Result { Ok(SegmentCountCollector::default()) } diff --git a/src/collector/docset_collector.rs b/src/collector/docset_collector.rs index a27a39418..8300d7a19 100644 --- a/src/collector/docset_collector.rs +++ b/src/collector/docset_collector.rs @@ -1,7 +1,7 @@ use std::collections::HashSet; use super::{Collector, SegmentCollector}; -use crate::{DocAddress, DocId, Score}; +use crate::{DocAddress, DocId, Score, SegmentReader}; /// Collectors that returns the set of DocAddress that matches the query. 
/// @@ -15,7 +15,7 @@ impl Collector for DocSetCollector { fn for_segment( &self, segment_local_id: crate::SegmentOrdinal, - _segment: &crate::SegmentReader, + _segment: &dyn SegmentReader, ) -> crate::Result { Ok(DocSetChildCollector { segment_local_id, diff --git a/src/collector/facet_collector.rs b/src/collector/facet_collector.rs index 6eb2c3ee7..d0bca6e41 100644 --- a/src/collector/facet_collector.rs +++ b/src/collector/facet_collector.rs @@ -265,7 +265,7 @@ impl Collector for FacetCollector { fn for_segment( &self, _: SegmentOrdinal, - reader: &SegmentReader, + reader: &dyn SegmentReader, ) -> crate::Result { let facet_reader = reader.facet_reader(&self.field_name)?; let facet_dict = facet_reader.facet_dict(); diff --git a/src/collector/filter_collector_wrapper.rs b/src/collector/filter_collector_wrapper.rs index b4bada2ff..f00133a10 100644 --- a/src/collector/filter_collector_wrapper.rs +++ b/src/collector/filter_collector_wrapper.rs @@ -113,7 +113,7 @@ where fn for_segment( &self, segment_local_id: u32, - segment_reader: &SegmentReader, + segment_reader: &dyn SegmentReader, ) -> crate::Result { let column_opt = segment_reader.fast_fields().column_opt(&self.field)?; @@ -287,7 +287,7 @@ where fn for_segment( &self, segment_local_id: u32, - segment_reader: &SegmentReader, + segment_reader: &dyn SegmentReader, ) -> crate::Result { let column_opt = segment_reader.fast_fields().bytes(&self.field)?; diff --git a/src/collector/histogram_collector.rs b/src/collector/histogram_collector.rs index 51105e7b1..e5c6f3f9c 100644 --- a/src/collector/histogram_collector.rs +++ b/src/collector/histogram_collector.rs @@ -6,7 +6,7 @@ use fastdivide::DividerU64; use crate::collector::{Collector, SegmentCollector}; use crate::fastfield::{FastFieldNotAvailableError, FastValue}; use crate::schema::Type; -use crate::{DocId, Score}; +use crate::{DocId, Score, SegmentReader}; /// Histogram builds an histogram of the values of a fastfield for the /// collected DocSet. 
@@ -110,7 +110,7 @@ impl Collector for HistogramCollector { fn for_segment( &self, _segment_local_id: crate::SegmentOrdinal, - segment: &crate::SegmentReader, + segment: &dyn SegmentReader, ) -> crate::Result { let column_opt = segment.fast_fields().u64_lenient(&self.field)?; let (column, _column_type) = column_opt.ok_or_else(|| FastFieldNotAvailableError { diff --git a/src/collector/mod.rs b/src/collector/mod.rs index 0f8360d8d..2d2a8c16b 100644 --- a/src/collector/mod.rs +++ b/src/collector/mod.rs @@ -156,7 +156,7 @@ pub trait Collector: Sync + Send { fn for_segment( &self, segment_local_id: SegmentOrdinal, - segment: &SegmentReader, + segment: &dyn SegmentReader, ) -> crate::Result; /// Returns true iff the collector requires to compute scores for documents. @@ -174,7 +174,7 @@ pub trait Collector: Sync + Send { &self, weight: &dyn Weight, segment_ord: u32, - reader: &SegmentReader, + reader: &dyn SegmentReader, ) -> crate::Result<::Fruit> { let with_scoring = self.requires_scoring(); let mut segment_collector = self.for_segment(segment_ord, reader)?; @@ -186,7 +186,7 @@ pub trait Collector: Sync + Send { pub(crate) fn default_collect_segment_impl( segment_collector: &mut TSegmentCollector, weight: &dyn Weight, - reader: &SegmentReader, + reader: &dyn SegmentReader, with_scoring: bool, ) -> crate::Result<()> { match (reader.alive_bitset(), with_scoring) { @@ -255,7 +255,7 @@ impl Collector for Option { fn for_segment( &self, segment_local_id: SegmentOrdinal, - segment: &SegmentReader, + segment: &dyn SegmentReader, ) -> crate::Result { Ok(if let Some(inner) = self { let inner_segment_collector = inner.for_segment(segment_local_id, segment)?; @@ -336,7 +336,7 @@ where fn for_segment( &self, segment_local_id: u32, - segment: &SegmentReader, + segment: &dyn SegmentReader, ) -> crate::Result { let left = self.0.for_segment(segment_local_id, segment)?; let right = self.1.for_segment(segment_local_id, segment)?; @@ -407,7 +407,7 @@ where fn for_segment( &self, 
segment_local_id: u32, - segment: &SegmentReader, + segment: &dyn SegmentReader, ) -> crate::Result { let one = self.0.for_segment(segment_local_id, segment)?; let two = self.1.for_segment(segment_local_id, segment)?; @@ -487,7 +487,7 @@ where fn for_segment( &self, segment_local_id: u32, - segment: &SegmentReader, + segment: &dyn SegmentReader, ) -> crate::Result { let one = self.0.for_segment(segment_local_id, segment)?; let two = self.1.for_segment(segment_local_id, segment)?; diff --git a/src/collector/multi_collector.rs b/src/collector/multi_collector.rs index 14779c4a4..5112ce446 100644 --- a/src/collector/multi_collector.rs +++ b/src/collector/multi_collector.rs @@ -24,7 +24,7 @@ impl Collector for CollectorWrapper { fn for_segment( &self, segment_local_id: u32, - reader: &SegmentReader, + reader: &dyn SegmentReader, ) -> crate::Result> { let child = self.0.for_segment(segment_local_id, reader)?; Ok(Box::new(SegmentCollectorWrapper(child))) @@ -209,7 +209,7 @@ impl Collector for MultiCollector<'_> { fn for_segment( &self, segment_local_id: SegmentOrdinal, - segment: &SegmentReader, + segment: &dyn SegmentReader, ) -> crate::Result { let children = self .collector_wrappers diff --git a/src/collector/sort_key/order.rs b/src/collector/sort_key/order.rs index 3cac357ad..c5df7d978 100644 --- a/src/collector/sort_key/order.rs +++ b/src/collector/sort_key/order.rs @@ -5,7 +5,7 @@ use serde::{Deserialize, Serialize}; use crate::collector::{SegmentSortKeyComputer, SortKeyComputer}; use crate::schema::{OwnedValue, Schema}; -use crate::{DocId, Order, Score}; +use crate::{DocId, Order, Score, SegmentReader}; fn compare_owned_value(lhs: &OwnedValue, rhs: &OwnedValue) -> Ordering { match (lhs, rhs) { @@ -430,7 +430,7 @@ where fn segment_sort_key_computer( &self, - segment_reader: &crate::SegmentReader, + segment_reader: &dyn SegmentReader, ) -> crate::Result { let child = self.0.segment_sort_key_computer(segment_reader)?; Ok(SegmentSortKeyComputerWithComparator { @@ 
-468,7 +468,7 @@ where fn segment_sort_key_computer( &self, - segment_reader: &crate::SegmentReader, + segment_reader: &dyn SegmentReader, ) -> crate::Result { let child = self.0.segment_sort_key_computer(segment_reader)?; Ok(SegmentSortKeyComputerWithComparator { diff --git a/src/collector/sort_key/sort_by_bytes.rs b/src/collector/sort_key/sort_by_bytes.rs index f6b10af2a..b6254e0f6 100644 --- a/src/collector/sort_key/sort_by_bytes.rs +++ b/src/collector/sort_key/sort_by_bytes.rs @@ -32,7 +32,7 @@ impl SortKeyComputer for SortByBytes { fn segment_sort_key_computer( &self, - segment_reader: &crate::SegmentReader, + segment_reader: &dyn crate::SegmentReader, ) -> crate::Result { let bytes_column_opt = segment_reader.fast_fields().bytes(&self.column_name)?; Ok(ByBytesColumnSegmentSortKeyComputer { bytes_column_opt }) diff --git a/src/collector/sort_key/sort_by_erased_type.rs b/src/collector/sort_key/sort_by_erased_type.rs index 9ff4c2b40..435cc37a2 100644 --- a/src/collector/sort_key/sort_by_erased_type.rs +++ b/src/collector/sort_key/sort_by_erased_type.rs @@ -6,7 +6,7 @@ use crate::collector::sort_key::{ use crate::collector::{SegmentSortKeyComputer, SortKeyComputer}; use crate::fastfield::FastFieldNotAvailableError; use crate::schema::OwnedValue; -use crate::{DateTime, DocId, Score}; +use crate::{DateTime, DocId, Score, SegmentReader}; /// Sort by the boxed / OwnedValue representation of either a fast field, or of the score. 
/// @@ -86,7 +86,7 @@ impl SortKeyComputer for SortByErasedType { fn segment_sort_key_computer( &self, - segment_reader: &crate::SegmentReader, + segment_reader: &dyn SegmentReader, ) -> crate::Result { let inner: Box = match self { Self::Field(column_name) => { diff --git a/src/collector/sort_key/sort_by_score.rs b/src/collector/sort_key/sort_by_score.rs index a23660e56..ca1f11a02 100644 --- a/src/collector/sort_key/sort_by_score.rs +++ b/src/collector/sort_key/sort_by_score.rs @@ -1,6 +1,6 @@ use crate::collector::sort_key::NaturalComparator; use crate::collector::{SegmentSortKeyComputer, SortKeyComputer, TopNComputer}; -use crate::{DocAddress, DocId, Score}; +use crate::{DocAddress, DocId, Score, SegmentReader}; /// Sort by similarity score. #[derive(Clone, Debug, Copy)] @@ -19,7 +19,7 @@ impl SortKeyComputer for SortBySimilarityScore { fn segment_sort_key_computer( &self, - _segment_reader: &crate::SegmentReader, + _segment_reader: &dyn SegmentReader, ) -> crate::Result { Ok(SortBySimilarityScore) } @@ -29,7 +29,7 @@ impl SortKeyComputer for SortBySimilarityScore { &self, k: usize, weight: &dyn crate::query::Weight, - reader: &crate::SegmentReader, + reader: &dyn SegmentReader, segment_ord: u32, ) -> crate::Result> { let mut top_n: TopNComputer = diff --git a/src/collector/sort_key/sort_by_static_fast_value.rs b/src/collector/sort_key/sort_by_static_fast_value.rs index 44a4e1d8d..6f2e67a88 100644 --- a/src/collector/sort_key/sort_by_static_fast_value.rs +++ b/src/collector/sort_key/sort_by_static_fast_value.rs @@ -61,7 +61,7 @@ impl SortKeyComputer for SortByStaticFastValue { fn segment_sort_key_computer( &self, - segment_reader: &SegmentReader, + segment_reader: &dyn SegmentReader, ) -> crate::Result { let sort_column_opt = segment_reader.fast_fields().u64_lenient(&self.field)?; let (sort_column, _sort_column_type) = diff --git a/src/collector/sort_key/sort_by_string.rs b/src/collector/sort_key/sort_by_string.rs index 2dd0b4592..05a30c8b1 100644 --- 
a/src/collector/sort_key/sort_by_string.rs +++ b/src/collector/sort_key/sort_by_string.rs @@ -3,7 +3,7 @@ use columnar::StrColumn; use crate::collector::sort_key::NaturalComparator; use crate::collector::{SegmentSortKeyComputer, SortKeyComputer}; use crate::termdict::TermOrdinal; -use crate::{DocId, Score}; +use crate::{DocId, Score, SegmentReader}; /// Sort by the first value of a string column. /// @@ -35,7 +35,7 @@ impl SortKeyComputer for SortByString { fn segment_sort_key_computer( &self, - segment_reader: &crate::SegmentReader, + segment_reader: &dyn SegmentReader, ) -> crate::Result { let str_column_opt = segment_reader.fast_fields().str(&self.column_name)?; Ok(ByStringColumnSegmentSortKeyComputer { str_column_opt }) diff --git a/src/collector/sort_key/sort_key_computer.rs b/src/collector/sort_key/sort_key_computer.rs index 6aab919a9..35b08dc45 100644 --- a/src/collector/sort_key/sort_key_computer.rs +++ b/src/collector/sort_key/sort_key_computer.rs @@ -119,7 +119,7 @@ pub trait SortKeyComputer: Sync { &self, k: usize, weight: &dyn crate::query::Weight, - reader: &crate::SegmentReader, + reader: &dyn SegmentReader, segment_ord: u32, ) -> crate::Result> { let with_scoring = self.requires_scoring(); @@ -135,7 +135,7 @@ pub trait SortKeyComputer: Sync { } /// Builds a child sort key computer for a specific segment. 
- fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result; + fn segment_sort_key_computer(&self, segment_reader: &dyn SegmentReader) -> Result; } impl SortKeyComputer @@ -156,7 +156,7 @@ where (self.0.comparator(), self.1.comparator()) } - fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result { + fn segment_sort_key_computer(&self, segment_reader: &dyn SegmentReader) -> Result { Ok(( self.0.segment_sort_key_computer(segment_reader)?, self.1.segment_sort_key_computer(segment_reader)?, @@ -357,7 +357,7 @@ where ) } - fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result { + fn segment_sort_key_computer(&self, segment_reader: &dyn SegmentReader) -> Result { let sort_key_computer1 = self.0.segment_sort_key_computer(segment_reader)?; let sort_key_computer2 = self.1.segment_sort_key_computer(segment_reader)?; let sort_key_computer3 = self.2.segment_sort_key_computer(segment_reader)?; @@ -420,7 +420,7 @@ where SortKeyComputer4::Comparator, ); - fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result { + fn segment_sort_key_computer(&self, segment_reader: &dyn SegmentReader) -> Result { let sort_key_computer1 = self.0.segment_sort_key_computer(segment_reader)?; let sort_key_computer2 = self.1.segment_sort_key_computer(segment_reader)?; let sort_key_computer3 = self.2.segment_sort_key_computer(segment_reader)?; @@ -454,7 +454,7 @@ where impl SortKeyComputer for F where - F: 'static + Send + Sync + Fn(&SegmentReader) -> SegmentF, + F: 'static + Send + Sync + Fn(&dyn SegmentReader) -> SegmentF, SegmentF: 'static + FnMut(DocId) -> TSortKey, TSortKey: 'static + PartialOrd + Clone + Send + Sync + std::fmt::Debug, { @@ -462,7 +462,7 @@ where type Child = SegmentF; type Comparator = NaturalComparator; - fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result { + fn segment_sort_key_computer(&self, segment_reader: &dyn SegmentReader) -> Result { 
Ok((self)(segment_reader)) } } @@ -509,10 +509,10 @@ mod tests { #[test] fn test_lazy_score_computer() { - let score_computer_primary = |_segment_reader: &SegmentReader| |_doc: DocId| 200u32; + let score_computer_primary = |_segment_reader: &dyn SegmentReader| |_doc: DocId| 200u32; let call_count = Arc::new(AtomicUsize::new(0)); let call_count_clone = call_count.clone(); - let score_computer_secondary = move |_segment_reader: &SegmentReader| { + let score_computer_secondary = move |_segment_reader: &dyn SegmentReader| { let call_count_new_clone = call_count_clone.clone(); move |_doc: DocId| { call_count_new_clone.fetch_add(1, AtomicOrdering::SeqCst); @@ -572,10 +572,10 @@ mod tests { #[test] fn test_lazy_score_computer_dynamic_ordering() { - let score_computer_primary = |_segment_reader: &SegmentReader| |_doc: DocId| 200u32; + let score_computer_primary = |_segment_reader: &dyn SegmentReader| |_doc: DocId| 200u32; let call_count = Arc::new(AtomicUsize::new(0)); let call_count_clone = call_count.clone(); - let score_computer_secondary = move |_segment_reader: &SegmentReader| { + let score_computer_secondary = move |_segment_reader: &dyn SegmentReader| { let call_count_new_clone = call_count_clone.clone(); move |_doc: DocId| { call_count_new_clone.fetch_add(1, AtomicOrdering::SeqCst); diff --git a/src/collector/sort_key_top_collector.rs b/src/collector/sort_key_top_collector.rs index 9ca47581b..6995973a2 100644 --- a/src/collector/sort_key_top_collector.rs +++ b/src/collector/sort_key_top_collector.rs @@ -32,7 +32,11 @@ where TSortKeyComputer: SortKeyComputer + Send + Sync + 'static self.sort_key_computer.check_schema(schema) } - fn for_segment(&self, segment_ord: u32, segment_reader: &SegmentReader) -> Result { + fn for_segment( + &self, + segment_ord: u32, + segment_reader: &dyn SegmentReader, + ) -> Result { let segment_sort_key_computer = self .sort_key_computer .segment_sort_key_computer(segment_reader)?; @@ -63,7 +67,7 @@ where TSortKeyComputer: SortKeyComputer 
+ Send + Sync + 'static &self, weight: &dyn Weight, segment_ord: u32, - reader: &SegmentReader, + reader: &dyn SegmentReader, ) -> crate::Result> { let k = self.doc_range.end; let docs = self diff --git a/src/collector/tests.rs b/src/collector/tests.rs index 61b6a595b..ef8068124 100644 --- a/src/collector/tests.rs +++ b/src/collector/tests.rs @@ -5,7 +5,7 @@ use crate::query::{AllQuery, QueryParser}; use crate::schema::{Schema, FAST, TEXT}; use crate::time::format_description::well_known::Rfc3339; use crate::time::OffsetDateTime; -use crate::{DateTime, DocAddress, Index, Searcher, TantivyDocument}; +use crate::{DateTime, DocAddress, Index, Searcher, SegmentReader, TantivyDocument}; pub const TEST_COLLECTOR_WITH_SCORE: TestCollector = TestCollector { compute_score: true, @@ -109,7 +109,7 @@ impl Collector for TestCollector { fn for_segment( &self, segment_id: SegmentOrdinal, - _reader: &SegmentReader, + _reader: &dyn SegmentReader, ) -> crate::Result { Ok(TestSegmentCollector { segment_id, @@ -180,7 +180,7 @@ impl Collector for FastFieldTestCollector { fn for_segment( &self, _: SegmentOrdinal, - segment_reader: &SegmentReader, + segment_reader: &dyn SegmentReader, ) -> crate::Result { let reader = segment_reader .fast_fields() @@ -243,7 +243,7 @@ impl Collector for BytesFastFieldTestCollector { fn for_segment( &self, _segment_local_id: u32, - segment_reader: &SegmentReader, + segment_reader: &dyn SegmentReader, ) -> crate::Result { let column_opt = segment_reader.fast_fields().bytes(&self.field)?; Ok(BytesFastFieldSegmentCollector { diff --git a/src/collector/top_score_collector.rs b/src/collector/top_score_collector.rs index 0ce1c611a..086364853 100644 --- a/src/collector/top_score_collector.rs +++ b/src/collector/top_score_collector.rs @@ -393,7 +393,7 @@ impl TopDocs { /// // This is where we build our collector with our custom score. 
/// let top_docs_by_custom_score = TopDocs /// ::with_limit(10) - /// .tweak_score(move |segment_reader: &SegmentReader| { + /// .tweak_score(move |segment_reader: &dyn SegmentReader| { /// // The argument is a function that returns our scoring /// // function. /// // @@ -442,7 +442,7 @@ pub struct TweakScoreFn(F); impl SortKeyComputer for TweakScoreFn where - F: 'static + Send + Sync + Fn(&SegmentReader) -> TTweakScoreSortKeyFn, + F: 'static + Send + Sync + Fn(&dyn SegmentReader) -> TTweakScoreSortKeyFn, TTweakScoreSortKeyFn: 'static + Fn(DocId, Score) -> TSortKey, TweakScoreSegmentSortKeyComputer: SegmentSortKeyComputer, @@ -458,7 +458,7 @@ where fn segment_sort_key_computer( &self, - segment_reader: &SegmentReader, + segment_reader: &dyn SegmentReader, ) -> crate::Result { Ok({ TweakScoreSegmentSortKeyComputer { @@ -1525,7 +1525,7 @@ mod tests { let text_query = query_parser.parse_query("droopy tax")?; let collector = TopDocs::with_limit(2) .and_offset(1) - .order_by(move |_segment_reader: &SegmentReader| move |doc: DocId| doc); + .order_by(move |_segment_reader: &dyn SegmentReader| move |doc: DocId| doc); let score_docs: Vec<(u32, DocAddress)> = index.reader()?.searcher().search(&text_query, &collector)?; assert_eq!( @@ -1543,7 +1543,7 @@ mod tests { let text_query = query_parser.parse_query("droopy tax").unwrap(); let collector = TopDocs::with_limit(2) .and_offset(1) - .order_by(move |_segment_reader: &SegmentReader| move |doc: DocId| doc); + .order_by(move |_segment_reader: &dyn SegmentReader| move |doc: DocId| doc); let score_docs: Vec<(u32, DocAddress)> = index .reader() .unwrap() diff --git a/src/core/json_utils.rs b/src/core/json_utils.rs index 7f2094e53..8a3ac1e2b 100644 --- a/src/core/json_utils.rs +++ b/src/core/json_utils.rs @@ -4,7 +4,7 @@ use common::{replace_in_place, JsonPathWriter}; use rustc_hash::FxHashMap; use crate::indexer::indexing_term::IndexingTerm; -use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter}; +use 
crate::postings::{IndexingContext, IndexingPosition, PostingsWriter as _, PostingsWriterEnum}; use crate::schema::document::{ReferenceValue, ReferenceValueLeaf, Value}; use crate::schema::{Type, DATE_TIME_PRECISION_INDEXED}; use crate::time::format_description::well_known::Rfc3339; @@ -80,7 +80,7 @@ fn index_json_object<'a, V: Value<'a>>( text_analyzer: &mut TextAnalyzer, term_buffer: &mut IndexingTerm, json_path_writer: &mut JsonPathWriter, - postings_writer: &mut dyn PostingsWriter, + postings_writer: &mut PostingsWriterEnum, ctx: &mut IndexingContext, positions_per_path: &mut IndexingPositionsPerPath, ) { @@ -110,7 +110,7 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>( text_analyzer: &mut TextAnalyzer, term_buffer: &mut IndexingTerm, json_path_writer: &mut JsonPathWriter, - postings_writer: &mut dyn PostingsWriter, + postings_writer: &mut PostingsWriterEnum, ctx: &mut IndexingContext, positions_per_path: &mut IndexingPositionsPerPath, ) { diff --git a/src/core/mod.rs b/src/core/mod.rs index db4ab2896..6e384c16b 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -8,7 +8,7 @@ use std::path::Path; use once_cell::sync::Lazy; pub use self::executor::Executor; -pub use self::searcher::{Searcher, SearcherGeneration}; +pub use self::searcher::{Searcher, SearcherContext, SearcherGeneration}; /// The meta file contains all the information about the list of segments and the schema /// of the index. 
diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 9603d0f4f..b25760224 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -4,13 +4,13 @@ use std::{fmt, io}; use crate::collector::Collector; use crate::core::Executor; -use crate::index::{SegmentId, SegmentReader}; +use crate::index::{Index, SegmentId, SegmentReader}; use crate::query::{Bm25StatisticsProvider, EnableScoring, Query}; -use crate::schema::document::DocumentDeserialize; -use crate::schema::{Schema, Term}; +use crate::schema::{Field, FieldType, Schema, TantivyDocument, Term}; use crate::space_usage::SearcherSpaceUsage; -use crate::store::{CacheStats, StoreReader}; -use crate::{DocAddress, Index, Opstamp, TrackedObject}; +use crate::store::{CacheStats, StoreReader, DOCSTORE_CACHE_CAPACITY}; +use crate::tokenizer::{TextAnalyzer, TokenizerManager}; +use crate::{DocAddress, Inventory, Opstamp, TantivyError, TrackedObject}; /// Identifies the searcher generation accessed by a [`Searcher`]. /// @@ -36,7 +36,7 @@ pub struct SearcherGeneration { impl SearcherGeneration { pub(crate) fn from_segment_readers( - segment_readers: &[SegmentReader], + segment_readers: &[Arc], generation_id: u64, ) -> Self { let mut segment_id_to_del_opstamp = BTreeMap::new(); @@ -61,6 +61,103 @@ impl SearcherGeneration { } } +/// Search-time context required by a [`Searcher`]. +#[derive(Clone)] +pub struct SearcherContext { + schema: Schema, + executor: Executor, + tokenizers: TokenizerManager, + fast_field_tokenizers: TokenizerManager, +} + +impl SearcherContext { + /// Creates a context from explicit search-time components. + pub fn new( + schema: Schema, + executor: Executor, + tokenizers: TokenizerManager, + fast_field_tokenizers: TokenizerManager, + ) -> SearcherContext { + SearcherContext { + schema, + executor, + tokenizers, + fast_field_tokenizers, + } + } + + /// Creates a context from an index. 
+ pub fn from_index(index: &Index) -> SearcherContext { + SearcherContext::new( + index.schema(), + index.search_executor().clone(), + index.tokenizers().clone(), + index.fast_field_tokenizer().clone(), + ) + } + + /// Access the schema associated with this context. + pub fn schema(&self) -> &Schema { + &self.schema + } + + /// Access the executor associated with this context. + pub fn search_executor(&self) -> &Executor { + &self.executor + } + + /// Access the tokenizer manager associated with this context. + pub fn tokenizers(&self) -> &TokenizerManager { + &self.tokenizers + } + + /// Access the fast field tokenizer manager associated with this context. + pub fn fast_field_tokenizer(&self) -> &TokenizerManager { + &self.fast_field_tokenizers + } + + /// Get the tokenizer associated with a specific field. + pub fn tokenizer_for_field(&self, field: Field) -> crate::Result { + let field_entry = self.schema.get_field_entry(field); + let field_type = field_entry.field_type(); + let indexing_options_opt = match field_type { + FieldType::JsonObject(options) => options.get_text_indexing_options(), + FieldType::Str(options) => options.get_indexing_options(), + _ => { + return Err(TantivyError::SchemaError(format!( + "{:?} is not a text field.", + field_entry.name() + ))) + } + }; + let indexing_options = indexing_options_opt.ok_or_else(|| { + TantivyError::InvalidArgument(format!( + "No indexing options set for field {field_entry:?}" + )) + })?; + + self.tokenizers + .get(indexing_options.tokenizer()) + .ok_or_else(|| { + TantivyError::InvalidArgument(format!( + "No Tokenizer found for field {field_entry:?}" + )) + }) + } +} + +impl From<&Index> for SearcherContext { + fn from(index: &Index) -> Self { + SearcherContext::from_index(index) + } +} + +impl From for SearcherContext { + fn from(index: Index) -> Self { + SearcherContext::from(&index) + } +} + /// Holds a list of `SegmentReader`s ready for search. 
/// /// It guarantees that the `Segment` will not be removed before @@ -71,9 +168,66 @@ pub struct Searcher { } impl Searcher { - /// Returns the `Index` associated with the `Searcher` - pub fn index(&self) -> &Index { - &self.inner.index + /// Creates a `Searcher` from an arbitrary list of segment readers. + /// + /// This is useful when segment readers are not opened from + /// `IndexReader` / `meta.json` (e.g. external segment sources). + /// The generated [`SearcherGeneration`] uses `generation_id = 0`. + pub fn from_segment_readers>( + context: Ctx, + segment_readers: Vec>, + ) -> crate::Result { + Self::from_segment_readers_with_generation_id(context, segment_readers, 0) + } + + /// Same as [`Searcher::from_segment_readers`] but allows setting + /// a custom generation id. + pub fn from_segment_readers_with_generation_id>( + context: Ctx, + segment_readers: Vec>, + generation_id: u64, + ) -> crate::Result { + let context = context.into(); + let generation = SearcherGeneration::from_segment_readers(&segment_readers, generation_id); + let tracked_generation = Inventory::default().track(generation); + let inner = SearcherInner::new( + context, + segment_readers, + tracked_generation, + DOCSTORE_CACHE_CAPACITY, + )?; + Ok(Arc::new(inner).into()) + } + + /// Returns the search context associated with the `Searcher`. + pub fn context(&self) -> &SearcherContext { + &self.inner.context + } + + /// Deprecated alias for [`Searcher::context`]. + #[deprecated(note = "use Searcher::context()")] + pub fn index(&self) -> &SearcherContext { + self.context() + } + + /// Access the search executor associated with this searcher. + pub fn search_executor(&self) -> &Executor { + self.context().search_executor() + } + + /// Access the tokenizer manager associated with this searcher. + pub fn tokenizers(&self) -> &TokenizerManager { + self.context().tokenizers() + } + + /// Access the fast field tokenizer manager associated with this searcher. 
+ pub fn fast_field_tokenizer(&self) -> &TokenizerManager { + self.context().fast_field_tokenizer() + } + + /// Get the tokenizer associated with a specific field. + pub fn tokenizer_for_field(&self, field: Field) -> crate::Result { + self.context().tokenizer_for_field(field) } /// [`SearcherGeneration`] which identifies the version of the snapshot held by this `Searcher`. @@ -85,7 +239,7 @@ impl Searcher { /// /// The searcher uses the segment ordinal to route the /// request to the right `Segment`. - pub fn doc(&self, doc_address: DocAddress) -> crate::Result { + pub fn doc(&self, doc_address: DocAddress) -> crate::Result { let store_reader = &self.inner.store_readers[doc_address.segment_ord as usize]; store_reader.get(doc_address.doc_id) } @@ -105,18 +259,15 @@ impl Searcher { /// Fetches a document in an asynchronous manner. #[cfg(feature = "quickwit")] - pub async fn doc_async( - &self, - doc_address: DocAddress, - ) -> crate::Result { - let executor = self.inner.index.search_executor(); + pub async fn doc_async(&self, doc_address: DocAddress) -> crate::Result { + let executor = self.search_executor(); let store_reader = &self.inner.store_readers[doc_address.segment_ord as usize]; store_reader.get_async(doc_address.doc_id, executor).await } /// Access the schema associated with the index of this searcher. pub fn schema(&self) -> &Schema { - &self.inner.schema + self.context().schema() } /// Returns the overall number of documents in the index. 
@@ -154,13 +305,13 @@ impl Searcher { } /// Return the list of segment readers - pub fn segment_readers(&self) -> &[SegmentReader] { + pub fn segment_readers(&self) -> &[Arc] { &self.inner.segment_readers } /// Returns the segment_reader associated with the given segment_ord - pub fn segment_reader(&self, segment_ord: u32) -> &SegmentReader { - &self.inner.segment_readers[segment_ord as usize] + pub fn segment_reader(&self, segment_ord: u32) -> &dyn SegmentReader { + self.inner.segment_readers[segment_ord as usize].as_ref() } /// Runs a query on the segment readers wrapped by the searcher. @@ -201,7 +352,7 @@ impl Searcher { } else { EnableScoring::disabled_from_searcher(self) }; - let executor = self.inner.index.search_executor(); + let executor = self.search_executor(); self.search_with_executor(query, collector, executor, enabled_scoring) } @@ -229,7 +380,11 @@ impl Searcher { let segment_readers = self.segment_readers(); let fruits = executor.map( |(segment_ord, segment_reader)| { - collector.collect_segment(weight.as_ref(), segment_ord as u32, segment_reader) + collector.collect_segment( + weight.as_ref(), + segment_ord as u32, + segment_reader.as_ref(), + ) }, segment_readers.iter().enumerate(), )?; @@ -257,19 +412,17 @@ impl From> for Searcher { /// It guarantees that the `Segment` will not be removed before /// the destruction of the `Searcher`. 
pub(crate) struct SearcherInner { - schema: Schema, - index: Index, - segment_readers: Vec, - store_readers: Vec, + context: SearcherContext, + segment_readers: Vec>, + store_readers: Vec>, generation: TrackedObject, } impl SearcherInner { /// Creates a new `Searcher` pub(crate) fn new( - schema: Schema, - index: Index, - segment_readers: Vec, + context: SearcherContext, + segment_readers: Vec>, generation: TrackedObject, doc_store_cache_num_blocks: usize, ) -> io::Result { @@ -281,14 +434,13 @@ impl SearcherInner { generation.segments(), "Set of segments referenced by this Searcher and its SearcherGeneration must match" ); - let store_readers: Vec = segment_readers + let store_readers: Vec> = segment_readers .iter() .map(|segment_reader| segment_reader.get_store_reader(doc_store_cache_num_blocks)) .collect::>>()?; Ok(SearcherInner { - schema, - index, + context, segment_readers, store_readers, generation, @@ -301,7 +453,7 @@ impl fmt::Debug for Searcher { let segment_ids = self .segment_readers() .iter() - .map(SegmentReader::segment_id) + .map(|segment_reader| segment_reader.segment_id()) .collect::>(); write!(f, "Searcher({segment_ids:?})") } diff --git a/src/core/tests.rs b/src/core/tests.rs index 62baedf1d..d97e65884 100644 --- a/src/core/tests.rs +++ b/src/core/tests.rs @@ -7,8 +7,8 @@ use crate::query::TermQuery; use crate::schema::{Field, IndexRecordOption, Schema, INDEXED, STRING, TEXT}; use crate::tokenizer::TokenizerManager; use crate::{ - Directory, DocSet, Index, IndexBuilder, IndexReader, IndexSettings, IndexWriter, ReloadPolicy, - TantivyDocument, Term, + Directory, DocSet, Executor, Index, IndexBuilder, IndexReader, IndexSettings, IndexWriter, + ReloadPolicy, Searcher, SearcherContext, TantivyDocument, Term, }; #[test] @@ -300,6 +300,40 @@ fn test_single_segment_index_writer() -> crate::Result<()> { Ok(()) } +#[test] +fn test_searcher_from_external_segment_readers() -> crate::Result<()> { + let mut schema_builder = Schema::builder(); + let 
text_field = schema_builder.add_text_field("text", TEXT); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema.clone()); + let mut writer: IndexWriter = index.writer_for_tests()?; + writer.add_document(doc!(text_field => "hello"))?; + writer.add_document(doc!(text_field => "hello"))?; + writer.commit()?; + + let reader = index.reader()?; + let searcher = reader.searcher(); + let segment_readers = searcher.segment_readers().to_vec(); + let context = SearcherContext::new( + schema, + Executor::single_thread(), + TokenizerManager::default(), + TokenizerManager::default(), + ); + let custom_searcher = + Searcher::from_segment_readers_with_generation_id(context, segment_readers, 42)?; + + let term_query = TermQuery::new( + Term::from_field_text(text_field, "hello"), + IndexRecordOption::Basic, + ); + let count = custom_searcher.search(&term_query, &Count)?; + assert_eq!(count, 2); + assert_eq!(custom_searcher.generation().generation_id(), 42); + assert_eq!(custom_searcher.segment_readers().len(), 1); + Ok(()) +} + #[test] fn test_merging_segment_update_docfreq() { let mut schema_builder = Schema::builder(); diff --git a/src/directory/composite_file.rs b/src/directory/composite_file.rs index 93e063880..6da24a59b 100644 --- a/src/directory/composite_file.rs +++ b/src/directory/composite_file.rs @@ -167,7 +167,9 @@ impl CompositeFile { .map(|byte_range| self.data.slice(byte_range.clone())) } - /// Returns the space usage per field in this composite file. + /// Returns per-field byte usage for all slices stored in this composite file. + /// + /// The provided `schema` is used to resolve field ids into field names. 
pub fn space_usage(&self, schema: &Schema) -> PerFieldSpaceUsage { let mut fields = Vec::new(); for (&field_addr, byte_range) in &self.offsets_index { diff --git a/src/docset.rs b/src/docset.rs index 8e72281d2..8b8985004 100644 --- a/src/docset.rs +++ b/src/docset.rs @@ -1,4 +1,7 @@ -use std::borrow::{Borrow, BorrowMut}; +use std::borrow::BorrowMut; +use std::ops::{Deref as _, DerefMut as _}; + +use common::BitSet; use crate::fastfield::AliveBitSet; use crate::DocId; @@ -130,6 +133,19 @@ pub trait DocSet: Send { buffer.len() } + /// Fills the given bitset with the documents in the docset. + /// + /// If the docset max_doc is smaller than the largest doc, this function might not consume the + /// docset entirely. + fn fill_bitset(&mut self, bitset: &mut BitSet) { + let bitset_max_value: u32 = bitset.max_value(); + let mut doc = self.doc(); + while doc < bitset_max_value { + bitset.insert(doc); + doc = self.advance(); + } + } + /// Returns the current document /// Right after creating a new `DocSet`, the docset points to the first document. 
/// @@ -233,51 +249,59 @@ impl DocSet for &mut dyn DocSet { fn count_including_deleted(&mut self) -> u32 { (**self).count_including_deleted() } + + fn fill_bitset(&mut self, bitset: &mut BitSet) { + (**self).fill_bitset(bitset); + } } impl DocSet for Box { + #[inline] fn advance(&mut self) -> DocId { - let unboxed: &mut TDocSet = self.borrow_mut(); - unboxed.advance() + self.deref_mut().advance() } + #[inline] fn seek(&mut self, target: DocId) -> DocId { - let unboxed: &mut TDocSet = self.borrow_mut(); - unboxed.seek(target) + self.deref_mut().seek(target) } + #[inline] fn seek_danger(&mut self, target: DocId) -> SeekDangerResult { let unboxed: &mut TDocSet = self.borrow_mut(); unboxed.seek_danger(target) } + #[inline] fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize { - let unboxed: &mut TDocSet = self.borrow_mut(); - unboxed.fill_buffer(buffer) + self.deref_mut().fill_buffer(buffer) } + #[inline] fn doc(&self) -> DocId { - let unboxed: &TDocSet = self.borrow(); - unboxed.doc() + self.deref().doc() } + #[inline] fn size_hint(&self) -> u32 { - let unboxed: &TDocSet = self.borrow(); - unboxed.size_hint() + self.deref().size_hint() } + #[inline] fn cost(&self) -> u64 { - let unboxed: &TDocSet = self.borrow(); - unboxed.cost() + self.deref().cost() } + #[inline] fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 { - let unboxed: &mut TDocSet = self.borrow_mut(); - unboxed.count(alive_bitset) + self.deref_mut().count(alive_bitset) } fn count_including_deleted(&mut self) -> u32 { - let unboxed: &mut TDocSet = self.borrow_mut(); - unboxed.count_including_deleted() + self.deref_mut().count_including_deleted() + } + + fn fill_bitset(&mut self, bitset: &mut BitSet) { + self.deref_mut().fill_bitset(bitset); } } diff --git a/src/fastfield/facet_reader.rs b/src/fastfield/facet_reader.rs index b93cff20b..96e47f256 100644 --- a/src/fastfield/facet_reader.rs +++ b/src/fastfield/facet_reader.rs @@ -84,9 +84,7 @@ mod tests { let mut facet = 
Facet::default(); facet_reader.facet_from_ord(0, &mut facet).unwrap(); assert_eq!(facet.to_path_string(), "/a/b"); - let doc = searcher - .doc::(DocAddress::new(0u32, 0u32)) - .unwrap(); + let doc = searcher.doc(DocAddress::new(0u32, 0u32)).unwrap(); let value = doc .get_first(facet_field) .and_then(|v| v.as_value().as_facet()); @@ -145,7 +143,7 @@ mod tests { let mut facet_ords = Vec::new(); facet_ords.extend(facet_reader.facet_ords(0u32)); assert_eq!(&facet_ords, &[0u64]); - let doc = searcher.doc::(DocAddress::new(0u32, 0u32))?; + let doc = searcher.doc(DocAddress::new(0u32, 0u32))?; let value: Option = doc .get_first(facet_field) .and_then(|v| v.as_facet()) diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index aca53c212..4478c6864 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -96,7 +96,7 @@ mod tests { }; use crate::time::OffsetDateTime; use crate::tokenizer::{LowerCaser, RawTokenizer, TextAnalyzer, TokenizerManager}; - use crate::{Index, IndexWriter, SegmentReader}; + use crate::{Index, IndexWriter}; pub static SCHEMA: Lazy = Lazy::new(|| { let mut schema_builder = Schema::builder(); @@ -430,7 +430,7 @@ mod tests { .searcher() .segment_readers() .iter() - .map(SegmentReader::segment_id) + .map(|segment_reader| segment_reader.segment_id()) .collect(); assert_eq!(segment_ids.len(), 2); index_writer.merge(&segment_ids[..]).wait().unwrap(); diff --git a/src/fastfield/readers.rs b/src/fastfield/readers.rs index 083f79532..c98683528 100644 --- a/src/fastfield/readers.rs +++ b/src/fastfield/readers.rs @@ -25,7 +25,8 @@ pub struct FastFieldReaders { } impl FastFieldReaders { - pub(crate) fn open(fast_field_file: FileSlice, schema: Schema) -> io::Result { + /// Opens the segment fast-field container and binds it to a schema. 
+ pub fn open(fast_field_file: FileSlice, schema: Schema) -> io::Result { let columnar = Arc::new(ColumnarReader::open(fast_field_file)?); Ok(FastFieldReaders { columnar, schema }) } @@ -39,7 +40,8 @@ impl FastFieldReaders { self.resolve_column_name_given_default_field(column_name, default_field_opt) } - pub(crate) fn space_usage(&self) -> io::Result { + /// Returns per-field space usage for all loaded fast-field columns. + pub fn space_usage(&self) -> io::Result { let mut per_field_usages: Vec = Default::default(); for (mut field_name, column_handle) in self.columnar.iter_columns()? { json_path_sep_to_dot(&mut field_name); @@ -51,7 +53,8 @@ impl FastFieldReaders { Ok(PerFieldSpaceUsage::new(per_field_usages)) } - pub(crate) fn columnar(&self) -> &ColumnarReader { + /// Returns the underlying `ColumnarReader`. + pub fn columnar(&self) -> &ColumnarReader { self.columnar.as_ref() } diff --git a/src/index/index.rs b/src/index/index.rs index 5495ddced..462c0b8fb 100644 --- a/src/index/index.rs +++ b/src/index/index.rs @@ -3,11 +3,12 @@ use std::fmt; #[cfg(feature = "mmap")] use std::path::Path; use std::path::PathBuf; +use std::sync::Arc; use std::thread::available_parallelism; use super::segment::Segment; use super::segment_reader::merge_field_meta_data; -use super::{FieldMetadata, IndexSettings}; +use super::{FieldMetadata, IndexSettings, TantivySegmentReader}; use crate::core::{Executor, META_FILEPATH}; use crate::directory::error::OpenReadError; #[cfg(feature = "mmap")] @@ -24,7 +25,6 @@ use crate::reader::{IndexReader, IndexReaderBuilder}; use crate::schema::document::Document; use crate::schema::{Field, FieldType, Schema}; use crate::tokenizer::{TextAnalyzer, TokenizerManager}; -use crate::SegmentReader; fn load_metas( directory: &dyn Directory, @@ -244,9 +244,12 @@ impl IndexBuilder { /// Creates a new index given an implementation of the trait `Directory`. /// /// If a directory previously existed, it will be erased. 
- fn create>>(self, dir: T) -> crate::Result { + pub fn create>>(self, dir: T) -> crate::Result { + self.create_avoid_monomorphization(dir.into()) + } + + fn create_avoid_monomorphization(self, dir: Box) -> crate::Result { self.validate()?; - let dir = dir.into(); let directory = ManagedDirectory::wrap(dir)?; save_new_metas( self.get_expect_schema()?, @@ -255,7 +258,7 @@ impl IndexBuilder { )?; let mut metas = IndexMeta::with_schema(self.get_expect_schema()?); metas.index_settings = self.index_settings; - let mut index = Index::open_from_metas(directory, &metas, SegmentMetaInventory::default()); + let mut index = Index::open_from_metas(directory, &metas, SegmentMetaInventory::default())?; index.set_tokenizers(self.tokenizer_manager); index.set_fast_field_tokenizers(self.fast_field_tokenizer_manager); Ok(index) @@ -381,9 +384,9 @@ impl Index { directory: ManagedDirectory, metas: &IndexMeta, inventory: SegmentMetaInventory, - ) -> Index { + ) -> crate::Result { let schema = metas.schema.clone(); - Index { + Ok(Index { settings: metas.index_settings.clone(), directory, schema, @@ -391,7 +394,7 @@ impl Index { fast_field_tokenizers: TokenizerManager::default(), executor: Executor::single_thread(), inventory, - } + }) } /// Setter for the tokenizer manager. 
@@ -492,7 +495,16 @@ impl Index { let segments = self.searchable_segments()?; let fields_metadata: Vec> = segments .into_iter() - .map(|segment| SegmentReader::open(&segment)?.fields_metadata()) + .map(|segment| { + let reader = TantivySegmentReader::open_with_custom_alive_set_from_directory( + segment.index().directory(), + segment.meta(), + segment.schema(), + None, + )?; + let reader: Arc = Arc::new(reader); + reader.fields_metadata() + }) .collect::>()?; Ok(merge_field_meta_data(fields_metadata)) } @@ -512,8 +524,7 @@ impl Index { let directory = ManagedDirectory::wrap(directory)?; let inventory = SegmentMetaInventory::default(); let metas = load_metas(&directory, &inventory)?; - let index = Index::open_from_metas(directory, &metas, inventory); - Ok(index) + Index::open_from_metas(directory, &metas, inventory) } /// Reads the index meta file from the directory. diff --git a/src/index/index_meta.rs b/src/index/index_meta.rs index 8c7983116..bb4d79064 100644 --- a/src/index/index_meta.rs +++ b/src/index/index_meta.rs @@ -287,7 +287,6 @@ pub struct IndexMeta { #[serde(skip_serializing_if = "Option::is_none")] pub payload: Option, } - #[derive(Deserialize, Debug)] struct UntrackedIndexMeta { pub segments: Vec, @@ -379,13 +378,36 @@ mod tests { opstamp: 0u64, payload: None, }; - let json = serde_json::ser::to_string(&index_metas).expect("serialization failed"); + let json_value: serde_json::Value = + serde_json::to_value(&index_metas).expect("serialization failed"); assert_eq!( - json, - r#"{"index_settings":{"docstore_compression":"none","docstore_blocksize":16384},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"# + &json_value, + &serde_json::json!( + { + "index_settings": { + "docstore_compression": "none", + "docstore_blocksize": 16384 + }, + "segments": [], + "schema": [ + { + "name": "text", + "type": "text", + "options": { + 
"indexing": { + "record": "position", + "fieldnorms": true, + "tokenizer": "default" + }, + "stored": false, + "fast": false + } + } + ], + "opstamp": 0 + }) ); - - let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap(); + let deser_meta: UntrackedIndexMeta = serde_json::from_value(json_value).unwrap(); assert_eq!(index_metas.index_settings, deser_meta.index_settings); assert_eq!(index_metas.schema, deser_meta.schema); assert_eq!(index_metas.opstamp, deser_meta.opstamp); @@ -412,13 +434,37 @@ mod tests { opstamp: 0u64, payload: None, }; - let json = serde_json::ser::to_string(&index_metas).expect("serialization failed"); + let json_value = serde_json::to_value(&index_metas).expect("serialization failed"); assert_eq!( - json, - r#"{"index_settings":{"docstore_compression":"zstd(compression_level=4)","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"# + &json_value, + &serde_json::json!( + { + "index_settings": { + "docstore_compression": "zstd(compression_level=4)", + "docstore_blocksize": 1000000 + }, + "segments": [], + "schema": [ + { + "name": "text", + "type": "text", + "options": { + "indexing": { + "record": "position", + "fieldnorms": true, + "tokenizer": "default" + }, + "stored": false, + "fast": false + } + } + ], + "opstamp": 0 + } + ) ); - let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap(); + let deser_meta: UntrackedIndexMeta = serde_json::from_value(json_value).unwrap(); assert_eq!(index_metas.index_settings, deser_meta.index_settings); assert_eq!(index_metas.schema, deser_meta.schema); assert_eq!(index_metas.opstamp, deser_meta.opstamp); diff --git a/src/index/inverted_index_reader.rs b/src/index/inverted_index_reader.rs index 7314f8741..8a9e38f48 100644 --- a/src/index/inverted_index_reader.rs +++ b/src/index/inverted_index_reader.rs @@ -1,7 +1,12 @@ 
+use std::any::Any; +#[cfg(feature = "quickwit")] +use std::future::Future; use std::io; +#[cfg(feature = "quickwit")] +use std::pin::Pin; use common::json_path_writer::JSON_END_OF_PATH; -use common::{BinarySerializable, ByteCount}; +use common::{BinarySerializable, BitSet, ByteCount, OwnedBytes}; #[cfg(feature = "quickwit")] use futures_util::{FutureExt, StreamExt, TryStreamExt}; #[cfg(feature = "quickwit")] @@ -10,37 +15,262 @@ use itertools::Itertools; use tantivy_fst::automaton::{AlwaysMatch, Automaton}; use crate::directory::FileSlice; -use crate::positions::PositionReader; -use crate::postings::{BlockSegmentPostings, SegmentPostings, TermInfo}; +use crate::docset::DocSet; +use crate::postings::{ + load_postings_from_raw_data, Postings, RawPostingsData, SegmentPostings, TermInfo, +}; use crate::schema::{IndexRecordOption, Term, Type}; use crate::termdict::TermDictionary; +#[cfg(feature = "quickwit")] +pub type TermRangeBounds = (std::ops::Bound, std::ops::Bound); + +/// Trait defining the contract for a dynamically dispatched inverted index reader. +pub trait DynInvertedIndexReader: Send + Sync { + /// Downcasts to the concrete reader type when possible. + fn as_any(&self) -> &dyn Any; + + /// Returns the term info associated with the term. + fn get_term_info(&self, term: &Term) -> io::Result> { + self.terms().get(term.serialized_value_bytes()) + } + + /// Return the term dictionary datastructure. + fn terms(&self) -> &TermDictionary; + + /// Return the fields and types encoded in the dictionary in lexicographic order. + /// Only valid on JSON fields. + /// + /// Notice: This requires a full scan and therefore **very expensive**. + fn list_encoded_json_fields(&self) -> io::Result>; + + /// Returns the raw postings bytes and metadata for a term. + fn read_raw_postings_data( + &self, + term_info: &TermInfo, + option: IndexRecordOption, + ) -> io::Result; + + /// Returns the total number of tokens recorded for all documents + /// (including deleted documents). 
+ fn total_num_tokens(&self) -> u64; + + /// Returns the segment postings associated with the term, and with the given option, + /// or `None` if the term has never been encountered and indexed. + fn read_postings( + &self, + term: &Term, + option: IndexRecordOption, + ) -> io::Result>> { + self.get_term_info(term)? + .map(move |term_info| self.read_postings_from_terminfo(&term_info, option)) + .transpose() + } + + /// Returns the postings for a given `TermInfo`. + /// + /// The default implementation decodes via [`read_raw_postings_data`]. Custom readers + /// that cannot produce valid raw postings bytes (e.g. merged/union posting sources) + /// should override this method. + fn read_postings_from_terminfo( + &self, + term_info: &TermInfo, + option: IndexRecordOption, + ) -> io::Result> { + let postings_data = self.read_raw_postings_data(term_info, option)?; + let postings = load_postings_from_raw_data(term_info.doc_freq, postings_data)?; + Ok(Box::new(postings)) + } + + /// Returns the number of documents containing the term. + fn doc_freq(&self, term: &Term) -> io::Result; + + /// Returns the number of documents containing the term asynchronously. + #[cfg(feature = "quickwit")] + fn doc_freq_async<'a>( + &'a self, + term: &'a Term, + ) -> Pin> + Send + 'a>>; + + /// Warmup fieldnorm readers for this inverted index field. + #[cfg(feature = "quickwit")] + fn warm_fieldnorms_readers<'a>( + &'a self, + ) -> Pin> + Send + 'a>>; + + /// Warmup the block postings for all terms. + /// + /// Default implementation is a no-op. + #[cfg(feature = "quickwit")] + fn warm_postings_full<'a>( + &'a self, + _with_positions: bool, + ) -> Pin> + Send + 'a>> { + Box::pin(async { Ok(()) }) + } + + /// Warmup a block postings given a `Term`. + /// + /// Returns whether the term was found in the dictionary. 
+ #[cfg(feature = "quickwit")] + fn warm_postings<'a>( + &'a self, + term: &'a Term, + with_positions: bool, + ) -> Pin> + Send + 'a>>; + + /// Warmup block postings for terms in a range. + /// + /// Returns whether at least one matching term was found. + #[cfg(feature = "quickwit")] + fn warm_postings_range<'a>( + &'a self, + terms: TermRangeBounds, + limit: Option, + with_positions: bool, + ) -> Pin> + Send + 'a>>; + + /// Warmup block postings for terms matching an automaton. + /// + /// Returns whether at least one matching term was found. + #[cfg(feature = "quickwit")] + fn warm_postings_automaton<'a, A: Automaton + Clone + Send + Sync + 'static>( + &'a self, + automaton: A, + ) -> Pin> + Send + 'a>> + where + A::State: Clone + Send, + Self: Sized; +} + +/// Trait defining the contract for a typed inverted index reader. +pub trait InvertedIndexReader: Send + Sync { + /// The concrete postings type returned by this reader. + type Postings: Postings; + + /// A lighter doc-id-only iterator returned when frequencies and positions are not needed. + type DocSet: DocSet; + + /// Returns a posting object given a `term_info`. + fn read_postings_from_terminfo( + &self, + term_info: &TermInfo, + option: IndexRecordOption, + ) -> io::Result; + + /// Returns a doc-id-only iterator for the given term. + /// + /// Always reads with `IndexRecordOption::Basic` — no frequency decoding, + /// no position reader. + fn read_docset_from_terminfo(&self, term_info: &TermInfo) -> io::Result; + + /// Fills a bitset with the doc ids for the given term. 
+ fn fill_bitset_from_terminfo( + &self, + term_info: &TermInfo, + doc_bitset: &mut BitSet, + ) -> io::Result<()> { + let mut docset = self.read_docset_from_terminfo(term_info)?; + docset.fill_bitset(doc_bitset); + Ok(()) + } +} + +impl InvertedIndexReader for dyn DynInvertedIndexReader + '_ { + type Postings = Box; + type DocSet = Box; + + fn read_postings_from_terminfo( + &self, + term_info: &TermInfo, + option: IndexRecordOption, + ) -> io::Result { + DynInvertedIndexReader::read_postings_from_terminfo(self, term_info, option) + } + + fn read_docset_from_terminfo(&self, term_info: &TermInfo) -> io::Result { + DynInvertedIndexReader::read_postings_from_terminfo( + self, + term_info, + IndexRecordOption::Basic, + ) + } +} + +/// Handler interface used by [`try_downcast_and_call`] to build query objects. +pub trait TypedInvertedIndexReaderCb { + /// Invokes the handler with either Tantivy's built-in typed reader or the dynamic fallback. + fn call(&mut self, reader: &I) -> R; +} + +/// Tries Tantivy's built-in reader downcast before falling back to the dynamic reader path. 
+pub fn try_downcast_and_call(reader: &dyn DynInvertedIndexReader, handler: &mut C) -> R +where C: TypedInvertedIndexReaderCb { + if let Some(reader) = reader.as_any().downcast_ref::() { + return handler.call(reader); + } + handler.call(reader) +} + +struct LoadPostingsFromTermInfo<'a> { + term_info: &'a TermInfo, + option: IndexRecordOption, +} + +impl TypedInvertedIndexReaderCb>> for LoadPostingsFromTermInfo<'_> { + fn call( + &mut self, + reader: &I, + ) -> io::Result> { + let postings = reader.read_postings_from_terminfo(self.term_info, self.option)?; + Ok(Box::new(postings)) + } +} + +pub(crate) fn load_postings_from_terminfo( + reader: &dyn DynInvertedIndexReader, + term_info: &TermInfo, + option: IndexRecordOption, +) -> io::Result> { + let mut postings_loader = LoadPostingsFromTermInfo { term_info, option }; + try_downcast_and_call(reader, &mut postings_loader) +} + +/// Tantivy's default inverted index reader implementation. +/// /// The inverted index reader is in charge of accessing /// the inverted index associated with a specific field. /// /// # Note /// /// It is safe to delete the segment associated with -/// an `InvertedIndexReader`. As long as it is open, +/// an `InvertedIndexReader` implementation. As long as it is open, /// the [`FileSlice`] it is relying on should /// stay available. /// -/// `InvertedIndexReader` are created by calling +/// `TantivyInvertedIndexReader` instances are created by calling /// [`SegmentReader::inverted_index()`](crate::SegmentReader::inverted_index). -pub struct InvertedIndexReader { +pub struct TantivyInvertedIndexReader { termdict: TermDictionary, postings_file_slice: FileSlice, positions_file_slice: FileSlice, + #[cfg_attr(not(feature = "quickwit"), allow(dead_code))] + fieldnorms_file_slice: FileSlice, record_option: IndexRecordOption, total_num_tokens: u64, } /// Object that records the amount of space used by a field in an inverted index. 
-pub(crate) struct InvertedIndexFieldSpace { +pub struct InvertedIndexFieldSpace { + /// Field name as encoded in the term dictionary. pub field_name: String, + /// Value type for the encoded field. pub field_type: Type, + /// Total bytes used by postings for this field. pub postings_size: ByteCount, + /// Total bytes used by positions for this field. pub positions_size: ByteCount, + /// Number of terms in the field. pub num_terms: u64, } @@ -62,52 +292,81 @@ impl InvertedIndexFieldSpace { } } -impl InvertedIndexReader { - pub(crate) fn new( +impl TantivyInvertedIndexReader { + pub(crate) fn read_raw_postings_data_inner( + &self, + term_info: &TermInfo, + option: IndexRecordOption, + ) -> io::Result { + let effective_option = option.downgrade(self.record_option); + let postings_data = self + .postings_file_slice + .slice(term_info.postings_range.clone()) + .read_bytes()?; + let positions_data: Option = if effective_option.has_positions() { + let positions_data = self + .positions_file_slice + .slice(term_info.positions_range.clone()) + .read_bytes()?; + Some(positions_data) + } else { + None + }; + Ok(RawPostingsData { + postings_data, + positions_data, + record_option: self.record_option, + effective_option, + }) + } + + /// Opens an inverted index reader from already-loaded term/postings/positions slices. + /// + /// The first 8 bytes of `postings_file_slice` are expected to contain + /// the serialized total token count. 
+ pub fn new( termdict: TermDictionary, postings_file_slice: FileSlice, positions_file_slice: FileSlice, + fieldnorms_file_slice: FileSlice, record_option: IndexRecordOption, - ) -> io::Result { + ) -> io::Result { let (total_num_tokens_slice, postings_body) = postings_file_slice.split(8); let total_num_tokens = u64::deserialize(&mut total_num_tokens_slice.read_bytes()?)?; - Ok(InvertedIndexReader { + Ok(TantivyInvertedIndexReader { termdict, postings_file_slice: postings_body, positions_file_slice, + fieldnorms_file_slice, record_option, total_num_tokens, }) } - /// Creates an empty `InvertedIndexReader` object, which + /// Creates an empty `TantivyInvertedIndexReader` object, which /// contains no terms at all. - pub fn empty(record_option: IndexRecordOption) -> InvertedIndexReader { - InvertedIndexReader { + pub fn empty(record_option: IndexRecordOption) -> TantivyInvertedIndexReader { + TantivyInvertedIndexReader { termdict: TermDictionary::empty(), postings_file_slice: FileSlice::empty(), positions_file_slice: FileSlice::empty(), + fieldnorms_file_slice: FileSlice::empty(), record_option, total_num_tokens: 0u64, } } +} - /// Returns the term info associated with the term. - pub fn get_term_info(&self, term: &Term) -> io::Result> { - self.termdict.get(term.serialized_value_bytes()) +impl DynInvertedIndexReader for TantivyInvertedIndexReader { + fn as_any(&self) -> &dyn Any { + self } - /// Return the term dictionary datastructure. - pub fn terms(&self) -> &TermDictionary { + fn terms(&self) -> &TermDictionary { &self.termdict } - /// Return the fields and types encoded in the dictionary in lexicographic order. - /// Only valid on JSON fields. - /// - /// Notice: This requires a full scan and therefore **very expensive**. - /// TODO: Move to sstable to use the index. 
- pub(crate) fn list_encoded_json_fields(&self) -> io::Result> { + fn list_encoded_json_fields(&self) -> io::Result> { let mut stream = self.termdict.stream()?; let mut fields: Vec = Vec::new(); @@ -160,136 +419,353 @@ impl InvertedIndexReader { Ok(fields) } - /// Resets the block segment to another position of the postings - /// file. - /// - /// This is useful for enumerating through a list of terms, - /// and consuming the associated posting lists while avoiding - /// reallocating a [`BlockSegmentPostings`]. - /// - /// # Warning - /// - /// This does not reset the positions list. - pub fn reset_block_postings_from_terminfo( - &self, - term_info: &TermInfo, - block_postings: &mut BlockSegmentPostings, - ) -> io::Result<()> { - let postings_slice = self - .postings_file_slice - .slice(term_info.postings_range.clone()); - let postings_bytes = postings_slice.read_bytes()?; - block_postings.reset(term_info.doc_freq, postings_bytes)?; - Ok(()) - } - - /// Returns a block postings given a `Term`. - /// This method is for an advanced usage only. - /// - /// Most users should prefer using [`Self::read_postings()`] instead. - pub fn read_block_postings( - &self, - term: &Term, - option: IndexRecordOption, - ) -> io::Result> { - self.get_term_info(term)? - .map(move |term_info| self.read_block_postings_from_terminfo(&term_info, option)) - .transpose() - } - - /// Returns a block postings given a `term_info`. - /// This method is for an advanced usage only. - /// - /// Most users should prefer using [`Self::read_postings()`] instead. - pub fn read_block_postings_from_terminfo( - &self, - term_info: &TermInfo, - requested_option: IndexRecordOption, - ) -> io::Result { - let postings_data = self - .postings_file_slice - .slice(term_info.postings_range.clone()); - BlockSegmentPostings::open( - term_info.doc_freq, - postings_data, - self.record_option, - requested_option, - ) - } - - /// Returns a posting object given a `term_info`. 
- /// This method is for an advanced usage only. - /// - /// Most users should prefer using [`Self::read_postings()`] instead. - pub fn read_postings_from_terminfo( + fn read_raw_postings_data( &self, term_info: &TermInfo, option: IndexRecordOption, - ) -> io::Result { - let option = option.downgrade(self.record_option); - - let block_postings = self.read_block_postings_from_terminfo(term_info, option)?; - let position_reader = { - if option.has_positions() { - let positions_data = self - .positions_file_slice - .read_bytes_slice(term_info.positions_range.clone())?; - let position_reader = PositionReader::open(positions_data)?; - Some(position_reader) - } else { - None - } - }; - Ok(SegmentPostings::from_block_postings( - block_postings, - position_reader, - )) + ) -> io::Result { + self.read_raw_postings_data_inner(term_info, option) } - /// Returns the total number of tokens recorded for all documents - /// (including deleted documents). - pub fn total_num_tokens(&self) -> u64 { + fn total_num_tokens(&self) -> u64 { self.total_num_tokens } - /// Returns the segment postings associated with the term, and with the given option, - /// or `None` if the term has never been encountered and indexed. - /// - /// If the field was not indexed with the indexing options that cover - /// the requested options, the returned [`SegmentPostings`] the method does not fail - /// and returns a `SegmentPostings` with as much information as possible. - /// - /// For instance, requesting [`IndexRecordOption::WithFreqs`] for a - /// [`TextOptions`](crate::schema::TextOptions) that does not index position - /// will return a [`SegmentPostings`] with `DocId`s and frequencies. - pub fn read_postings( - &self, - term: &Term, - option: IndexRecordOption, - ) -> io::Result> { - self.get_term_info(term)? - .map(move |term_info| self.read_postings_from_terminfo(&term_info, option)) - .transpose() - } - - /// Returns the number of documents containing the term. 
- pub fn doc_freq(&self, term: &Term) -> io::Result { + fn doc_freq(&self, term: &Term) -> io::Result { Ok(self .get_term_info(term)? .map(|term_info| term_info.doc_freq) .unwrap_or(0u32)) } + + #[cfg(feature = "quickwit")] + fn doc_freq_async<'a>( + &'a self, + term: &'a Term, + ) -> Pin> + Send + 'a>> { + Box::pin(async move { + Ok(self + .get_term_info_async(term) + .await? + .map(|term_info| term_info.doc_freq) + .unwrap_or(0u32)) + }) + } + + #[cfg(feature = "quickwit")] + fn warm_fieldnorms_readers<'a>( + &'a self, + ) -> Pin> + Send + 'a>> { + Box::pin(async move { + self.fieldnorms_file_slice.read_bytes_async().await?; + Ok(()) + }) + } + + #[cfg(feature = "quickwit")] + fn warm_postings_full<'a>( + &'a self, + with_positions: bool, + ) -> Pin> + Send + 'a>> { + Box::pin(async move { + self.postings_file_slice.read_bytes_async().await?; + if with_positions { + self.positions_file_slice.read_bytes_async().await?; + } + Ok(()) + }) + } + + #[cfg(feature = "quickwit")] + fn warm_postings<'a>( + &'a self, + term: &'a Term, + with_positions: bool, + ) -> Pin> + Send + 'a>> { + Box::pin(async move { + let term_info_opt: Option = self.get_term_info_async(term).await?; + if let Some(term_info) = term_info_opt { + let postings = self + .postings_file_slice + .read_bytes_slice_async(term_info.postings_range.clone()); + if with_positions { + let positions = self + .positions_file_slice + .read_bytes_slice_async(term_info.positions_range.clone()); + futures_util::future::try_join(postings, positions).await?; + } else { + postings.await?; + } + Ok(true) + } else { + Ok(false) + } + }) + } + + #[cfg(feature = "quickwit")] + fn warm_postings_range<'a>( + &'a self, + terms: TermRangeBounds, + limit: Option, + with_positions: bool, + ) -> Pin> + Send + 'a>> { + Box::pin(async move { + let mut term_info = self + .get_term_range_async(terms, AlwaysMatch, limit, 0) + .await?; + + let Some(first_terminfo) = term_info.next() else { + // no key matches, nothing more to load + 
return Ok(false); + }; + + let last_terminfo = term_info.last().unwrap_or_else(|| first_terminfo.clone()); + + let postings_range = + first_terminfo.postings_range.start..last_terminfo.postings_range.end; + let positions_range = + first_terminfo.positions_range.start..last_terminfo.positions_range.end; + + let postings = self + .postings_file_slice + .read_bytes_slice_async(postings_range); + if with_positions { + let positions = self + .positions_file_slice + .read_bytes_slice_async(positions_range); + futures_util::future::try_join(postings, positions).await?; + } else { + postings.await?; + } + Ok(true) + }) + } + + #[cfg(feature = "quickwit")] + fn warm_postings_automaton<'a, A: Automaton + Clone + Send + Sync + 'static>( + &'a self, + automaton: A, + ) -> Pin> + Send + 'a>> + where + A::State: Clone + Send, + Self: Sized, + { + Box::pin(async move { + // merge holes under 4MiB, that's how many bytes we can hope to receive during a TTFB + // from S3 (~80MiB/s, and 50ms latency) + const MERGE_HOLES_UNDER_BYTES: usize = (80 * 1024 * 1024 * 50) / 1000; + // Trigger async prefetch of relevant termdict blocks. + let _term_info_iter = self + .get_term_range_async( + (std::ops::Bound::Unbounded, std::ops::Bound::Unbounded), + automaton.clone(), + None, + MERGE_HOLES_UNDER_BYTES, + ) + .await?; + drop(_term_info_iter); + + // Build a 2nd stream without merged holes so we only scan matching blocks. + // This assumes the storage layer caches data fetched by the first pass. 
+ let mut stream = self.termdict.search(automaton).into_stream()?; + let posting_ranges_iter = + std::iter::from_fn(move || stream.next().map(|(_k, v)| v.postings_range.clone())); + let merged_posting_ranges: Vec> = posting_ranges_iter + .coalesce(|range1, range2| { + if range1.end + MERGE_HOLES_UNDER_BYTES >= range2.start { + Ok(range1.start..range2.end) + } else { + Err((range1, range2)) + } + }) + .collect(); + + if merged_posting_ranges.is_empty() { + return Ok(false); + } + + let slices_downloaded = futures_util::stream::iter(merged_posting_ranges.into_iter()) + .map(|posting_slice| { + self.postings_file_slice + .read_bytes_slice_async(posting_slice) + .map(|result| result.map(|_slice| ())) + }) + .buffer_unordered(5) + .try_collect::>() + .await?; + + Ok(!slices_downloaded.is_empty()) + }) + } +} + +impl InvertedIndexReader for TantivyInvertedIndexReader { + type Postings = SegmentPostings; + type DocSet = SegmentPostings; + + #[inline] + fn read_postings_from_terminfo( + &self, + term_info: &TermInfo, + option: IndexRecordOption, + ) -> io::Result { + let postings_data = self.read_raw_postings_data_inner(term_info, option)?; + load_postings_from_raw_data(term_info.doc_freq, postings_data) + } + + #[inline] + fn read_docset_from_terminfo(&self, term_info: &TermInfo) -> io::Result { + let postings_data = + self.read_raw_postings_data_inner(term_info, IndexRecordOption::Basic)?; + load_postings_from_raw_data(term_info.doc_freq, postings_data) + } +} + +#[cfg(test)] +mod tests { + use std::any::TypeId; + + use super::*; + + #[derive(Default)] + struct RecordDispatch { + used_concrete_reader: bool, + used_dynamic_fallback: bool, + } + + impl TypedInvertedIndexReaderCb<()> for RecordDispatch { + fn call(&mut self, _reader: &I) { + let postings_type = TypeId::of::(); + if postings_type == TypeId::of::() { + self.used_concrete_reader = true; + } else if postings_type == TypeId::of::>() { + self.used_dynamic_fallback = true; + } else { + panic!("unexpected postings 
type in downcast helper test"); + } + } + } + + struct OnlyDynReader { + termdict: TermDictionary, + } + + impl Default for OnlyDynReader { + fn default() -> Self { + Self { + termdict: TermDictionary::empty(), + } + } + } + + impl DynInvertedIndexReader for OnlyDynReader { + fn as_any(&self) -> &dyn Any { + self + } + + fn terms(&self) -> &TermDictionary { + &self.termdict + } + + fn list_encoded_json_fields(&self) -> io::Result> { + Ok(Vec::new()) + } + + fn read_raw_postings_data( + &self, + _term_info: &TermInfo, + _option: IndexRecordOption, + ) -> io::Result { + unreachable!("not used in downcast helper tests") + } + + fn total_num_tokens(&self) -> u64 { + 0 + } + + fn doc_freq(&self, _term: &Term) -> io::Result { + Ok(0) + } + + #[cfg(feature = "quickwit")] + fn doc_freq_async<'a>( + &'a self, + _term: &'a Term, + ) -> Pin> + Send + 'a>> { + Box::pin(async { Ok(0) }) + } + + #[cfg(feature = "quickwit")] + fn warm_fieldnorms_readers<'a>( + &'a self, + ) -> Pin> + Send + 'a>> { + Box::pin(async { Ok(()) }) + } + + #[cfg(feature = "quickwit")] + fn warm_postings<'a>( + &'a self, + _term: &'a Term, + _with_positions: bool, + ) -> Pin> + Send + 'a>> { + Box::pin(async { Ok(false) }) + } + + #[cfg(feature = "quickwit")] + fn warm_postings_range<'a>( + &'a self, + _terms: TermRangeBounds, + _limit: Option, + _with_positions: bool, + ) -> Pin> + Send + 'a>> { + Box::pin(async { Ok(false) }) + } + + #[cfg(feature = "quickwit")] + fn warm_postings_automaton<'a, A: Automaton + Clone + Send + Sync + 'static>( + &'a self, + _automaton: A, + ) -> Pin> + Send + 'a>> + where + A::State: Clone + Send, + { + Box::pin(async { Ok(false) }) + } + } + + #[test] + fn try_downcast_and_call_uses_tantivy_reader() { + let reader = TantivyInvertedIndexReader::empty(IndexRecordOption::Basic); + let mut dispatch_recorder = RecordDispatch::default(); + + try_downcast_and_call(&reader, &mut dispatch_recorder); + + assert!(dispatch_recorder.used_concrete_reader); + 
assert!(!dispatch_recorder.used_dynamic_fallback); + } + + #[test] + fn try_downcast_and_call_uses_dynamic_fallback_for_other_readers() { + let reader = OnlyDynReader::default(); + let mut dispatch_recorder = RecordDispatch::default(); + + try_downcast_and_call(&reader, &mut dispatch_recorder); + + assert!(!dispatch_recorder.used_concrete_reader); + assert!(dispatch_recorder.used_dynamic_fallback); + } } #[cfg(feature = "quickwit")] -impl InvertedIndexReader { +impl TantivyInvertedIndexReader { pub(crate) async fn get_term_info_async(&self, term: &Term) -> io::Result> { self.termdict.get_async(term.serialized_value_bytes()).await } async fn get_term_range_async<'a, A: Automaton + 'a>( &'a self, - terms: impl std::ops::RangeBounds, + terms: TermRangeBounds, automaton: A, limit: Option, merge_holes_under_bytes: usize, @@ -297,17 +773,17 @@ impl InvertedIndexReader { where A::State: Clone, { - use std::ops::Bound; let range_builder = self.termdict.search(automaton); - let range_builder = match terms.start_bound() { - Bound::Included(bound) => range_builder.ge(bound.serialized_value_bytes()), - Bound::Excluded(bound) => range_builder.gt(bound.serialized_value_bytes()), - Bound::Unbounded => range_builder, + let (start_bound, end_bound) = terms; + let range_builder = match start_bound { + std::ops::Bound::Included(bound) => range_builder.ge(bound.serialized_value_bytes()), + std::ops::Bound::Excluded(bound) => range_builder.gt(bound.serialized_value_bytes()), + std::ops::Bound::Unbounded => range_builder, }; - let range_builder = match terms.end_bound() { - Bound::Included(bound) => range_builder.le(bound.serialized_value_bytes()), - Bound::Excluded(bound) => range_builder.lt(bound.serialized_value_bytes()), - Bound::Unbounded => range_builder, + let range_builder = match end_bound { + std::ops::Bound::Included(bound) => range_builder.le(bound.serialized_value_bytes()), + std::ops::Bound::Excluded(bound) => range_builder.lt(bound.serialized_value_bytes()), + 
std::ops::Bound::Unbounded => range_builder, }; let range_builder = if let Some(limit) = limit { range_builder.limit(limit) @@ -328,167 +804,4 @@ impl InvertedIndexReader { Ok(iter) } - - /// Warmup a block postings given a `Term`. - /// This method is for an advanced usage only. - /// - /// returns a boolean, whether the term was found in the dictionary - pub async fn warm_postings(&self, term: &Term, with_positions: bool) -> io::Result { - let term_info_opt: Option = self.get_term_info_async(term).await?; - if let Some(term_info) = term_info_opt { - let postings = self - .postings_file_slice - .read_bytes_slice_async(term_info.postings_range.clone()); - if with_positions { - let positions = self - .positions_file_slice - .read_bytes_slice_async(term_info.positions_range.clone()); - futures_util::future::try_join(postings, positions).await?; - } else { - postings.await?; - } - Ok(true) - } else { - Ok(false) - } - } - - /// Warmup a block postings given a range of `Term`s. - /// This method is for an advanced usage only. 
- /// - /// returns a boolean, whether a term matching the range was found in the dictionary - pub async fn warm_postings_range( - &self, - terms: impl std::ops::RangeBounds, - limit: Option, - with_positions: bool, - ) -> io::Result { - let mut term_info = self - .get_term_range_async(terms, AlwaysMatch, limit, 0) - .await?; - - let Some(first_terminfo) = term_info.next() else { - // no key matches, nothing more to load - return Ok(false); - }; - - let last_terminfo = term_info.last().unwrap_or_else(|| first_terminfo.clone()); - - let postings_range = first_terminfo.postings_range.start..last_terminfo.postings_range.end; - let positions_range = - first_terminfo.positions_range.start..last_terminfo.positions_range.end; - - let postings = self - .postings_file_slice - .read_bytes_slice_async(postings_range); - if with_positions { - let positions = self - .positions_file_slice - .read_bytes_slice_async(positions_range); - futures_util::future::try_join(postings, positions).await?; - } else { - postings.await?; - } - Ok(true) - } - - /// Warmup a block postings given a range of `Term`s. - /// This method is for an advanced usage only. - /// - /// returns a boolean, whether a term matching the range was found in the dictionary - pub async fn warm_postings_automaton< - A: Automaton + Clone + Send + 'static, - E: FnOnce(Box io::Result<()> + Send>) -> F, - F: std::future::Future>, - >( - &self, - automaton: A, - // with_positions: bool, at the moment we have no use for it, and supporting it would add - // complexity to the coalesce - executor: E, - ) -> io::Result - where - A::State: Clone, - { - // merge holes under 4MiB, that's how many bytes we can hope to receive during a TTFB from - // S3 (~80MiB/s, and 50ms latency) - const MERGE_HOLES_UNDER_BYTES: usize = (80 * 1024 * 1024 * 50) / 1000; - // we build a first iterator to download everything. Simply calling the function already - // download everything we need from the sstable, but doesn't start iterating over it. 
- let _term_info_iter = self - .get_term_range_async(.., automaton.clone(), None, MERGE_HOLES_UNDER_BYTES) - .await?; - - let (sender, posting_ranges_to_load_stream) = futures_channel::mpsc::unbounded(); - let termdict = self.termdict.clone(); - let cpu_bound_task = move || { - // then we build a 2nd iterator, this one with no holes, so we don't go through blocks - // we can't match. - // This makes the assumption there is a caching layer below us, which gives sync read - // for free after the initial async access. This might not always be true, but is in - // Quickwit. - // We build things from this closure otherwise we get into lifetime issues that can only - // be solved with self referential strucs. Returning an io::Result from here is a bit - // more leaky abstraction-wise, but a lot better than the alternative - let mut stream = termdict.search(automaton).into_stream()?; - - // we could do without an iterator, but this allows us access to coalesce which simplify - // things - let posting_ranges_iter = - std::iter::from_fn(move || stream.next().map(|(_k, v)| v.postings_range.clone())); - - let merged_posting_ranges_iter = posting_ranges_iter.coalesce(|range1, range2| { - if range1.end + MERGE_HOLES_UNDER_BYTES >= range2.start { - Ok(range1.start..range2.end) - } else { - Err((range1, range2)) - } - }); - - for posting_range in merged_posting_ranges_iter { - if let Err(_) = sender.unbounded_send(posting_range) { - // this should happen only when search is cancelled - return Err(io::Error::other("failed to send posting range back")); - } - } - Ok(()) - }; - let task_handle = executor(Box::new(cpu_bound_task)); - - let posting_downloader = posting_ranges_to_load_stream - .map(|posting_slice| { - self.postings_file_slice - .read_bytes_slice_async(posting_slice) - .map(|result| result.map(|_slice| ())) - }) - .buffer_unordered(5) - .try_collect::>(); - - let (_, slices_downloaded) = - futures_util::future::try_join(task_handle, posting_downloader).await?; - - 
Ok(!slices_downloaded.is_empty()) - } - - /// Warmup the block postings for all terms. - /// This method is for an advanced usage only. - /// - /// If you know which terms to pre-load, prefer using [`Self::warm_postings`] or - /// [`Self::warm_postings`] instead. - pub async fn warm_postings_full(&self, with_positions: bool) -> io::Result<()> { - self.postings_file_slice.read_bytes_async().await?; - if with_positions { - self.positions_file_slice.read_bytes_async().await?; - } - Ok(()) - } - - /// Returns the number of documents containing the term asynchronously. - pub async fn doc_freq_async(&self, term: &Term) -> io::Result { - Ok(self - .get_term_info_async(term) - .await? - .map(|term_info| term_info.doc_freq) - .unwrap_or(0u32)) - } } diff --git a/src/index/mod.rs b/src/index/mod.rs index 76dc3ed9b..cab6307da 100644 --- a/src/index/mod.rs +++ b/src/index/mod.rs @@ -13,8 +13,12 @@ mod segment_reader; pub use self::index::{Index, IndexBuilder}; pub(crate) use self::index_meta::SegmentMetaInventory; pub use self::index_meta::{IndexMeta, IndexSettings, Order, SegmentMeta}; -pub use self::inverted_index_reader::InvertedIndexReader; +pub(crate) use self::inverted_index_reader::load_postings_from_terminfo; +pub use self::inverted_index_reader::{ + try_downcast_and_call, DynInvertedIndexReader, InvertedIndexFieldSpace, InvertedIndexReader, + TantivyInvertedIndexReader, TypedInvertedIndexReaderCb, +}; pub use self::segment::Segment; pub use self::segment_component::SegmentComponent; pub use self::segment_id::SegmentId; -pub use self::segment_reader::{FieldMetadata, SegmentReader}; +pub use self::segment_reader::{FieldMetadata, SegmentReader, TantivySegmentReader}; diff --git a/src/index/segment.rs b/src/index/segment.rs index fcd32a1ff..0815e0aec 100644 --- a/src/index/segment.rs +++ b/src/index/segment.rs @@ -16,7 +16,7 @@ pub struct Segment { } impl fmt::Debug for Segment { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt(&self, f: &mut 
fmt::Formatter) -> fmt::Result { write!(f, "Segment({:?})", self.id().uuid_string()) } } diff --git a/src/index/segment_id.rs b/src/index/segment_id.rs index e66aa95a9..46b4e646f 100644 --- a/src/index/segment_id.rs +++ b/src/index/segment_id.rs @@ -44,7 +44,7 @@ fn create_uuid() -> Uuid { } impl SegmentId { - #[doc(hidden)] + /// Generates a new random `SegmentId`. pub fn generate_random() -> SegmentId { SegmentId(create_uuid()) } diff --git a/src/index/segment_reader.rs b/src/index/segment_reader.rs index cfccc65ed..f618c53ed 100644 --- a/src/index/segment_reader.rs +++ b/src/index/segment_reader.rs @@ -6,17 +6,101 @@ use common::{ByteCount, HasLen}; use fnv::FnvHashMap; use itertools::Itertools; -use crate::directory::{CompositeFile, FileSlice}; +use crate::directory::{CompositeFile, Directory, FileSlice}; use crate::error::DataCorruption; use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FacetReader, FastFieldReaders}; use crate::fieldnorm::{FieldNormReader, FieldNormReaders}; -use crate::index::{InvertedIndexReader, Segment, SegmentComponent, SegmentId}; +use crate::index::{ + DynInvertedIndexReader, Segment, SegmentComponent, SegmentId, SegmentMeta, + TantivyInvertedIndexReader, +}; use crate::json_utils::json_path_sep_to_dot; +use crate::postings::SegmentPostings; +use crate::query::boolean_query::block_wand::{block_wand, block_wand_single_scorer}; +use crate::query::term_query::TermScorer; +use crate::query::{BufferedUnionScorer, Scorer, SumCombiner}; use crate::schema::{Field, IndexRecordOption, Schema, Type}; use crate::space_usage::SegmentSpaceUsage; -use crate::store::StoreReader; +use crate::store::{StoreReader, TantivyStoreReader}; use crate::termdict::TermDictionary; -use crate::{DocId, Opstamp}; +use crate::{DocId, DocSet as _, Opstamp, Score, TERMINATED}; + +/// Trait defining the contract for a segment reader. +pub trait SegmentReader: Send + Sync { + /// Returns the highest document id ever attributed in this segment + 1. 
+ fn max_doc(&self) -> DocId; + + /// Returns the number of alive documents. Deleted documents are not counted. + fn num_docs(&self) -> DocId; + + /// Returns the schema of the index this segment belongs to. + fn schema(&self) -> &Schema; + + /// Performs a for_each_pruning operation on the given scorer. + fn for_each_pruning( + &self, + threshold: Score, + scorer: Box, + callback: &mut dyn FnMut(DocId, Score) -> Score, + ); + + /// Return the number of documents that have been deleted in the segment. + fn num_deleted_docs(&self) -> DocId; + + /// Returns true if some of the documents of the segment have been deleted. + fn has_deletes(&self) -> bool; + + /// Accessor to a segment's fast field reader given a field. + fn fast_fields(&self) -> &FastFieldReaders; + + /// Accessor to the `FacetReader` associated with a given `Field`. + fn facet_reader(&self, field_name: &str) -> crate::Result { + let field = self.schema().get_field(field_name)?; + let field_entry = self.schema().get_field_entry(field); + if field_entry.field_type().value_type() != Type::Facet { + return Err(crate::TantivyError::SchemaError(format!( + "`{field_name}` is not a facet field.`" + ))); + } + let Some(facet_column) = self.fast_fields().str(field_name)? else { + panic!("Facet Field `{field_name}` is missing. This should not happen"); + }; + Ok(FacetReader::new(facet_column)) + } + + /// Accessor to the segment's `Field norms`'s reader. + fn get_fieldnorms_reader(&self, field: Field) -> crate::Result; + + /// Accessor to the segment's [`StoreReader`](crate::store::StoreReader). + fn get_store_reader(&self, cache_num_blocks: usize) -> io::Result>; + + /// Returns a field reader associated with the field given in argument. + fn inverted_index(&self, field: Field) -> crate::Result>; + + /// Returns the list of fields that have been indexed in the segment. + fn fields_metadata(&self) -> crate::Result>; + + /// Returns the segment id. 
+ fn segment_id(&self) -> SegmentId; + + /// Returns the delete opstamp. + fn delete_opstamp(&self) -> Option; + + /// Returns the bitset representing the alive `DocId`s. + fn alive_bitset(&self) -> Option<&AliveBitSet>; + + /// Returns true if the `doc` is marked as deleted. + fn is_deleted(&self, doc: DocId) -> bool; + + /// Returns an iterator that will iterate over the alive document ids. + fn doc_ids_alive(&self) -> Box + Send + '_>; + + /// Summarize total space usage of this segment. + fn space_usage(&self) -> io::Result; + + /// Clones this reader into a shared trait object. + fn clone_arc(&self) -> Arc; +} /// Entry point to access all of the datastructures of the `Segment` /// @@ -29,8 +113,8 @@ use crate::{DocId, Opstamp}; /// The segment reader has a very low memory footprint, /// as close to all of the memory data is mmapped. #[derive(Clone)] -pub struct SegmentReader { - inv_idx_reader_cache: Arc>>>, +pub struct TantivySegmentReader { + inv_idx_reader_cache: Arc>>>, segment_id: SegmentId, delete_opstamp: Option, @@ -49,73 +133,157 @@ pub struct SegmentReader { schema: Schema, } -impl SegmentReader { - /// Returns the highest document id ever attributed in - /// this segment + 1. - pub fn max_doc(&self) -> DocId { +impl TantivySegmentReader { + /// Open a new segment for reading. + pub fn open(segment: &Segment) -> crate::Result> { + Self::open_with_custom_alive_set(segment, None) + } + + /// Open a new segment for reading. 
+ pub fn open_with_custom_alive_set( + segment: &Segment, + custom_bitset: Option, + ) -> crate::Result> { + let reader = Self::open_with_custom_alive_set_from_directory( + segment.index().directory(), + segment.meta(), + segment.schema(), + custom_bitset, + )?; + Ok(Arc::new(reader)) + } + + pub(crate) fn open_with_custom_alive_set_from_directory( + directory: &dyn Directory, + segment_meta: &SegmentMeta, + schema: Schema, + custom_bitset: Option, + ) -> crate::Result { + let termdict_file = + directory.open_read(&segment_meta.relative_path(SegmentComponent::Terms))?; + let termdict_composite = CompositeFile::open(&termdict_file)?; + + let store_file = + directory.open_read(&segment_meta.relative_path(SegmentComponent::Store))?; + + crate::fail_point!("SegmentReader::open#middle"); + + let postings_file = + directory.open_read(&segment_meta.relative_path(SegmentComponent::Postings))?; + let postings_composite = CompositeFile::open(&postings_file)?; + + let positions_composite = { + if let Ok(positions_file) = + directory.open_read(&segment_meta.relative_path(SegmentComponent::Positions)) + { + CompositeFile::open(&positions_file)? 
+ } else { + CompositeFile::empty() + } + }; + + let fast_fields_data = + directory.open_read(&segment_meta.relative_path(SegmentComponent::FastFields))?; + let fast_fields_readers = FastFieldReaders::open(fast_fields_data, schema.clone())?; + let fieldnorm_data = + directory.open_read(&segment_meta.relative_path(SegmentComponent::FieldNorms))?; + let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?; + + let original_bitset = if segment_meta.has_deletes() { + let alive_doc_file_slice = + directory.open_read(&segment_meta.relative_path(SegmentComponent::Delete))?; + let alive_doc_data = alive_doc_file_slice.read_bytes()?; + Some(AliveBitSet::open(alive_doc_data)) + } else { + None + }; + + let alive_bitset_opt = intersect_alive_bitset(original_bitset, custom_bitset); + + let max_doc = segment_meta.max_doc(); + let num_docs = alive_bitset_opt + .as_ref() + .map(|alive_bitset| alive_bitset.num_alive_docs() as u32) + .unwrap_or(max_doc); + + Ok(TantivySegmentReader { + inv_idx_reader_cache: Default::default(), + num_docs, + max_doc, + termdict_composite, + postings_composite, + fast_fields_readers, + fieldnorm_readers, + segment_id: segment_meta.id(), + delete_opstamp: segment_meta.delete_opstamp(), + store_file, + alive_bitset_opt, + positions_composite, + schema, + }) + } +} + +impl SegmentReader for TantivySegmentReader { + fn max_doc(&self) -> DocId { self.max_doc } - /// Returns the number of alive documents. - /// Deleted documents are not counted. - pub fn num_docs(&self) -> DocId { + fn num_docs(&self) -> DocId { self.num_docs } - /// Returns the schema of the index this segment belongs to. - pub fn schema(&self) -> &Schema { + fn schema(&self) -> &Schema { &self.schema } - /// Return the number of documents that have been - /// deleted in the segment. 
- pub fn num_deleted_docs(&self) -> DocId { + fn for_each_pruning( + &self, + mut threshold: Score, + mut scorer: Box, + callback: &mut dyn FnMut(DocId, Score) -> Score, + ) { + // Try WAND acceleration with concrete postings types + scorer = match scorer.downcast::>() { + Ok(term_scorer) => { + block_wand_single_scorer(*term_scorer, threshold, callback); + return; + } + Err(scorer) => scorer, + }; + match scorer.downcast::, SumCombiner>>() { + Ok(mut union_scorer) => { + let doc = union_scorer.doc(); + if doc == TERMINATED { + return; + } + let score = union_scorer.score(); + if score > threshold { + threshold = callback(doc, score); + } + let scorers: Vec> = union_scorer.into_scorers(); + block_wand(scorers, threshold, callback); + } + Err(mut scorer) => { + // No acceleration available. Fall back to default. + scorer.for_each_pruning(threshold, callback); + } + } + } + + fn num_deleted_docs(&self) -> DocId { self.max_doc - self.num_docs } - /// Returns true if some of the documents of the segment have been deleted. - pub fn has_deletes(&self) -> bool { - self.num_deleted_docs() > 0 + fn has_deletes(&self) -> bool { + self.num_docs != self.max_doc } - /// Accessor to a segment's fast field reader given a field. - /// - /// Returns the u64 fast value reader if the field - /// is a u64 field indexed as "fast". - /// - /// Return a FastFieldNotAvailableError if the field is not - /// declared as a fast field in the schema. - /// - /// # Panics - /// May panic if the index is corrupted. - pub fn fast_fields(&self) -> &FastFieldReaders { + fn fast_fields(&self) -> &FastFieldReaders { &self.fast_fields_readers } - /// Accessor to the `FacetReader` associated with a given `Field`. 
- pub fn facet_reader(&self, field_name: &str) -> crate::Result { - let schema = self.schema(); - let field = schema.get_field(field_name)?; - let field_entry = schema.get_field_entry(field); - if field_entry.field_type().value_type() != Type::Facet { - return Err(crate::TantivyError::SchemaError(format!( - "`{field_name}` is not a facet field.`" - ))); - } - let Some(facet_column) = self.fast_fields().str(field_name)? else { - panic!("Facet Field `{field_name}` is missing. This should not happen"); - }; - Ok(FacetReader::new(facet_column)) - } - - /// Accessor to the segment's `Field norms`'s reader. - /// - /// Field norms are the length (in tokens) of the fields. - /// It is used in the computation of the [TfIdf](https://fulmicoton.gitbooks.io/tantivy-doc/content/tfidf.html). - /// - /// They are simply stored as a fast field, serialized in - /// the `.fieldnorm` file of the segment. - pub fn get_fieldnorms_reader(&self, field: Field) -> crate::Result { + fn get_fieldnorms_reader(&self, field: Field) -> crate::Result { self.fieldnorm_readers.get_field(field)?.ok_or_else(|| { let field_name = self.schema.get_field_name(field); let err_msg = format!( @@ -126,100 +294,14 @@ impl SegmentReader { }) } - #[doc(hidden)] - pub fn fieldnorms_readers(&self) -> &FieldNormReaders { - &self.fieldnorm_readers + fn get_store_reader(&self, cache_num_blocks: usize) -> io::Result> { + Ok(Box::new(TantivyStoreReader::open( + self.store_file.clone(), + cache_num_blocks, + )?)) } - /// Accessor to the segment's [`StoreReader`](crate::store::StoreReader). - /// - /// `cache_num_blocks` sets the number of decompressed blocks to be cached in an LRU. - /// The size of blocks is configurable, this should be reflexted in the - pub fn get_store_reader(&self, cache_num_blocks: usize) -> io::Result { - StoreReader::open(self.store_file.clone(), cache_num_blocks) - } - - /// Open a new segment for reading. 
- pub fn open(segment: &Segment) -> crate::Result { - Self::open_with_custom_alive_set(segment, None) - } - - /// Open a new segment for reading. - pub fn open_with_custom_alive_set( - segment: &Segment, - custom_bitset: Option, - ) -> crate::Result { - let termdict_file = segment.open_read(SegmentComponent::Terms)?; - let termdict_composite = CompositeFile::open(&termdict_file)?; - - let store_file = segment.open_read(SegmentComponent::Store)?; - - crate::fail_point!("SegmentReader::open#middle"); - - let postings_file = segment.open_read(SegmentComponent::Postings)?; - let postings_composite = CompositeFile::open(&postings_file)?; - - let positions_composite = { - if let Ok(positions_file) = segment.open_read(SegmentComponent::Positions) { - CompositeFile::open(&positions_file)? - } else { - CompositeFile::empty() - } - }; - - let schema = segment.schema(); - - let fast_fields_data = segment.open_read(SegmentComponent::FastFields)?; - let fast_fields_readers = FastFieldReaders::open(fast_fields_data, schema.clone())?; - let fieldnorm_data = segment.open_read(SegmentComponent::FieldNorms)?; - let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?; - - let original_bitset = if segment.meta().has_deletes() { - let alive_doc_file_slice = segment.open_read(SegmentComponent::Delete)?; - let alive_doc_data = alive_doc_file_slice.read_bytes()?; - Some(AliveBitSet::open(alive_doc_data)) - } else { - None - }; - - let alive_bitset_opt = intersect_alive_bitset(original_bitset, custom_bitset); - - let max_doc = segment.meta().max_doc(); - let num_docs = alive_bitset_opt - .as_ref() - .map(|alive_bitset| alive_bitset.num_alive_docs() as u32) - .unwrap_or(max_doc); - - Ok(SegmentReader { - inv_idx_reader_cache: Default::default(), - num_docs, - max_doc, - termdict_composite, - postings_composite, - fast_fields_readers, - fieldnorm_readers, - segment_id: segment.id(), - delete_opstamp: segment.meta().delete_opstamp(), - store_file, - alive_bitset_opt, - 
positions_composite, - schema, - }) - } - - /// Returns a field reader associated with the field given in argument. - /// If the field was not present in the index during indexing time, - /// the InvertedIndexReader is empty. - /// - /// The field reader is in charge of iterating through the - /// term dictionary associated with a specific field, - /// and opening the posting list associated with any term. - /// - /// If the field is not marked as index, a warning is logged and an empty `InvertedIndexReader` - /// is returned. - /// Similarly, if the field is marked as indexed but no term has been indexed for the given - /// index, an empty `InvertedIndexReader` is returned (but no warning is logged). - pub fn inverted_index(&self, field: Field) -> crate::Result> { + fn inverted_index(&self, field: Field) -> crate::Result> { if let Some(inv_idx_reader) = self .inv_idx_reader_cache .read() @@ -244,7 +326,9 @@ impl SegmentReader { // // Returns an empty inverted index. let record_option = record_option_opt.unwrap_or(IndexRecordOption::Basic); - return Ok(Arc::new(InvertedIndexReader::empty(record_option))); + let inv_idx_reader: Arc = + Arc::new(TantivyInvertedIndexReader::empty(record_option)); + return Ok(inv_idx_reader); } let record_option = record_option_opt.unwrap(); @@ -267,13 +351,20 @@ impl SegmentReader { ); DataCorruption::comment_only(error_msg) })?; + let fieldnorms_file = self + .fieldnorm_readers + .get_inner_file() + .open_read(field) + .unwrap_or_else(FileSlice::empty); - let inv_idx_reader = Arc::new(InvertedIndexReader::new( - TermDictionary::open(termdict_file)?, - postings_file, - positions_file, - record_option, - )?); + let inv_idx_reader: Arc = + Arc::new(TantivyInvertedIndexReader::new( + TermDictionary::open(termdict_file)?, + postings_file, + positions_file, + fieldnorms_file, + record_option, + )?); // by releasing the lock in between, we may end up opening the inverting index // twice, but this is fine. 
@@ -285,23 +376,10 @@ impl SegmentReader { Ok(inv_idx_reader) } - /// Returns the list of fields that have been indexed in the segment. - /// The field list includes the field defined in the schema as well as the fields - /// that have been indexed as a part of a JSON field. - /// The returned field name is the full field name, including the name of the JSON field. - /// - /// The returned field names can be used in queries. - /// - /// Notice: If your data contains JSON fields this is **very expensive**, as it requires - /// browsing through the inverted index term dictionary and the columnar field dictionary. - /// - /// Disclaimer: Some fields may not be listed here. For instance, if the schema contains a json - /// field that is not indexed nor a fast field but is stored, it is possible for the field - /// to not be listed. - pub fn fields_metadata(&self) -> crate::Result> { + fn fields_metadata(&self) -> crate::Result> { let mut indexed_fields: Vec = Vec::new(); let mut map_to_canonical = FnvHashMap::default(); - for (field, field_entry) in self.schema().fields() { + for (field, field_entry) in self.schema.fields() { let field_name = field_entry.name().to_string(); let is_indexed = field_entry.is_indexed(); if is_indexed { @@ -391,7 +469,7 @@ impl SegmentReader { } } let fast_fields: Vec = self - .fast_fields() + .fast_fields_readers .columnar() .iter_columns()? .map(|(mut field_name, handle)| { @@ -419,31 +497,26 @@ impl SegmentReader { Ok(merged_field_metadatas) } - /// Returns the segment id - pub fn segment_id(&self) -> SegmentId { + fn segment_id(&self) -> SegmentId { self.segment_id } - /// Returns the delete opstamp - pub fn delete_opstamp(&self) -> Option { + fn delete_opstamp(&self) -> Option { self.delete_opstamp } - /// Returns the bitset representing the alive `DocId`s. 
- pub fn alive_bitset(&self) -> Option<&AliveBitSet> { + fn alive_bitset(&self) -> Option<&AliveBitSet> { self.alive_bitset_opt.as_ref() } - /// Returns true if the `doc` is marked - /// as deleted. - pub fn is_deleted(&self, doc: DocId) -> bool { - self.alive_bitset() + fn is_deleted(&self, doc: DocId) -> bool { + self.alive_bitset_opt + .as_ref() .map(|alive_bitset| alive_bitset.is_deleted(doc)) .unwrap_or(false) } - /// Returns an iterator that will iterate over the alive document ids - pub fn doc_ids_alive(&self) -> Box + Send + '_> { + fn doc_ids_alive(&self) -> Box + Send + '_> { if let Some(alive_bitset) = &self.alive_bitset_opt { Box::new(alive_bitset.iter_alive()) } else { @@ -451,22 +524,25 @@ impl SegmentReader { } } - /// Summarize total space usage of this segment. - pub fn space_usage(&self) -> io::Result { + fn space_usage(&self) -> io::Result { Ok(SegmentSpaceUsage::new( - self.num_docs(), - self.termdict_composite.space_usage(self.schema()), - self.postings_composite.space_usage(self.schema()), - self.positions_composite.space_usage(self.schema()), + self.num_docs, + self.termdict_composite.space_usage(&self.schema), + self.postings_composite.space_usage(&self.schema), + self.positions_composite.space_usage(&self.schema), self.fast_fields_readers.space_usage()?, - self.fieldnorm_readers.space_usage(self.schema()), - self.get_store_reader(0)?.space_usage(), + self.fieldnorm_readers.space_usage(&self.schema), + TantivyStoreReader::open(self.store_file.clone(), 0)?.space_usage(), self.alive_bitset_opt .as_ref() .map(AliveBitSet::space_usage) .unwrap_or_default(), )) } + + fn clone_arc(&self) -> Arc { + Arc::new(self.clone()) + } } #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] @@ -576,7 +652,7 @@ fn intersect_alive_bitset( } } -impl fmt::Debug for SegmentReader { +impl fmt::Debug for TantivySegmentReader { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "SegmentReader({:?})", self.segment_id) } diff --git 
a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs index 1a269caed..9a455f9b1 100644 --- a/src/indexer/delete_queue.rs +++ b/src/indexer/delete_queue.rs @@ -250,11 +250,15 @@ mod tests { struct DummyWeight; impl Weight for DummyWeight { - fn scorer(&self, _reader: &SegmentReader, _boost: Score) -> crate::Result> { + fn scorer( + &self, + _reader: &dyn SegmentReader, + _boost: Score, + ) -> crate::Result> { Err(crate::TantivyError::InternalError("dummy impl".to_owned())) } - fn explain(&self, _reader: &SegmentReader, _doc: DocId) -> crate::Result { + fn explain(&self, _reader: &dyn SegmentReader, _doc: DocId) -> crate::Result { Err(crate::TantivyError::InternalError("dummy impl".to_owned())) } } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 7ffc38615..d07ff1eb4 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -12,7 +12,9 @@ use super::{AddBatch, AddBatchReceiver, AddBatchSender, PreparedCommit}; use crate::directory::{DirectoryLock, GarbageCollectionResult, TerminatingWrite}; use crate::error::TantivyError; use crate::fastfield::write_alive_bitset; -use crate::index::{Index, Segment, SegmentComponent, SegmentId, SegmentMeta, SegmentReader}; +use crate::index::{ + Index, Segment, SegmentComponent, SegmentId, SegmentMeta, SegmentReader, TantivySegmentReader, +}; use crate::indexer::delete_queue::{DeleteCursor, DeleteQueue}; use crate::indexer::doc_opstamp_mapping::DocToOpstampMapping; use crate::indexer::index_writer_status::IndexWriterStatus; @@ -94,7 +96,7 @@ pub struct IndexWriter { fn compute_deleted_bitset( alive_bitset: &mut BitSet, - segment_reader: &SegmentReader, + segment_reader: &dyn SegmentReader, delete_cursor: &mut DeleteCursor, doc_opstamps: &DocToOpstampMapping, target_opstamp: Opstamp, @@ -143,7 +145,13 @@ pub fn advance_deletes( return Ok(()); } - let segment_reader = SegmentReader::open(&segment)?; + let segment_reader = 
TantivySegmentReader::open_with_custom_alive_set_from_directory( + segment.index().directory(), + segment.meta(), + segment.schema(), + None, + )?; + let segment_reader: Arc = Arc::new(segment_reader); let max_doc = segment_reader.max_doc(); let mut alive_bitset: BitSet = match segment_entry.alive_bitset() { @@ -155,7 +163,7 @@ pub fn advance_deletes( compute_deleted_bitset( &mut alive_bitset, - &segment_reader, + segment_reader.as_ref(), segment_entry.delete_cursor(), &DocToOpstampMapping::None, target_opstamp, @@ -243,14 +251,20 @@ fn apply_deletes( .max() .expect("Empty DocOpstamp is forbidden"); - let segment_reader = SegmentReader::open(segment)?; + let segment_reader = TantivySegmentReader::open_with_custom_alive_set_from_directory( + segment.index().directory(), + segment.meta(), + segment.schema(), + None, + )?; + let segment_reader: Arc = Arc::new(segment_reader); let doc_to_opstamps = DocToOpstampMapping::WithMap(doc_opstamps); let max_doc = segment.meta().max_doc(); let mut deleted_bitset = BitSet::with_max_value_and_full(max_doc); let may_have_deletes = compute_deleted_bitset( &mut deleted_bitset, - &segment_reader, + segment_reader.as_ref(), delete_cursor, &doc_to_opstamps, max_doc_opstamp, @@ -1965,9 +1979,9 @@ mod tests { .get_store_reader(DOCSTORE_CACHE_CAPACITY) .unwrap(); // test store iterator - for doc in store_reader.iter::(segment_reader.alive_bitset()) { + for doc_id in segment_reader.doc_ids_alive() { + let doc = store_reader.get(doc_id).unwrap(); let id = doc - .unwrap() .get_first(id_field) .unwrap() .as_value() @@ -1978,7 +1992,7 @@ mod tests { // test store random access for doc_id in segment_reader.doc_ids_alive() { let id = store_reader - .get::(doc_id) + .get(doc_id) .unwrap() .get_first(id_field) .unwrap() @@ -1987,7 +2001,7 @@ mod tests { assert!(expected_ids_and_num_occurrences.contains_key(&id)); if id_is_full_doc(id) { let id2 = store_reader - .get::(doc_id) + .get(doc_id) .unwrap() .get_first(multi_numbers) .unwrap() @@ -1995,13 
+2009,13 @@ mod tests { .unwrap(); assert_eq!(id, id2); let bool = store_reader - .get::(doc_id) + .get(doc_id) .unwrap() .get_first(bool_field) .unwrap() .as_bool() .unwrap(); - let doc = store_reader.get::(doc_id).unwrap(); + let doc = store_reader.get(doc_id).unwrap(); let mut bool2 = doc.get_all(multi_bools); assert_eq!(bool, bool2.next().unwrap().as_bool().unwrap()); assert_ne!(bool, bool2.next().unwrap().as_bool().unwrap()); diff --git a/src/indexer/merge_index_test.rs b/src/indexer/merge_index_test.rs index 43f80a9d0..a1aaad58f 100644 --- a/src/indexer/merge_index_test.rs +++ b/src/indexer/merge_index_test.rs @@ -3,7 +3,7 @@ mod tests { use crate::collector::TopDocs; use crate::fastfield::AliveBitSet; use crate::index::Index; - use crate::postings::Postings; + use crate::postings::{DocFreq, Postings}; use crate::query::QueryParser; use crate::schema::{ self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions, @@ -121,21 +121,32 @@ mod tests { let my_text_field = index.schema().get_field("text_field").unwrap(); let term_a = Term::from_field_text(my_text_field, "text"); let inverted_index = segment_reader.inverted_index(my_text_field).unwrap(); - let mut postings = inverted_index - .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions) - .unwrap() - .unwrap(); - assert_eq!(postings.doc_freq(), 2); + let term_info = inverted_index.get_term_info(&term_a).unwrap().unwrap(); + let postings_for_test = crate::index::load_postings_from_terminfo( + inverted_index.as_ref(), + &term_info, + IndexRecordOption::WithFreqsAndPositions, + ) + .unwrap(); let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100); assert_eq!( - postings.doc_freq_given_deletes( + crate::indexer::merger::doc_freq_given_deletes( + postings_for_test, segment_reader.alive_bitset().unwrap_or(&fallback_bitset) ), 2 ); + let postings = inverted_index + .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions) + .unwrap(); + 
assert_eq!(postings.unwrap().doc_freq(), DocFreq::Exact(2)); + let postings = inverted_index + .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions) + .unwrap(); + let mut postings = postings.unwrap(); assert_eq!(postings.term_freq(), 1); - let mut output = vec![]; + let mut output = Vec::new(); postings.positions(&mut output); assert_eq!(output, vec![1]); postings.advance(); diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 47ac5a55b..31912a4c2 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -1,3 +1,4 @@ +use std::io; use std::sync::Arc; use columnar::{ @@ -15,11 +16,11 @@ use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, use crate::index::{Segment, SegmentComponent, SegmentReader}; use crate::indexer::doc_id_mapping::{MappingType, SegmentDocIdMapping}; use crate::indexer::SegmentSerializer; -use crate::postings::{InvertedIndexSerializer, Postings, SegmentPostings}; -use crate::schema::{value_type_to_column_type, Field, FieldType, Schema}; +use crate::postings::{InvertedIndexSerializer, Postings, TermInfo}; +use crate::schema::{value_type_to_column_type, Field, FieldType, IndexRecordOption, Schema}; use crate::store::StoreWriter; use crate::termdict::{TermMerger, TermOrdinal}; -use crate::{DocAddress, DocId, InvertedIndexReader}; +use crate::{DocAddress, DocId, DynInvertedIndexReader}; /// Segment's max doc must be `< MAX_DOC_LIMIT`. /// @@ -27,7 +28,7 @@ use crate::{DocAddress, DocId, InvertedIndexReader}; pub const MAX_DOC_LIMIT: u32 = 1 << 31; fn estimate_total_num_tokens_in_single_segment( - reader: &SegmentReader, + reader: &dyn SegmentReader, field: Field, ) -> crate::Result { // There are no deletes. We can simply use the exact value saved into the posting list. @@ -39,7 +40,7 @@ fn estimate_total_num_tokens_in_single_segment( // When there are deletes, we use an approximation either // by using the fieldnorm. 
- if let Some(fieldnorm_reader) = reader.fieldnorms_readers().get_field(field)? { + if let Ok(fieldnorm_reader) = reader.get_fieldnorms_reader(field) { let mut count: [usize; 256] = [0; 256]; for doc in reader.doc_ids_alive() { let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc); @@ -68,17 +69,20 @@ fn estimate_total_num_tokens_in_single_segment( Ok((segment_num_tokens as f64 * ratio) as u64) } -fn estimate_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::Result { +fn estimate_total_num_tokens( + readers: &[Arc], + field: Field, +) -> crate::Result { let mut total_num_tokens: u64 = 0; for reader in readers { - total_num_tokens += estimate_total_num_tokens_in_single_segment(reader, field)?; + total_num_tokens += estimate_total_num_tokens_in_single_segment(reader.as_ref(), field)?; } Ok(total_num_tokens) } pub struct IndexMerger { schema: Schema, - pub(crate) readers: Vec, + pub(crate) readers: Vec>, max_doc: u32, } @@ -162,16 +166,25 @@ impl IndexMerger { // This can be used to merge but also apply an additional filter. // One use case is demux, which is basically taking a list of // segments and partitions them e.g. by a value in a field. + // + // # Panics if segments is empty. 
pub fn open_with_custom_alive_set( schema: Schema, segments: &[Segment], alive_bitset_opt: Vec>, ) -> crate::Result { + assert!(!segments.is_empty()); let mut readers = vec![]; for (segment, new_alive_bitset_opt) in segments.iter().zip(alive_bitset_opt) { if segment.meta().num_docs() > 0 { let reader = - SegmentReader::open_with_custom_alive_set(segment, new_alive_bitset_opt)?; + crate::TantivySegmentReader::open_with_custom_alive_set_from_directory( + segment.index().directory(), + segment.meta(), + segment.schema(), + new_alive_bitset_opt, + )?; + let reader: Arc = Arc::new(reader); readers.push(reader); } } @@ -262,7 +275,7 @@ impl IndexMerger { }), ); - let has_deletes: bool = self.readers.iter().any(SegmentReader::has_deletes); + let has_deletes: bool = self.readers.iter().any(|reader| reader.has_deletes()); let mapping_type = if has_deletes { MappingType::StackedWithDeletes } else { @@ -297,7 +310,7 @@ impl IndexMerger { let mut max_term_ords: Vec = Vec::new(); - let field_readers: Vec> = self + let field_readers: Vec> = self .readers .iter() .map(|reader| reader.inverted_index(indexed_field)) @@ -355,7 +368,8 @@ impl IndexMerger { indexed. 
Have you modified the schema?", ); - let mut segment_postings_containing_the_term: Vec<(usize, SegmentPostings)> = vec![]; + let mut segment_postings_containing_the_term: Vec<(usize, Box)> = + Vec::with_capacity(self.readers.len()); while merged_terms.advance() { segment_postings_containing_the_term.clear(); @@ -366,18 +380,15 @@ impl IndexMerger { // Let's compute the list of non-empty posting lists for (segment_ord, term_info) in merged_terms.current_segment_ords_and_term_infos() { let segment_reader = &self.readers[segment_ord]; - let inverted_index: &InvertedIndexReader = &field_readers[segment_ord]; - let segment_postings = inverted_index - .read_postings_from_terminfo(&term_info, segment_postings_option)?; - let alive_bitset_opt = segment_reader.alive_bitset(); - let doc_freq = if let Some(alive_bitset) = alive_bitset_opt { - segment_postings.doc_freq_given_deletes(alive_bitset) - } else { - segment_postings.doc_freq() - }; - if doc_freq > 0u32 { + let inverted_index = &field_readers[segment_ord]; + if let Some((doc_freq, postings)) = postings_for_merge( + inverted_index.as_ref(), + &term_info, + segment_postings_option, + segment_reader.alive_bitset(), + )? { total_doc_freq += doc_freq; - segment_postings_containing_the_term.push((segment_ord, segment_postings)); + segment_postings_containing_the_term.push((segment_ord, postings)); } } @@ -395,11 +406,7 @@ impl IndexMerger { assert!(!segment_postings_containing_the_term.is_empty()); let has_term_freq = { - let has_term_freq = !segment_postings_containing_the_term[0] - .1 - .block_cursor - .freqs() - .is_empty(); + let has_term_freq = segment_postings_containing_the_term[0].1.has_freq(); for (_, postings) in &segment_postings_containing_the_term[1..] { // This may look at a strange way to test whether we have term freq or not. 
// With JSON object, the schema is not sufficient to know whether a term @@ -415,7 +422,7 @@ impl IndexMerger { // // Overall the reliable way to know if we have actual frequencies loaded or not // is to check whether the actual decoded array is empty or not. - if has_term_freq == postings.block_cursor.freqs().is_empty() { + if postings.has_freq() != has_term_freq { return Err(DataCorruption::comment_only( "Term freqs are inconsistent across segments", ) @@ -490,33 +497,7 @@ impl IndexMerger { debug_time!("write-storable-fields"); debug!("write-storable-field"); - for reader in &self.readers { - let store_reader = reader.get_store_reader(1)?; - if reader.has_deletes() - // If there is not enough data in the store, we avoid stacking in order to - // avoid creating many small blocks in the doc store. Once we have 5 full blocks, - // we start stacking. In the worst case 2/7 of the blocks would be very small. - // [segment 1 - {1 doc}][segment 2 - {fullblock * 5}{1doc}] - // => 5 * full blocks, 2 * 1 document blocks - // - // In a more realistic scenario the segments are of the same size, so 1/6 of - // the doc stores would be on average half full, given total randomness (which - // is not the case here, but not sure how it behaves exactly). - // - // https://github.com/quickwit-oss/tantivy/issues/1053 - // - // take 7 in order to not walk over all checkpoints. - || store_reader.block_checkpoints().take(7).count() < 6 - || store_reader.decompressor() != store_writer.compressor().into() - { - for doc_bytes_res in store_reader.iter_raw(reader.alive_bitset()) { - let doc_bytes = doc_bytes_res?; - store_writer.store_bytes(&doc_bytes)?; - } - } else { - store_writer.stack(store_reader)?; - } - } + store_writer.merge_segment_readers(&self.readers)?; Ok(()) } @@ -553,6 +534,75 @@ impl IndexMerger { } } +/// Compute the number of non-deleted documents. +/// +/// This method will scan through the posting lists, consuming them. +/// (this is a rather expensive operation). 
+pub(crate) fn doc_freq_given_deletes( + mut postings: Box, + alive_bitset: &AliveBitSet, +) -> u32 { + let mut doc_freq = 0; + loop { + let doc = postings.doc(); + if doc == TERMINATED { + return doc_freq; + } + if alive_bitset.is_alive(doc) { + doc_freq += 1u32; + } + postings.advance(); + } +} + +fn read_postings_for_merge( + inverted_index: &dyn DynInvertedIndexReader, + term_info: &TermInfo, + option: IndexRecordOption, +) -> io::Result> { + crate::index::load_postings_from_terminfo(inverted_index, term_info, option) +} + +fn postings_for_merge( + inverted_index: &dyn DynInvertedIndexReader, + term_info: &TermInfo, + option: IndexRecordOption, + alive_bitset_opt: Option<&AliveBitSet>, +) -> io::Result)>> { + // TODO: avoid loading postings twice — once for counting, once for writing + let count_postings = read_postings_for_merge(inverted_index, term_info, option)?; + let doc_freq = if let Some(alive_bitset) = alive_bitset_opt { + doc_freq_given_deletes(count_postings, alive_bitset) + } else { + // We do not need an exact document frequency here. + match count_postings.doc_freq() { + crate::postings::DocFreq::Exact(doc_freq) => doc_freq, + crate::postings::DocFreq::Approximate(_) => exact_doc_freq(count_postings), + } + }; + + if doc_freq == 0u32 { + return Ok(None); + } + + let postings = read_postings_for_merge(inverted_index, term_info, option)?; + Ok(Some((doc_freq, postings))) +} + +/// If the postings is not able to inform us of the document frequency, +/// we just scan through it. 
+pub(crate) fn exact_doc_freq(mut postings: Box) -> u32 { + let mut doc_freq = 0; + loop { + let doc = postings.doc(); + if doc == TERMINATED { + return doc_freq; + } + doc_freq += 1u32; + postings.advance(); + } +} + #[cfg(test)] mod tests { @@ -565,8 +615,10 @@ mod tests { BytesFastFieldTestCollector, FastFieldTestCollector, TEST_COLLECTOR_WITH_SCORE, }; use crate::collector::{Count, FacetCollector}; + use crate::fastfield::AliveBitSet; use crate::index::{Index, SegmentId}; use crate::indexer::NoMergePolicy; + use crate::postings::{DocFreq, Postings as _, SegmentPostings}; use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery}; use crate::schema::{ Facet, FacetOptions, IndexRecordOption, NumericOptions, TantivyDocument, Term, @@ -681,32 +733,32 @@ mod tests { ); } { - let doc = searcher.doc::(DocAddress::new(0, 0))?; + let doc = searcher.doc(DocAddress::new(0, 0))?; assert_eq!( doc.get_first(text_field).unwrap().as_value().as_str(), Some("af b") ); } { - let doc = searcher.doc::(DocAddress::new(0, 1))?; + let doc = searcher.doc(DocAddress::new(0, 1))?; assert_eq!( doc.get_first(text_field).unwrap().as_value().as_str(), Some("a b c") ); } { - let doc = searcher.doc::(DocAddress::new(0, 2))?; + let doc = searcher.doc(DocAddress::new(0, 2))?; assert_eq!( doc.get_first(text_field).unwrap().as_value().as_str(), Some("a b c d") ); } { - let doc = searcher.doc::(DocAddress::new(0, 3))?; + let doc = searcher.doc(DocAddress::new(0, 3))?; assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("af b")); } { - let doc = searcher.doc::(DocAddress::new(0, 4))?; + let doc = searcher.doc(DocAddress::new(0, 4))?; assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c g")); } @@ -1518,10 +1570,10 @@ mod tests { let searcher = reader.searcher(); let mut term_scorer = term_query .specialized_weight(EnableScoring::enabled_from_searcher(&searcher))? - .term_scorer_for_test(searcher.segment_reader(0u32), 1.0)? 
+ .term_scorer_for_test(searcher.segment_reader(0u32), 1.0) .unwrap(); assert_eq!(term_scorer.doc(), 0); - assert_nearly_equals!(term_scorer.block_max_score(), 0.0079681855); + assert_nearly_equals!(term_scorer.seek_block_max(0), 0.0079681855); assert_nearly_equals!(term_scorer.score(), 0.0079681855); for _ in 0..81 { writer.add_document(doc!(text=>"hello happy tax payer"))?; @@ -1534,13 +1586,13 @@ mod tests { for segment_reader in searcher.segment_readers() { let mut term_scorer = term_query .specialized_weight(EnableScoring::enabled_from_searcher(&searcher))? - .term_scorer_for_test(segment_reader, 1.0)? + .term_scorer_for_test(segment_reader.as_ref(), 1.0) .unwrap(); // the difference compared to before is intrinsic to the bm25 formula. no worries // there. for doc in segment_reader.doc_ids_alive() { assert_eq!(term_scorer.doc(), doc); - assert_nearly_equals!(term_scorer.block_max_score(), 0.003478312); + assert_nearly_equals!(term_scorer.seek_block_max(doc), 0.003478312); assert_nearly_equals!(term_scorer.score(), 0.003478312); term_scorer.advance(); } @@ -1560,12 +1612,12 @@ mod tests { let segment_reader = searcher.segment_reader(0u32); let mut term_scorer = term_query .specialized_weight(EnableScoring::enabled_from_searcher(&searcher))? - .term_scorer_for_test(segment_reader, 1.0)? + .term_scorer_for_test(segment_reader, 1.0) .unwrap(); // the difference compared to before is intrinsic to the bm25 formula. no worries there. 
for doc in segment_reader.doc_ids_alive() { assert_eq!(term_scorer.doc(), doc); - assert_nearly_equals!(term_scorer.block_max_score(), 0.003478312); + assert_nearly_equals!(term_scorer.seek_block_max(doc), 0.003478312); assert_nearly_equals!(term_scorer.score(), 0.003478312); term_scorer.advance(); } @@ -1579,4 +1631,19 @@ mod tests { assert!(((super::MAX_DOC_LIMIT - 1) as i32) >= 0); assert!((super::MAX_DOC_LIMIT as i32) < 0); } + + #[test] + fn test_doc_freq_given_delete() { + let docs = SegmentPostings::create_from_docs(&[0, 2, 10]); + assert_eq!(docs.doc_freq(), DocFreq::Exact(3)); + let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[2], 12); + let docs_boxed: Box = + Box::new(SegmentPostings::create_from_docs(&[0, 2, 10])); + assert_eq!(super::doc_freq_given_deletes(docs_boxed, &alive_bitset), 2); + let all_deleted = + AliveBitSet::for_test_from_deleted_docs(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 12); + let docs_boxed: Box = + Box::new(SegmentPostings::create_from_docs(&[0, 2, 10])); + assert_eq!(super::doc_freq_given_deletes(docs_boxed, &all_deleted), 0); + } } diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index bf35c52bc..c26f10a14 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -139,9 +139,9 @@ fn merge( /// meant to work if you have an `IndexWriter` running for the origin indices, or /// the destination `Index`. #[doc(hidden)] -pub fn merge_indices>>( +pub fn merge_indices( indices: &[Index], - output_directory: T, + output_directory: Box, ) -> crate::Result { if indices.is_empty() { // If there are no indices to merge, there is no need to do anything. 
@@ -211,11 +211,11 @@ pub fn merge_filtered_segments>>( )); } - let mut merged_index = Index::create( - output_directory, - target_schema.clone(), - target_settings.clone(), - )?; + let mut merged_index: Index = Index::builder() + .schema(target_schema.clone()) + .settings(target_settings.clone()) + .create(output_directory.into())?; + let merged_segment = merged_index.new_segment(); let merged_segment_id = merged_segment.id(); let merger: IndexMerger = @@ -235,7 +235,6 @@ pub fn merge_filtered_segments>>( )) .trim_end() ); - let index_meta = IndexMeta { index_settings: target_settings, // index_settings of all segments should be the same segments: vec![segment_meta], @@ -275,7 +274,7 @@ impl SegmentUpdater { stamper: Stamper, delete_cursor: &DeleteCursor, num_merge_threads: usize, - ) -> crate::Result { + ) -> crate::Result { let segments = index.searchable_segment_metas()?; let segment_manager = SegmentManager::from_segments(segments, delete_cursor); let pool = ThreadPoolBuilder::new() @@ -930,7 +929,7 @@ mod tests { #[test] fn test_merge_empty_indices_array() { - let merge_result = merge_indices(&[], RamDirectory::default()); + let merge_result = merge_indices(&[], Box::new(RamDirectory::default())); assert!(merge_result.is_err()); } @@ -957,7 +956,10 @@ mod tests { }; // mismatched schema index list - let result = merge_indices(&[first_index, second_index], RamDirectory::default()); + let result = merge_indices( + &[first_index, second_index], + Box::new(RamDirectory::default()), + ); assert!(result.is_err()); Ok(()) diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 94e3f0de2..44f48de0f 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -12,7 +12,7 @@ use crate::indexer::segment_serializer::SegmentSerializer; use crate::json_utils::{index_json_value, IndexingPositionsPerPath}; use crate::postings::{ compute_table_memory_size, serialize_postings, IndexingContext, IndexingPosition, - 
PerFieldPostingsWriter, PostingsWriter, + PerFieldPostingsWriter, PostingsWriter, PostingsWriterEnum, }; use crate::schema::document::{Document, Value}; use crate::schema::{FieldEntry, FieldType, Schema, DATE_TIME_PRECISION_INDEXED}; @@ -169,7 +169,7 @@ impl SegmentWriter { } let (term_buffer, ctx) = (&mut self.term_buffer, &mut self.ctx); - let postings_writer: &mut dyn PostingsWriter = + let postings_writer: &mut PostingsWriterEnum = self.per_field_postings_writers.get_for_field_mut(field); term_buffer.clear_with_field(field); @@ -434,7 +434,7 @@ mod tests { Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, Value, DATE_TIME_PRECISION_INDEXED, FAST, STORED, STRING, TEXT, }; - use crate::store::{Compressor, StoreReader, StoreWriter}; + use crate::store::{Compressor, StoreWriter, TantivyStoreReader}; use crate::time::format_description::well_known::Rfc3339; use crate::time::OffsetDateTime; use crate::tokenizer::{PreTokenizedString, Token}; @@ -482,8 +482,8 @@ mod tests { store_writer.store(&doc, &schema).unwrap(); store_writer.close().unwrap(); - let reader = StoreReader::open(directory.open_read(path).unwrap(), 0).unwrap(); - let doc = reader.get::(0).unwrap(); + let reader = TantivyStoreReader::open(directory.open_read(path).unwrap(), 0).unwrap(); + let doc = reader.get(0).unwrap(); assert_eq!(doc.field_values().count(), 2); assert_eq!( @@ -600,16 +600,12 @@ mod tests { let reader = index.reader().unwrap(); let searcher = reader.searcher(); let doc = searcher - .doc::(DocAddress { + .doc(DocAddress { segment_ord: 0u32, doc_id: 0u32, }) .unwrap(); - let serdeser_json_val = serde_json::from_str::(&doc.to_json(&schema)) - .unwrap() - .get("json") - .unwrap()[0] - .clone(); + let serdeser_json_val = doc.to_json(&schema).get("json").unwrap().clone(); assert_eq!(json_val, serdeser_json_val); let segment_reader = searcher.segment_reader(0u32); let inv_idx = segment_reader.inverted_index(json_field).unwrap(); @@ -871,7 +867,7 @@ mod tests { 
let searcher = reader.searcher(); let segment_reader = searcher.segment_reader(0u32); - fn assert_type(reader: &SegmentReader, field: &str, typ: ColumnType) { + fn assert_type(reader: &dyn SegmentReader, field: &str, typ: ColumnType) { let cols = reader.fast_fields().dynamic_column_handles(field).unwrap(); assert_eq!(cols.len(), 1, "{field}"); assert_eq!(cols[0].column_type(), typ, "{field}"); @@ -890,7 +886,7 @@ mod tests { assert_type(segment_reader, "json.my_arr", ColumnType::I64); assert_type(segment_reader, "json.my_arr.my_key", ColumnType::Str); - fn assert_empty(reader: &SegmentReader, field: &str) { + fn assert_empty(reader: &dyn SegmentReader, field: &str) { let cols = reader.fast_fields().dynamic_column_handles(field).unwrap(); assert_eq!(cols.len(), 0); } diff --git a/src/indexer/single_segment_index_writer.rs b/src/indexer/single_segment_index_writer.rs index 673accae3..342b88a88 100644 --- a/src/indexer/single_segment_index_writer.rs +++ b/src/indexer/single_segment_index_writer.rs @@ -11,7 +11,7 @@ pub struct SingleSegmentIndexWriter { segment_writer: SegmentWriter, segment: Segment, opstamp: Opstamp, - _phantom: PhantomData, + _doc: PhantomData, } impl SingleSegmentIndexWriter { @@ -22,7 +22,7 @@ impl SingleSegmentIndexWriter { segment_writer, segment, opstamp: 0, - _phantom: PhantomData, + _doc: PhantomData, }) } @@ -40,7 +40,7 @@ impl SingleSegmentIndexWriter { pub fn finalize(self) -> crate::Result { let max_doc = self.segment_writer.max_doc(); self.segment_writer.finalize()?; - let segment: Segment = self.segment.with_max_doc(max_doc); + let segment = self.segment.with_max_doc(max_doc); let index = segment.index(); let index_meta = IndexMeta { index_settings: index.settings().clone(), diff --git a/src/lib.rs b/src/lib.rs index 93f0fa43b..ee6870c9f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -93,7 +93,7 @@ //! //! for (_score, doc_address) in top_docs { //! // Retrieve the actual content of documents given its `doc_address`. -//! 
let retrieved_doc = searcher.doc::(doc_address)?; +//! let retrieved_doc = searcher.doc(doc_address)?; //! println!("{}", retrieved_doc.to_json(&schema)); //! } //! @@ -166,6 +166,7 @@ mod functional_test; #[macro_use] mod macros; + mod future_result; // Re-exports @@ -223,11 +224,12 @@ use once_cell::sync::Lazy; use serde::{Deserialize, Serialize}; pub use self::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN, TERMINATED}; -pub use crate::core::{json_utils, Executor, Searcher, SearcherGeneration}; +pub use crate::core::{json_utils, Executor, Searcher, SearcherContext, SearcherGeneration}; pub use crate::directory::Directory; pub use crate::index::{ - Index, IndexBuilder, IndexMeta, IndexSettings, InvertedIndexReader, Order, Segment, - SegmentMeta, SegmentReader, + try_downcast_and_call, DynInvertedIndexReader, Index, IndexBuilder, IndexMeta, IndexSettings, + InvertedIndexReader, Order, Segment, SegmentMeta, SegmentReader, TantivyInvertedIndexReader, + TantivySegmentReader, TypedInvertedIndexReaderCb, }; pub use crate::indexer::{IndexWriter, SingleSegmentIndexWriter}; pub use crate::schema::{Document, TantivyDocument, Term}; @@ -547,7 +549,7 @@ pub mod tests { index_writer.commit()?; let reader = index.reader()?; let searcher = reader.searcher(); - let segment_reader: &SegmentReader = searcher.segment_reader(0); + let segment_reader: &dyn SegmentReader = searcher.segment_reader(0); let fieldnorms_reader = segment_reader.get_fieldnorms_reader(text_field)?; assert_eq!(fieldnorms_reader.fieldnorm(0), 3); assert_eq!(fieldnorms_reader.fieldnorm(1), 0); @@ -555,7 +557,7 @@ pub mod tests { Ok(()) } - fn advance_undeleted(docset: &mut dyn DocSet, reader: &SegmentReader) -> bool { + fn advance_undeleted(docset: &mut dyn DocSet, reader: &dyn SegmentReader) -> bool { let mut doc = docset.advance(); while doc != TERMINATED { if !reader.is_deleted(doc) { @@ -1072,7 +1074,7 @@ pub mod tests { } let reader = index.reader()?; let searcher = reader.searcher(); - let segment_reader: 
&SegmentReader = searcher.segment_reader(0); + let segment_reader: &dyn SegmentReader = searcher.segment_reader(0); { let fast_field_reader_res = segment_reader.fast_fields().u64("text"); assert!(fast_field_reader_res.is_err()); diff --git a/src/postings/block_segment_postings.rs b/src/postings/block_segment_postings.rs index 47ace9975..f527d46c0 100644 --- a/src/postings/block_segment_postings.rs +++ b/src/postings/block_segment_postings.rs @@ -1,26 +1,17 @@ use std::io; -use common::VInt; +use common::{OwnedBytes, VInt}; -use crate::directory::{FileSlice, OwnedBytes}; +use super::FreqReadingOption; use crate::fieldnorm::FieldNormReader; -use crate::postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE}; -use crate::postings::{BlockInfo, FreqReadingOption, SkipReader}; +use crate::postings::compression::{BlockDecoder, VIntDecoder as _, COMPRESSION_BLOCK_SIZE}; +use crate::postings::skip::{BlockInfo, SkipReader}; use crate::query::Bm25Weight; use crate::schema::IndexRecordOption; use crate::{DocId, Score, TERMINATED}; -fn max_score>(mut it: I) -> Option { - it.next().map(|first| it.fold(first, Score::max)) -} - /// `BlockSegmentPostings` is a cursor iterating over blocks /// of documents. -/// -/// # Warning -/// -/// While it is useful for some very specific high-performance -/// use cases, you should prefer using `SegmentPostings` for most usage. #[derive(Clone)] pub struct BlockSegmentPostings { pub(crate) doc_decoder: BlockDecoder, @@ -88,19 +79,18 @@ fn split_into_skips_and_postings( } impl BlockSegmentPostings { - /// Opens a `BlockSegmentPostings`. + /// Opens a `BlockSegmentPostings` from the raw postings bytes. /// `doc_freq` is the number of documents in the posting list. /// `record_option` represents the amount of data available according to the schema. /// `requested_option` is the amount of data requested by the user. /// If for instance, we do not request for term frequencies, this function will not decompress /// term frequency blocks. 
- pub(crate) fn open( + pub fn open( doc_freq: u32, - data: FileSlice, + bytes: OwnedBytes, mut record_option: IndexRecordOption, requested_option: IndexRecordOption, ) -> io::Result { - let bytes = data.read_bytes()?; let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, bytes)?; let skip_reader = match skip_data_opt { Some(skip_data) => { @@ -138,6 +128,87 @@ impl BlockSegmentPostings { block_segment_postings.load_block(); Ok(block_segment_postings) } +} + +fn max_score>(mut it: I) -> Option { + it.next().map(|first| it.fold(first, Score::max)) +} + +impl BlockSegmentPostings { + /// Returns the overall number of documents in the block postings. + /// It does not take into account whether documents are deleted or not. + /// + /// This `doc_freq` is simply the sum of the lengths of all of the blocks, + /// and it does not take into account deleted documents. + pub fn doc_freq(&self) -> u32 { + self.doc_freq + } + + /// Returns the array of docs in the current block. + /// + /// Before the first call to `.advance()`, the block + /// returned by `.docs()` is empty. + #[inline] + pub fn docs(&self) -> &[DocId] { + debug_assert!(self.block_loaded); + self.doc_decoder.output_array() + } + + /// Return the document at index `idx` of the block. + #[inline] + pub fn doc(&self, idx: usize) -> u32 { + self.doc_decoder.output(idx) + } + + /// Return the array of `term freq` in the block. + #[inline] + pub fn freqs(&self) -> &[u32] { + debug_assert!(self.block_loaded); + self.freq_decoder.output_array() + } + + /// Return the frequency at index `idx` of the block. + #[inline] + pub fn freq(&self, idx: usize) -> u32 { + debug_assert!(self.block_loaded); + self.freq_decoder.output(idx) + } + + /// Position on a block that may contain `target_doc`. + /// + /// If all docs are smaller than target, the block loaded may be empty, + /// or be the last, incomplete VInt block. 
+ pub fn seek(&mut self, target_doc: DocId) -> usize { + // Move to the block that might contain our document. + self.seek_block_without_loading(target_doc); + self.load_block(); + + // At this point we are on the block that might contain our document. + let doc = self.doc_decoder.seek_within_block(target_doc); + + // The last block is not full and padded with TERMINATED, + // so we are guaranteed to have at least one value (real or padding) + // that is >= target_doc. + debug_assert!(doc < COMPRESSION_BLOCK_SIZE); + + // `doc` is now the first element >= `target_doc`. + // If all docs are smaller than target, the current block is incomplete and padded + // with TERMINATED. After the search, the cursor points to the first TERMINATED. + doc + } + + /// Returns the current position offset in the position reader. + pub fn position_offset(&self) -> u64 { + self.skip_reader.position_offset() + } + + /// Advance to the next block. + pub fn advance(&mut self) { + self.skip_reader.advance(); + self.block_loaded = false; + self.block_max_score_cache = None; + self.load_block(); + } /// Returns the block_max_score for the current block. /// It does not require the block to be loaded. For instance, it is ok to call this method @@ -160,7 +231,7 @@ impl BlockSegmentPostings { } // this is the last block of the segment posting list. // If it is actually loaded, we can compute block max manually. - if self.block_is_loaded() { + if self.block_loaded { let docs = self.doc_decoder.output_array().iter().cloned(); let freqs = self.freq_decoder.output_array().iter().cloned(); let bm25_scores = docs.zip(freqs).map(|(doc, term_freq)| { @@ -177,112 +248,25 @@ impl BlockSegmentPostings { // We do not cache it however, so that it gets computed when once block is loaded. bm25_weight.max_score() } +} - pub(crate) fn freq_reading_option(&self) -> FreqReadingOption { - self.freq_reading_option - } - - // Resets the block segment postings on another position - // in the postings file. 
- // - // This is useful for enumerating through a list of terms, - // and consuming the associated posting lists while avoiding - // reallocating a `BlockSegmentPostings`. - // - // # Warning - // - // This does not reset the positions list. - pub(crate) fn reset(&mut self, doc_freq: u32, postings_data: OwnedBytes) -> io::Result<()> { - let (skip_data_opt, postings_data) = - split_into_skips_and_postings(doc_freq, postings_data)?; - self.data = postings_data; - self.block_max_score_cache = None; - self.block_loaded = false; - if let Some(skip_data) = skip_data_opt { - self.skip_reader.reset(skip_data, doc_freq); - } else { - self.skip_reader.reset(OwnedBytes::empty(), doc_freq); +impl BlockSegmentPostings { + /// Returns an empty segment postings object + pub fn empty() -> BlockSegmentPostings { + BlockSegmentPostings { + doc_decoder: BlockDecoder::with_val(TERMINATED), + block_loaded: true, + freq_decoder: BlockDecoder::with_val(1), + freq_reading_option: FreqReadingOption::NoFreq, + block_max_score_cache: None, + doc_freq: 0, + data: OwnedBytes::empty(), + skip_reader: SkipReader::new(OwnedBytes::empty(), 0, IndexRecordOption::Basic), } - self.doc_freq = doc_freq; - self.load_block(); - Ok(()) } - /// Returns the overall number of documents in the block postings. - /// It does not take in account whether documents are deleted or not. - /// - /// This `doc_freq` is simply the sum of the length of all of the blocks - /// length, and it does not take in account deleted documents. - pub fn doc_freq(&self) -> u32 { - self.doc_freq - } - - /// Returns the array of docs in the current block. - /// - /// Before the first call to `.advance()`, the block - /// returned by `.docs()` is empty. - #[inline] - pub fn docs(&self) -> &[DocId] { - debug_assert!(self.block_is_loaded()); - self.doc_decoder.output_array() - } - - /// Return the document at index `idx` of the block. 
- #[inline] - pub fn doc(&self, idx: usize) -> u32 { - self.doc_decoder.output(idx) - } - - /// Return the array of `term freq` in the block. - #[inline] - pub fn freqs(&self) -> &[u32] { - debug_assert!(self.block_is_loaded()); - self.freq_decoder.output_array() - } - - /// Return the frequency at index `idx` of the block. - #[inline] - pub fn freq(&self, idx: usize) -> u32 { - debug_assert!(self.block_is_loaded()); - self.freq_decoder.output(idx) - } - - /// Returns the length of the current block. - /// - /// All blocks have a length of `NUM_DOCS_PER_BLOCK`, - /// except the last block that may have a length - /// of any number between 1 and `NUM_DOCS_PER_BLOCK - 1` - #[inline] - pub fn block_len(&self) -> usize { - debug_assert!(self.block_is_loaded()); - self.doc_decoder.output_len - } - - /// Position on a block that may contains `target_doc`. - /// - /// If all docs are smaller than target, the block loaded may be empty, - /// or be the last an incomplete VInt block. - pub fn seek(&mut self, target_doc: DocId) -> usize { - // Move to the block that might contain our document. - self.seek_block(target_doc); - self.load_block(); - - // At this point we are on the block that might contain our document. - let doc = self.doc_decoder.seek_within_block(target_doc); - - // The last block is not full and padded with TERMINATED, - // so we are guaranteed to have at least one value (real or padding) - // that is >= target_doc. - debug_assert!(doc < COMPRESSION_BLOCK_SIZE); - - // `doc` is now the first element >= `target_doc`. - // If all docs are smaller than target, the current block is incomplete and padded - // with TERMINATED. After the search, the cursor points to the first TERMINATED. - doc - } - - pub(crate) fn position_offset(&self) -> u64 { - self.skip_reader.position_offset() + pub(crate) fn skip_reader(&self) -> &SkipReader { + &self.skip_reader } /// Dangerous API! 
This calls seeks the next block on the skip list, @@ -291,19 +275,15 @@ impl BlockSegmentPostings { /// `.load_block()` needs to be called manually afterwards. /// If all docs are smaller than target, the block loaded may be empty, /// or be the last an incomplete VInt block. - pub(crate) fn seek_block(&mut self, target_doc: DocId) { + pub(crate) fn seek_block_without_loading(&mut self, target_doc: DocId) { if self.skip_reader.seek(target_doc) { self.block_max_score_cache = None; self.block_loaded = false; } } - pub(crate) fn block_is_loaded(&self) -> bool { - self.block_loaded - } - pub(crate) fn load_block(&mut self) { - if self.block_is_loaded() { + if self.block_loaded { return; } let offset = self.skip_reader.byte_offset(); @@ -351,68 +331,39 @@ impl BlockSegmentPostings { } self.block_loaded = true; } - - /// Advance to the next block. - pub fn advance(&mut self) { - self.skip_reader.advance(); - self.block_loaded = false; - self.block_max_score_cache = None; - self.load_block(); - } - - /// Returns an empty segment postings object - pub fn empty() -> BlockSegmentPostings { - BlockSegmentPostings { - doc_decoder: BlockDecoder::with_val(TERMINATED), - block_loaded: true, - freq_decoder: BlockDecoder::with_val(1), - freq_reading_option: FreqReadingOption::NoFreq, - block_max_score_cache: None, - doc_freq: 0, - data: OwnedBytes::empty(), - skip_reader: SkipReader::new(OwnedBytes::empty(), 0, IndexRecordOption::Basic), - } - } - - pub(crate) fn skip_reader(&self) -> &SkipReader { - &self.skip_reader - } } #[cfg(test)] mod tests { - use common::HasLen; + use common::OwnedBytes; use super::BlockSegmentPostings; use crate::docset::{DocSet, TERMINATED}; - use crate::index::Index; use crate::postings::compression::COMPRESSION_BLOCK_SIZE; - use crate::postings::postings::Postings; + use crate::postings::serializer::PostingsSerializer; use crate::postings::SegmentPostings; - use crate::schema::{IndexRecordOption, Schema, Term, INDEXED}; - use crate::DocId; + use 
crate::schema::IndexRecordOption; - #[test] - fn test_empty_segment_postings() { - let mut postings = SegmentPostings::empty(); - assert_eq!(postings.doc(), TERMINATED); - assert_eq!(postings.advance(), TERMINATED); - assert_eq!(postings.advance(), TERMINATED); - assert_eq!(postings.doc_freq(), 0); - assert_eq!(postings.len(), 0); - } - - #[test] - fn test_empty_postings_doc_returns_terminated() { - let mut postings = SegmentPostings::empty(); - assert_eq!(postings.doc(), TERMINATED); - assert_eq!(postings.advance(), TERMINATED); - } - - #[test] - fn test_empty_postings_doc_term_freq_returns_0() { - let postings = SegmentPostings::empty(); - assert_eq!(postings.term_freq(), 1); + #[cfg(test)] + fn build_block_postings(docs: &[u32]) -> BlockSegmentPostings { + let doc_freq = docs.len() as u32; + let mut postings_serializer = + PostingsSerializer::new(1.0f32, IndexRecordOption::Basic, None); + postings_serializer.new_term(docs.len() as u32, false); + for doc in docs { + postings_serializer.write_doc(*doc, 1u32); + } + let mut buffer: Vec = Vec::new(); + postings_serializer + .close_term(doc_freq, &mut buffer) + .unwrap(); + BlockSegmentPostings::open( + doc_freq, + OwnedBytes::new(buffer), + IndexRecordOption::Basic, + IndexRecordOption::Basic, + ) + .unwrap() } #[test] @@ -427,7 +378,7 @@ mod tests { #[test] fn test_block_segment_postings() -> crate::Result<()> { - let mut block_segments = build_block_postings(&(0..100_000).collect::>())?; + let mut block_segments = build_block_postings(&(0..100_000).collect::>()); let mut offset: u32 = 0u32; // checking that the `doc_freq` is correct assert_eq!(block_segments.doc_freq(), 100_000); @@ -452,7 +403,7 @@ mod tests { doc_ids.push(129); doc_ids.push(130); { - let block_segments = build_block_postings(&doc_ids)?; + let block_segments = build_block_postings(&doc_ids); let mut docset = SegmentPostings::from_block_postings(block_segments, None); assert_eq!(docset.seek(128), 129); assert_eq!(docset.doc(), 129); @@ -461,7 
+412,7 @@ mod tests { assert_eq!(docset.advance(), TERMINATED); } { - let block_segments = build_block_postings(&doc_ids).unwrap(); + let block_segments = build_block_postings(&doc_ids); let mut docset = SegmentPostings::from_block_postings(block_segments, None); assert_eq!(docset.seek(129), 129); assert_eq!(docset.doc(), 129); @@ -470,7 +421,7 @@ mod tests { assert_eq!(docset.advance(), TERMINATED); } { - let block_segments = build_block_postings(&doc_ids)?; + let block_segments = build_block_postings(&doc_ids); let mut docset = SegmentPostings::from_block_postings(block_segments, None); assert_eq!(docset.doc(), 0); assert_eq!(docset.seek(131), TERMINATED); @@ -479,38 +430,13 @@ mod tests { Ok(()) } - fn build_block_postings(docs: &[DocId]) -> crate::Result { - let mut schema_builder = Schema::builder(); - let int_field = schema_builder.add_u64_field("id", INDEXED); - let schema = schema_builder.build(); - let index = Index::create_in_ram(schema); - let mut index_writer = index.writer_for_tests()?; - let mut last_doc = 0u32; - for &doc in docs { - for _ in last_doc..doc { - index_writer.add_document(doc!(int_field=>1u64))?; - } - index_writer.add_document(doc!(int_field=>0u64))?; - last_doc = doc + 1; - } - index_writer.commit()?; - let searcher = index.reader()?.searcher(); - let segment_reader = searcher.segment_reader(0); - let inverted_index = segment_reader.inverted_index(int_field).unwrap(); - let term = Term::from_field_u64(int_field, 0u64); - let term_info = inverted_index.get_term_info(&term)?.unwrap(); - let block_postings = inverted_index - .read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)?; - Ok(block_postings) - } - #[test] fn test_block_segment_postings_seek() -> crate::Result<()> { - let mut docs = vec![0]; + let mut docs = Vec::new(); for i in 0..1300 { docs.push((i * i / 100) + i); } - let mut block_postings = build_block_postings(&docs[..])?; + let mut block_postings = build_block_postings(&docs[..]); for i in &[0, 424, 
10000] { block_postings.seek(*i); let docs = block_postings.docs(); @@ -521,40 +447,4 @@ mod tests { assert_eq!(block_postings.doc(COMPRESSION_BLOCK_SIZE - 1), TERMINATED); Ok(()) } - - #[test] - fn test_reset_block_segment_postings() -> crate::Result<()> { - let mut schema_builder = Schema::builder(); - let int_field = schema_builder.add_u64_field("id", INDEXED); - let schema = schema_builder.build(); - let index = Index::create_in_ram(schema); - let mut index_writer = index.writer_for_tests()?; - // create two postings list, one containing even number, - // the other containing odd numbers. - for i in 0..6 { - let doc = doc!(int_field=> (i % 2) as u64); - index_writer.add_document(doc)?; - } - index_writer.commit()?; - let searcher = index.reader()?.searcher(); - let segment_reader = searcher.segment_reader(0); - - let mut block_segments; - { - let term = Term::from_field_u64(int_field, 0u64); - let inverted_index = segment_reader.inverted_index(int_field)?; - let term_info = inverted_index.get_term_info(&term)?.unwrap(); - block_segments = inverted_index - .read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)?; - } - assert_eq!(block_segments.docs(), &[0, 2, 4]); - { - let term = Term::from_field_u64(int_field, 1u64); - let inverted_index = segment_reader.inverted_index(int_field)?; - let term_info = inverted_index.get_term_info(&term)?.unwrap(); - inverted_index.reset_block_postings_from_terminfo(&term_info, &mut block_segments)?; - } - assert_eq!(block_segments.docs(), &[1, 3, 5]); - Ok(()) - } } diff --git a/src/postings/json_postings_writer.rs b/src/postings/json_postings_writer.rs index 99de17446..ca6c8248b 100644 --- a/src/postings/json_postings_writer.rs +++ b/src/postings/json_postings_writer.rs @@ -22,12 +22,6 @@ pub(crate) struct JsonPostingsWriter { non_str_posting_writer: SpecializedPostingsWriter, } -impl From> for Box { - fn from(json_postings_writer: JsonPostingsWriter) -> Box { - Box::new(json_postings_writer) - } -} - impl 
PostingsWriter for JsonPostingsWriter { #[inline] fn subscribe( diff --git a/src/postings/loaded_postings.rs b/src/postings/loaded_postings.rs index 7258f5cea..52bd4bb9e 100644 --- a/src/postings/loaded_postings.rs +++ b/src/postings/loaded_postings.rs @@ -1,5 +1,5 @@ use crate::docset::{DocSet, TERMINATED}; -use crate::postings::{Postings, SegmentPostings}; +use crate::postings::{DocFreq, Postings}; use crate::DocId; /// `LoadedPostings` is a `DocSet` and `Postings` implementation. @@ -25,16 +25,16 @@ impl LoadedPostings { /// Creates a new `LoadedPostings` from a `SegmentPostings`. /// /// It will also preload positions, if positions are available in the SegmentPostings. - pub fn load(segment_postings: &mut SegmentPostings) -> LoadedPostings { - let num_docs = segment_postings.doc_freq() as usize; + pub fn load(postings: &mut Box) -> LoadedPostings { + let num_docs: usize = u32::from(postings.doc_freq()) as usize; let mut doc_ids = Vec::with_capacity(num_docs); let mut positions = Vec::with_capacity(num_docs); let mut position_offsets = Vec::with_capacity(num_docs); - while segment_postings.doc() != TERMINATED { + while postings.doc() != TERMINATED { position_offsets.push(positions.len() as u32); - doc_ids.push(segment_postings.doc()); - segment_postings.append_positions_with_offset(0, &mut positions); - segment_postings.advance(); + doc_ids.push(postings.doc()); + postings.append_positions_with_offset(0, &mut positions); + postings.advance(); } position_offsets.push(positions.len() as u32); LoadedPostings { @@ -101,6 +101,14 @@ impl Postings for LoadedPostings { output.push(*pos + offset); } } + + fn has_freq(&self) -> bool { + true + } + + fn doc_freq(&self) -> DocFreq { + DocFreq::Exact(self.doc_ids.len() as u32) + } } #[cfg(test)] diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 13b6761cf..9a248bd1a 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -1,9 +1,16 @@ //! 
Postings module (also called inverted index) +use std::io; + +use common::OwnedBytes; + +use crate::fieldnorm::FieldNormReader; +use crate::positions::PositionReader; +use crate::query::Bm25Weight; +use crate::schema::IndexRecordOption; +use crate::Score; + mod block_search; - -pub(crate) use self::block_search::branchless_binary_search; - mod block_segment_postings; pub(crate) mod compression; mod indexing_context; @@ -16,22 +23,53 @@ mod recorder; mod segment_postings; /// Serializer module for the inverted index pub mod serializer; -mod skip; +pub(crate) mod skip; mod term_info; pub(crate) use loaded_postings::LoadedPostings; pub(crate) use stacker::compute_table_memory_size; +pub(crate) use self::block_search::branchless_binary_search; pub use self::block_segment_postings::BlockSegmentPostings; pub(crate) use self::indexing_context::IndexingContext; pub(crate) use self::per_field_postings_writer::PerFieldPostingsWriter; -pub use self::postings::Postings; -pub(crate) use self::postings_writer::{serialize_postings, IndexingPosition, PostingsWriter}; +pub use self::postings::{DocFreq, Postings}; +pub(crate) use self::postings_writer::{ + serialize_postings, IndexingPosition, PostingsWriter, PostingsWriterEnum, +}; pub use self::segment_postings::SegmentPostings; pub use self::serializer::{FieldSerializer, InvertedIndexSerializer}; -pub(crate) use self::skip::{BlockInfo, SkipReader}; pub use self::term_info::TermInfo; +/// Raw postings bytes and metadata read from storage. +#[derive(Debug, Clone)] +pub struct RawPostingsData { + /// Raw postings bytes for the term. + pub postings_data: OwnedBytes, + /// Raw positions bytes for the term, if positions are available. + pub positions_data: Option, + /// Record option of the indexed field. + pub record_option: IndexRecordOption, + /// Effective record option after downgrading to the indexed field capability. 
+ pub effective_option: IndexRecordOption, +} + +/// A light complement interface to Postings to allow block-max wand acceleration. +pub trait PostingsWithBlockMax: Postings { + /// Moves the postings to the block containing `target_doc` and returns + /// an upperbound of the score for documents in the block. + fn seek_block_max( + &mut self, + target_doc: crate::DocId, + fieldnorm_reader: &FieldNormReader, + similarity_weight: &Bm25Weight, + ) -> Score; + + /// Returns the last document in the current block (or Terminated if this + /// is the last block). + fn last_doc_in_block(&self) -> crate::DocId; +} + #[expect(clippy::enum_variant_names)] #[derive(Debug, PartialEq, Clone, Copy, Eq)] pub(crate) enum FreqReadingOption { @@ -40,6 +78,27 @@ pub(crate) enum FreqReadingOption { ReadFreq, } +/// Load postings from raw data bytes into a `SegmentPostings` object. +pub fn load_postings_from_raw_data( + doc_freq: u32, + postings_data: RawPostingsData, +) -> io::Result { + let RawPostingsData { + postings_data, + positions_data: positions_data_opt, + record_option, + effective_option, + } = postings_data; + let requested_option = effective_option; + let block_segment_postings = + BlockSegmentPostings::open(doc_freq, postings_data, record_option, requested_option)?; + let position_reader = positions_data_opt.map(PositionReader::open).transpose()?; + Ok(SegmentPostings::from_block_postings( + block_segment_postings, + position_reader, + )) +} + #[cfg(test)] pub(crate) mod tests { use std::mem; @@ -47,9 +106,10 @@ pub(crate) mod tests { use super::{InvertedIndexSerializer, Postings}; use crate::docset::{DocSet, TERMINATED}; use crate::fieldnorm::FieldNormReader; - use crate::index::{Index, SegmentComponent, SegmentReader}; + use crate::index::{Index, SegmentComponent}; use crate::indexer::operation::AddOperation; use crate::indexer::SegmentWriter; + use crate::postings::DocFreq; use crate::query::Scorer; use crate::schema::{ Field, IndexRecordOption, Schema, Term, 
TextFieldIndexing, TextOptions, INDEXED, TEXT, @@ -259,7 +319,7 @@ pub(crate) mod tests { segment_writer.finalize()?; } { - let segment_reader = SegmentReader::open(&segment)?; + let segment_reader = crate::TantivySegmentReader::open(&segment)?; { let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field)?; assert_eq!(fieldnorm_reader.fieldnorm(0), 8 + 5); @@ -280,11 +340,11 @@ pub(crate) mod tests { } { let term_a = Term::from_field_text(text_field, "a"); - let mut postings_a = segment_reader + let mut postings_a: Box = segment_reader .inverted_index(term_a.field())? .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)? .unwrap(); - assert_eq!(postings_a.len(), 1000); + assert_eq!(postings_a.doc_freq(), DocFreq::Exact(1000)); assert_eq!(postings_a.doc(), 0); assert_eq!(postings_a.term_freq(), 6); postings_a.positions(&mut positions); @@ -307,7 +367,7 @@ pub(crate) mod tests { .inverted_index(term_e.field())? .read_postings(&term_e, IndexRecordOption::WithFreqsAndPositions)? 
.unwrap(); - assert_eq!(postings_e.len(), 1000 - 2); + assert_eq!(postings_e.doc_freq(), DocFreq::Exact(1000 - 2)); for i in 2u32..1000u32 { assert_eq!(postings_e.term_freq(), i); postings_e.positions(&mut positions); diff --git a/src/postings/per_field_postings_writer.rs b/src/postings/per_field_postings_writer.rs index f3d6d6534..bf547cf5a 100644 --- a/src/postings/per_field_postings_writer.rs +++ b/src/postings/per_field_postings_writer.rs @@ -1,16 +1,15 @@ use crate::postings::json_postings_writer::JsonPostingsWriter; -use crate::postings::postings_writer::SpecializedPostingsWriter; +use crate::postings::postings_writer::{PostingsWriterEnum, SpecializedPostingsWriter}; use crate::postings::recorder::{DocIdRecorder, TermFrequencyRecorder, TfAndPositionRecorder}; -use crate::postings::PostingsWriter; use crate::schema::{Field, FieldEntry, FieldType, IndexRecordOption, Schema}; pub(crate) struct PerFieldPostingsWriter { - per_field_postings_writers: Vec>, + per_field_postings_writers: Vec, } impl PerFieldPostingsWriter { pub fn for_schema(schema: &Schema) -> Self { - let per_field_postings_writers = schema + let per_field_postings_writers: Vec = schema .fields() .map(|(_, field_entry)| posting_writer_from_field_entry(field_entry)) .collect(); @@ -19,16 +18,16 @@ impl PerFieldPostingsWriter { } } - pub(crate) fn get_for_field(&self, field: Field) -> &dyn PostingsWriter { - self.per_field_postings_writers[field.field_id() as usize].as_ref() + pub(crate) fn get_for_field(&self, field: Field) -> &PostingsWriterEnum { + &self.per_field_postings_writers[field.field_id() as usize] } - pub(crate) fn get_for_field_mut(&mut self, field: Field) -> &mut dyn PostingsWriter { - self.per_field_postings_writers[field.field_id() as usize].as_mut() + pub(crate) fn get_for_field_mut(&mut self, field: Field) -> &mut PostingsWriterEnum { + &mut self.per_field_postings_writers[field.field_id() as usize] } } -fn posting_writer_from_field_entry(field_entry: &FieldEntry) -> Box { +fn 
posting_writer_from_field_entry(field_entry: &FieldEntry) -> PostingsWriterEnum { match *field_entry.field_type() { FieldType::Str(ref text_options) => text_options .get_indexing_options() @@ -51,7 +50,7 @@ fn posting_writer_from_field_entry(field_entry: &FieldEntry) -> Box Box::>::default(), + | FieldType::Facet(_) => >::default().into(), FieldType::JsonObject(ref json_object_options) => { if let Some(text_indexing_option) = json_object_options.get_text_indexing_options() { match text_indexing_option.index_option() { diff --git a/src/postings/postings.rs b/src/postings/postings.rs index 8606f00a9..e216aa9e9 100644 --- a/src/postings/postings.rs +++ b/src/postings/postings.rs @@ -1,5 +1,25 @@ use crate::docset::DocSet; +/// Result of the doc_freq method. +/// +/// Postings can inform us that the document frequency is approximate. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum DocFreq { + /// The document frequency is approximate. + Approximate(u32), + /// The document frequency is exact. + Exact(u32), +} + +impl From for u32 { + fn from(doc_freq: DocFreq) -> Self { + match doc_freq { + DocFreq::Approximate(approximate_doc_freq) => approximate_doc_freq, + DocFreq::Exact(doc_freq) => doc_freq, + } + } +} + /// Postings (also called inverted list) /// /// For a given term, it is the list of doc ids of the doc @@ -14,6 +34,9 @@ pub trait Postings: DocSet + 'static { /// The number of times the term appears in the document. fn term_freq(&self) -> u32; + /// Returns the number of documents containing the term in the segment. + fn doc_freq(&self) -> DocFreq; + /// Returns the positions offsetted with a given value. /// It is not necessary to clear the `output` before calling this method. /// The output vector will be resized to the `term_freq`. @@ -31,6 +54,16 @@ pub trait Postings: DocSet + 'static { fn positions(&mut self, output: &mut Vec) { self.positions_with_offset(0u32, output); } + + /// Returns true if the term_frequency is available. 
+ /// + /// This is a tricky question, because on JSON fields, it is possible + /// for a text term to have term freq, whereas a number term in the field has none. + /// + /// This function returns whether the actual term has term frequencies or not. + /// In this above JSON field example, `has_freq` should return true for the + /// earlier and false for the latter. + fn has_freq(&self) -> bool; } impl Postings for Box { @@ -41,4 +74,12 @@ impl Postings for Box { fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec) { (**self).append_positions_with_offset(offset, output); } + + fn has_freq(&self) -> bool { + (**self).has_freq() + } + + fn doc_freq(&self) -> DocFreq { + (**self).doc_freq() + } } diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index c7a94ecef..816f5c184 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -7,7 +7,10 @@ use stacker::Addr; use crate::fieldnorm::FieldNormReaders; use crate::indexer::indexing_term::IndexingTerm; use crate::indexer::path_to_unordered_id::OrderedPathId; -use crate::postings::recorder::{BufferLender, Recorder}; +use crate::postings::json_postings_writer::JsonPostingsWriter; +use crate::postings::recorder::{ + BufferLender, DocIdRecorder, Recorder, TermFrequencyRecorder, TfAndPositionRecorder, +}; use crate::postings::{ FieldSerializer, IndexingContext, InvertedIndexSerializer, PerFieldPostingsWriter, }; @@ -100,6 +103,141 @@ pub(crate) struct IndexingPosition { pub end_position: u32, } +pub enum PostingsWriterEnum { + DocId(SpecializedPostingsWriter), + DocIdTf(SpecializedPostingsWriter), + DocTfAndPosition(SpecializedPostingsWriter), + JsonDocId(JsonPostingsWriter), + JsonDocIdTf(JsonPostingsWriter), + JsonDocTfAndPosition(JsonPostingsWriter), +} + +impl From> for PostingsWriterEnum { + fn from(doc_id_recorder_writer: SpecializedPostingsWriter) -> Self { + PostingsWriterEnum::DocId(doc_id_recorder_writer) + } +} + +impl From> for 
PostingsWriterEnum { + fn from(doc_id_tf_recorder_writer: SpecializedPostingsWriter) -> Self { + PostingsWriterEnum::DocIdTf(doc_id_tf_recorder_writer) + } +} + +impl From> for PostingsWriterEnum { + fn from( + doc_id_tf_and_positions_recorder_writer: SpecializedPostingsWriter, + ) -> Self { + PostingsWriterEnum::DocTfAndPosition(doc_id_tf_and_positions_recorder_writer) + } +} + +impl From> for PostingsWriterEnum { + fn from(doc_id_recorder_writer: JsonPostingsWriter) -> Self { + PostingsWriterEnum::JsonDocId(doc_id_recorder_writer) + } +} + +impl From> for PostingsWriterEnum { + fn from(doc_id_tf_recorder_writer: JsonPostingsWriter) -> Self { + PostingsWriterEnum::JsonDocIdTf(doc_id_tf_recorder_writer) + } +} + +impl From> for PostingsWriterEnum { + fn from( + doc_id_tf_and_positions_recorder_writer: JsonPostingsWriter, + ) -> Self { + PostingsWriterEnum::JsonDocTfAndPosition(doc_id_tf_and_positions_recorder_writer) + } +} + +impl PostingsWriter for PostingsWriterEnum { + fn subscribe(&mut self, doc: DocId, pos: u32, term: &IndexingTerm, ctx: &mut IndexingContext) { + match self { + PostingsWriterEnum::DocId(writer) => writer.subscribe(doc, pos, term, ctx), + PostingsWriterEnum::DocIdTf(writer) => writer.subscribe(doc, pos, term, ctx), + PostingsWriterEnum::DocTfAndPosition(writer) => writer.subscribe(doc, pos, term, ctx), + PostingsWriterEnum::JsonDocId(writer) => writer.subscribe(doc, pos, term, ctx), + PostingsWriterEnum::JsonDocIdTf(writer) => writer.subscribe(doc, pos, term, ctx), + PostingsWriterEnum::JsonDocTfAndPosition(writer) => { + writer.subscribe(doc, pos, term, ctx) + } + } + } + + fn serialize( + &self, + term_addrs: &[(Field, OrderedPathId, &[u8], Addr)], + ordered_id_to_path: &[&str], + ctx: &IndexingContext, + serializer: &mut FieldSerializer, + ) -> io::Result<()> { + match self { + PostingsWriterEnum::DocId(writer) => { + writer.serialize(term_addrs, ordered_id_to_path, ctx, serializer) + } + PostingsWriterEnum::DocIdTf(writer) => { + 
writer.serialize(term_addrs, ordered_id_to_path, ctx, serializer) + } + PostingsWriterEnum::DocTfAndPosition(writer) => { + writer.serialize(term_addrs, ordered_id_to_path, ctx, serializer) + } + PostingsWriterEnum::JsonDocId(writer) => { + writer.serialize(term_addrs, ordered_id_to_path, ctx, serializer) + } + PostingsWriterEnum::JsonDocIdTf(writer) => { + writer.serialize(term_addrs, ordered_id_to_path, ctx, serializer) + } + PostingsWriterEnum::JsonDocTfAndPosition(writer) => { + writer.serialize(term_addrs, ordered_id_to_path, ctx, serializer) + } + } + } + + /// Tokenize a text and subscribe all of its token. + fn index_text( + &mut self, + doc_id: DocId, + token_stream: &mut dyn TokenStream, + term_buffer: &mut IndexingTerm, + ctx: &mut IndexingContext, + indexing_position: &mut IndexingPosition, + ) { + match self { + PostingsWriterEnum::DocId(writer) => { + writer.index_text(doc_id, token_stream, term_buffer, ctx, indexing_position) + } + PostingsWriterEnum::DocIdTf(writer) => { + writer.index_text(doc_id, token_stream, term_buffer, ctx, indexing_position) + } + PostingsWriterEnum::DocTfAndPosition(writer) => { + writer.index_text(doc_id, token_stream, term_buffer, ctx, indexing_position) + } + PostingsWriterEnum::JsonDocId(writer) => { + writer.index_text(doc_id, token_stream, term_buffer, ctx, indexing_position) + } + PostingsWriterEnum::JsonDocIdTf(writer) => { + writer.index_text(doc_id, token_stream, term_buffer, ctx, indexing_position) + } + PostingsWriterEnum::JsonDocTfAndPosition(writer) => { + writer.index_text(doc_id, token_stream, term_buffer, ctx, indexing_position) + } + } + } + + fn total_num_tokens(&self) -> u64 { + match self { + PostingsWriterEnum::DocId(writer) => writer.total_num_tokens(), + PostingsWriterEnum::DocIdTf(writer) => writer.total_num_tokens(), + PostingsWriterEnum::DocTfAndPosition(writer) => writer.total_num_tokens(), + PostingsWriterEnum::JsonDocId(writer) => writer.total_num_tokens(), + 
PostingsWriterEnum::JsonDocIdTf(writer) => writer.total_num_tokens(), + PostingsWriterEnum::JsonDocTfAndPosition(writer) => writer.total_num_tokens(), + } + } +} + /// The `PostingsWriter` is in charge of receiving documenting /// and building a `Segment` in anonymous memory. /// @@ -171,14 +309,6 @@ pub(crate) struct SpecializedPostingsWriter { _recorder_type: PhantomData, } -impl From> for Box { - fn from( - specialized_postings_writer: SpecializedPostingsWriter, - ) -> Box { - Box::new(specialized_postings_writer) - } -} - impl SpecializedPostingsWriter { #[inline] pub(crate) fn serialize_one_term( diff --git a/src/postings/recorder.rs b/src/postings/recorder.rs index 58610c139..76fff8844 100644 --- a/src/postings/recorder.rs +++ b/src/postings/recorder.rs @@ -70,7 +70,7 @@ pub(crate) trait Recorder: Copy + Default + Send + Sync + 'static { fn serialize( &self, arena: &MemoryArena, - serializer: &mut FieldSerializer<'_>, + serializer: &mut FieldSerializer, buffer_lender: &mut BufferLender, ); /// Returns the number of document containing this term. 
@@ -113,7 +113,7 @@ impl Recorder for DocIdRecorder { fn serialize( &self, arena: &MemoryArena, - serializer: &mut FieldSerializer<'_>, + serializer: &mut FieldSerializer, buffer_lender: &mut BufferLender, ) { let buffer = buffer_lender.lend_u8(); @@ -181,7 +181,7 @@ impl Recorder for TermFrequencyRecorder { fn serialize( &self, arena: &MemoryArena, - serializer: &mut FieldSerializer<'_>, + serializer: &mut FieldSerializer, buffer_lender: &mut BufferLender, ) { let buffer = buffer_lender.lend_u8(); @@ -238,7 +238,7 @@ impl Recorder for TfAndPositionRecorder { fn serialize( &self, arena: &MemoryArena, - serializer: &mut FieldSerializer<'_>, + serializer: &mut FieldSerializer, buffer_lender: &mut BufferLender, ) { let (buffer_u8, buffer_positions) = buffer_lender.lend_all(); diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index e8928b90d..c1dba8665 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -1,11 +1,13 @@ -use common::HasLen; +use common::BitSet; +use super::{BlockSegmentPostings, PostingsWithBlockMax}; use crate::docset::DocSet; -use crate::fastfield::AliveBitSet; +use crate::fieldnorm::FieldNormReader; use crate::positions::PositionReader; use crate::postings::compression::COMPRESSION_BLOCK_SIZE; -use crate::postings::{BlockSegmentPostings, Postings}; -use crate::{DocId, TERMINATED}; +use crate::postings::{DocFreq, Postings}; +use crate::query::Bm25Weight; +use crate::{DocId, Score}; /// `SegmentPostings` represents the inverted list or postings associated with /// a term in a `Segment`. @@ -29,31 +31,6 @@ impl SegmentPostings { } } - /// Compute the number of non-deleted documents. - /// - /// This method will clone and scan through the posting lists. - /// (this is a rather expensive operation). 
- pub fn doc_freq_given_deletes(&self, alive_bitset: &AliveBitSet) -> u32 { - let mut docset = self.clone(); - let mut doc_freq = 0; - loop { - let doc = docset.doc(); - if doc == TERMINATED { - return doc_freq; - } - if alive_bitset.is_alive(doc) { - doc_freq += 1u32; - } - docset.advance(); - } - } - - /// Returns the overall number of documents in the block postings. - /// It does not take in account whether documents are deleted or not. - pub fn doc_freq(&self) -> u32 { - self.block_cursor.doc_freq() - } - /// Creates a segment postings object with the given documents /// and no frequency encoded. /// @@ -64,11 +41,13 @@ impl SegmentPostings { /// buffer with the serialized data. #[cfg(test)] pub fn create_from_docs(docs: &[u32]) -> SegmentPostings { - use crate::directory::FileSlice; - use crate::postings::serializer::PostingsSerializer; + use common::OwnedBytes; + use crate::schema::IndexRecordOption; let mut buffer = Vec::new(); { + use crate::postings::serializer::PostingsSerializer; + let mut postings_serializer = PostingsSerializer::new(0.0, IndexRecordOption::Basic, None); postings_serializer.new_term(docs.len() as u32, false); @@ -81,7 +60,7 @@ impl SegmentPostings { } let block_segment_postings = BlockSegmentPostings::open( docs.len() as u32, - FileSlice::from(buffer), + OwnedBytes::new(buffer), IndexRecordOption::Basic, IndexRecordOption::Basic, ) @@ -95,7 +74,8 @@ impl SegmentPostings { doc_and_tfs: &[(u32, u32)], fieldnorms: Option<&[u32]>, ) -> SegmentPostings { - use crate::directory::FileSlice; + use common::OwnedBytes; + use crate::fieldnorm::FieldNormReader; use crate::postings::serializer::PostingsSerializer; use crate::schema::IndexRecordOption; @@ -128,7 +108,7 @@ impl SegmentPostings { .unwrap(); let block_segment_postings = BlockSegmentPostings::open( doc_and_tfs.len() as u32, - FileSlice::from(buffer), + OwnedBytes::new(buffer), IndexRecordOption::WithFreqs, IndexRecordOption::WithFreqs, ) @@ -158,7 +138,6 @@ impl DocSet for 
SegmentPostings { // next needs to be called a first time to point to the correct element. #[inline] fn advance(&mut self) -> DocId { - debug_assert!(self.block_cursor.block_is_loaded()); if self.cur == COMPRESSION_BLOCK_SIZE - 1 { self.cur = 0; self.block_cursor.advance(); @@ -197,13 +176,31 @@ impl DocSet for SegmentPostings { } fn size_hint(&self) -> u32 { - self.len() as u32 + self.doc_freq().into() } -} -impl HasLen for SegmentPostings { - fn len(&self) -> usize { - self.block_cursor.doc_freq() as usize + fn fill_bitset(&mut self, bitset: &mut BitSet) { + let bitset_max_value: DocId = bitset.max_value(); + loop { + let docs = self.block_cursor.docs(); + let Some(&last_doc) = docs.last() else { + break; + }; + if last_doc < bitset_max_value { + // All docs are within the range of the bitset + for &doc in docs { + bitset.insert(doc); + } + } else { + for &doc in docs { + if doc < bitset_max_value { + bitset.insert(doc); + } + } + break; + } + self.block_cursor.advance(); + } } } @@ -229,6 +226,13 @@ impl Postings for SegmentPostings { self.block_cursor.freq(self.cur) } + /// Returns the overall number of documents in the block postings. + /// It does not take in account whether documents are deleted or not. 
+ #[inline(always)] + fn doc_freq(&self) -> DocFreq { + DocFreq::Exact(self.block_cursor.doc_freq()) + } + fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec) { let term_freq = self.term_freq(); let prev_len = output.len(); @@ -252,24 +256,44 @@ impl Postings for SegmentPostings { } } } + + fn has_freq(&self) -> bool { + !self.block_cursor.freqs().is_empty() + } +} + +impl PostingsWithBlockMax for SegmentPostings { + #[inline] + fn seek_block_max( + &mut self, + target_doc: crate::DocId, + fieldnorm_reader: &FieldNormReader, + similarity_weight: &Bm25Weight, + ) -> Score { + self.block_cursor.seek_block_without_loading(target_doc); + self.block_cursor + .block_max_score(fieldnorm_reader, similarity_weight) + } + + #[inline] + fn last_doc_in_block(&self) -> crate::DocId { + self.block_cursor.skip_reader().last_doc_in_block() + } } #[cfg(test)] mod tests { - - use common::HasLen; - use super::SegmentPostings; use crate::docset::{DocSet, TERMINATED}; - use crate::fastfield::AliveBitSet; - use crate::postings::postings::Postings; + use crate::postings::Postings; #[test] fn test_empty_segment_postings() { let mut postings = SegmentPostings::empty(); + assert_eq!(postings.doc(), TERMINATED); assert_eq!(postings.advance(), TERMINATED); assert_eq!(postings.advance(), TERMINATED); - assert_eq!(postings.len(), 0); + assert_eq!(postings.doc_freq(), crate::postings::DocFreq::Exact(0)); } #[test] @@ -284,15 +308,4 @@ mod tests { let postings = SegmentPostings::empty(); assert_eq!(postings.term_freq(), 1); } - - #[test] - fn test_doc_freq() { - let docs = SegmentPostings::create_from_docs(&[0, 2, 10]); - assert_eq!(docs.doc_freq(), 3); - let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[2], 12); - assert_eq!(docs.doc_freq_given_deletes(&alive_bitset), 2); - let all_deleted = - AliveBitSet::for_test_from_deleted_docs(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 12); - assert_eq!(docs.doc_freq_given_deletes(&all_deleted), 0); - } } diff --git 
a/src/postings/serializer.rs b/src/postings/serializer.rs index 726cce03b..722712c33 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -8,7 +8,7 @@ use crate::directory::{CompositeWrite, WritePtr}; use crate::fieldnorm::FieldNormReader; use crate::index::Segment; use crate::positions::PositionSerializer; -use crate::postings::compression::{BlockEncoder, VIntEncoder, COMPRESSION_BLOCK_SIZE}; +use crate::postings::compression::{BlockEncoder, VIntEncoder as _, COMPRESSION_BLOCK_SIZE}; use crate::postings::skip::SkipSerializer; use crate::query::Bm25Weight; use crate::schema::{Field, FieldEntry, IndexRecordOption, Schema}; diff --git a/src/postings/skip.rs b/src/postings/skip.rs index 3900fd40e..e4a3584bd 100644 --- a/src/postings/skip.rs +++ b/src/postings/skip.rs @@ -146,23 +146,6 @@ impl SkipReader { skip_reader } - pub fn reset(&mut self, data: OwnedBytes, doc_freq: u32) { - self.last_doc_in_block = if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 { - 0 - } else { - TERMINATED - }; - self.last_doc_in_previous_block = 0u32; - self.owned_read = data; - self.block_info = BlockInfo::VInt { num_docs: doc_freq }; - self.byte_offset = 0; - self.remaining_docs = doc_freq; - self.position_offset = 0u64; - if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 { - self.read_block_info(); - } - } - // Returns the block max score for this block if available. 
// // The block max score is available for all full bitpacked block, diff --git a/src/query/all_query.rs b/src/query/all_query.rs index 5431a3a1b..aad3fbb2a 100644 --- a/src/query/all_query.rs +++ b/src/query/all_query.rs @@ -2,7 +2,7 @@ use crate::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN, TERMINATED}; use crate::index::SegmentReader; use crate::query::boost_query::BoostScorer; use crate::query::explanation::does_not_match; -use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight}; +use crate::query::{box_scorer, EnableScoring, Explanation, Query, Scorer, Weight}; use crate::{DocId, Score}; /// Query that matches all of the documents. @@ -21,16 +21,16 @@ impl Query for AllQuery { pub struct AllWeight; impl Weight for AllWeight { - fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { + fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result> { let all_scorer = AllScorer::new(reader.max_doc()); if boost != 1.0 { - Ok(Box::new(BoostScorer::new(all_scorer, boost))) + Ok(box_scorer(BoostScorer::new(all_scorer, boost))) } else { - Ok(Box::new(all_scorer)) + Ok(box_scorer(all_scorer)) } } - fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result { + fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result { if doc >= reader.max_doc() { return Err(does_not_match(doc)); } diff --git a/src/query/automaton_weight.rs b/src/query/automaton_weight.rs index 5f1053fb6..a5e9e1529 100644 --- a/src/query/automaton_weight.rs +++ b/src/query/automaton_weight.rs @@ -5,12 +5,14 @@ use common::BitSet; use tantivy_fst::Automaton; use super::phrase_prefix_query::prefix_end; -use crate::index::SegmentReader; +use crate::index::{ + try_downcast_and_call, InvertedIndexReader, SegmentReader, TypedInvertedIndexReaderCb, +}; use crate::postings::TermInfo; use crate::query::{BitSetDocSet, ConstScorer, Explanation, Scorer, Weight}; -use crate::schema::{Field, IndexRecordOption}; +use crate::schema::Field; use 
crate::termdict::{TermDictionary, TermStreamer}; -use crate::{DocId, Score, TantivyError}; +use crate::{DocId, DocSet, Score, TantivyError}; /// A weight struct for Fuzzy Term and Regex Queries pub struct AutomatonWeight { @@ -67,7 +69,7 @@ where } /// Returns the term infos that match the automaton - pub fn get_match_term_infos(&self, reader: &SegmentReader) -> crate::Result> { + pub fn get_match_term_infos(&self, reader: &dyn SegmentReader) -> crate::Result> { let inverted_index = reader.inverted_index(self.field)?; let term_dict = inverted_index.terms(); let mut term_stream = self.automaton_stream(term_dict)?; @@ -84,33 +86,42 @@ where A: Automaton + Send + Sync + 'static, A::State: Clone, { - fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { + fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result> { let max_doc = reader.max_doc(); let mut doc_bitset = BitSet::with_max_value(max_doc); let inverted_index = reader.inverted_index(self.field)?; let term_dict = inverted_index.terms(); let mut term_stream = self.automaton_stream(term_dict)?; - while term_stream.advance() { - let term_info = term_stream.value(); - let mut block_segment_postings = inverted_index - .read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic)?; - loop { - let docs = block_segment_postings.docs(); - if docs.is_empty() { - break; + struct FillBitsetLoop<'a, 'b, A: Automaton> + where A::State: Clone + { + term_stream: &'a mut TermStreamer<'b, &'b A>, + bitset: &'a mut BitSet, + } + impl TypedInvertedIndexReaderCb> for FillBitsetLoop<'_, '_, A> + where A::State: Clone + { + fn call(&mut self, reader: &I) -> io::Result<()> { + while self.term_stream.advance() { + let term_info = self.term_stream.value(); + reader.fill_bitset_from_terminfo(term_info, self.bitset)?; } - for &doc in docs { - doc_bitset.insert(doc); - } - block_segment_postings.advance(); + Ok(()) } } + try_downcast_and_call( + inverted_index.as_ref(), + &mut FillBitsetLoop 
{ + term_stream: &mut term_stream, + bitset: &mut doc_bitset, + }, + )?; let doc_bitset = BitSetDocSet::from(doc_bitset); let const_scorer = ConstScorer::new(doc_bitset, boost); Ok(Box::new(const_scorer)) } - fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result { + fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result { let mut scorer = self.scorer(reader, 1.0)?; if scorer.seek(doc) == doc { Ok(Explanation::new("AutomatonScorer", 1.0)) diff --git a/src/query/bitset/mod.rs b/src/query/bitset/mod.rs index d25034c8e..dee61b7ee 100644 --- a/src/query/bitset/mod.rs +++ b/src/query/bitset/mod.rs @@ -24,6 +24,13 @@ impl BitSetDocSet { self.cursor_bucket = bucket_addr; self.cursor_tinybitset = self.docs.tinyset(bucket_addr); } + + /// Returns the number of documents in the bitset. + /// + /// This call is not free: it will bitcount the number of bits in the bitset. + pub fn doc_freq(&self) -> u32 { + self.docs.len() as u32 + } } impl From for BitSetDocSet { diff --git a/src/query/boolean_query/block_wand.rs b/src/query/boolean_query/block_wand.rs index 6b2f2d6e3..5f239dc9e 100644 --- a/src/query/boolean_query/block_wand.rs +++ b/src/query/boolean_query/block_wand.rs @@ -1,5 +1,6 @@ use std::ops::{Deref, DerefMut}; +use crate::postings::PostingsWithBlockMax; use crate::query::term_query::TermScorer; use crate::query::Scorer; use crate::{DocId, DocSet, Score, TERMINATED}; @@ -13,8 +14,8 @@ use crate::{DocId, DocSet, Score, TERMINATED}; /// We always have `before_pivot_len` < `pivot_len`. /// /// `None` is returned if we establish that no document can exceed the threshold. 
-fn find_pivot_doc( - term_scorers: &[TermScorerWithMaxScore], +fn find_pivot_doc( + term_scorers: &[TermScorerWithMaxScore], threshold: Score, ) -> Option<(usize, usize, DocId)> { let mut max_score = 0.0; @@ -46,8 +47,8 @@ fn find_pivot_doc( /// the next doc candidate defined by the min of `last_doc_in_block + 1` for /// scorer in scorers[..pivot_len] and `scorer.doc()` for scorer in scorers[pivot_len..]. /// Note: before and after calling this method, scorers need to be sorted by their `.doc()`. -fn block_max_was_too_low_advance_one_scorer( - scorers: &mut [TermScorerWithMaxScore], +fn block_max_was_too_low_advance_one_scorer( + scorers: &mut [TermScorerWithMaxScore], pivot_len: usize, ) { debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc()))); @@ -82,7 +83,10 @@ fn block_max_was_too_low_advance_one_scorer( // Given a list of term_scorers and a `ord` and assuming that `term_scorers[ord]` is sorted // except term_scorers[ord] that might be in advance compared to its ranks, // bubble up term_scorers[ord] in order to restore the ordering. -fn restore_ordering(term_scorers: &mut [TermScorerWithMaxScore], ord: usize) { +fn restore_ordering( + term_scorers: &mut [TermScorerWithMaxScore], + ord: usize, +) { let doc = term_scorers[ord].doc(); for i in ord + 1..term_scorers.len() { if term_scorers[i].doc() >= doc { @@ -97,9 +101,10 @@ fn restore_ordering(term_scorers: &mut [TermScorerWithMaxScore], ord: usize) { // If this works, return true. // If this fails (ie: one of the term_scorer does not contain `pivot_doc` and seek goes past the // pivot), reorder the term_scorers to ensure the list is still sorted and returns `false`. -// If a term_scorer reach TERMINATED in the process return false remove the term_scorer and return. -fn align_scorers( - term_scorers: &mut Vec, +// If a term_scorer reach TERMINATED in the process return false remove the term_scorer and +// return. 
+fn align_scorers( + term_scorers: &mut Vec>, pivot_doc: DocId, before_pivot_len: usize, ) -> bool { @@ -126,7 +131,10 @@ fn align_scorers( // Assumes terms_scorers[..pivot_len] are positioned on the same doc (pivot_doc). // Advance term_scorers[..pivot_len] and out of these removes the terminated scores. // Restores the ordering of term_scorers. -fn advance_all_scorers_on_pivot(term_scorers: &mut Vec, pivot_len: usize) { +fn advance_all_scorers_on_pivot( + term_scorers: &mut Vec>, + pivot_len: usize, +) { for term_scorer in &mut term_scorers[..pivot_len] { term_scorer.advance(); } @@ -145,12 +153,12 @@ fn advance_all_scorers_on_pivot(term_scorers: &mut Vec, /// Implements the WAND (Weak AND) algorithm for dynamic pruning /// described in the paper "Faster Top-k Document Retrieval Using Block-Max Indexes". /// Link: -pub fn block_wand( - mut scorers: Vec, +pub fn block_wand( + mut scorers: Vec>, mut threshold: Score, callback: &mut dyn FnMut(u32, Score) -> Score, ) { - let mut scorers: Vec = scorers + let mut scorers: Vec> = scorers .iter_mut() .map(TermScorerWithMaxScore::from) .collect(); @@ -166,10 +174,7 @@ pub fn block_wand( let block_max_score_upperbound: Score = scorers[..pivot_len] .iter_mut() - .map(|scorer| { - scorer.seek_block(pivot_doc); - scorer.block_max_score() - }) + .map(|scorer| scorer.seek_block_max(pivot_doc)) .sum(); // Beware after shallow advance, skip readers can be in advance compared to @@ -220,21 +225,22 @@ pub fn block_wand( /// - On a block, advance until the end and execute `callback` when the doc score is greater or /// equal to the `threshold`. pub fn block_wand_single_scorer( - mut scorer: TermScorer, + mut scorer: TermScorer, mut threshold: Score, callback: &mut dyn FnMut(u32, Score) -> Score, ) { let mut doc = scorer.doc(); + let mut block_max_score = scorer.seek_block_max(doc); loop { // We position the scorer on a block that can reach // the threshold. 
- while scorer.block_max_score() < threshold { + while block_max_score < threshold { let last_doc_in_block = scorer.last_doc_in_block(); if last_doc_in_block == TERMINATED { return; } doc = last_doc_in_block + 1; - scorer.seek_block(doc); + block_max_score = scorer.seek_block_max(doc); } // Seek will effectively load that block. doc = scorer.seek(doc); @@ -256,31 +262,33 @@ pub fn block_wand_single_scorer( } } doc += 1; - scorer.seek_block(doc); + block_max_score = scorer.seek_block_max(doc); } } -struct TermScorerWithMaxScore<'a> { - scorer: &'a mut TermScorer, +struct TermScorerWithMaxScore<'a, TPostings: PostingsWithBlockMax> { + scorer: &'a mut TermScorer, max_score: Score, } -impl<'a> From<&'a mut TermScorer> for TermScorerWithMaxScore<'a> { - fn from(scorer: &'a mut TermScorer) -> Self { +impl<'a, TPostings: PostingsWithBlockMax> From<&'a mut TermScorer> + for TermScorerWithMaxScore<'a, TPostings> +{ + fn from(scorer: &'a mut TermScorer) -> Self { let max_score = scorer.max_score(); TermScorerWithMaxScore { scorer, max_score } } } -impl Deref for TermScorerWithMaxScore<'_> { - type Target = TermScorer; +impl Deref for TermScorerWithMaxScore<'_, TPostings> { + type Target = TermScorer; fn deref(&self) -> &Self::Target { self.scorer } } -impl DerefMut for TermScorerWithMaxScore<'_> { +impl DerefMut for TermScorerWithMaxScore<'_, TPostings> { fn deref_mut(&mut self) -> &mut Self::Target { self.scorer } diff --git a/src/query/boolean_query/boolean_weight.rs b/src/query/boolean_query/boolean_weight.rs index 062449b8a..fdb4be201 100644 --- a/src/query/boolean_query/boolean_weight.rs +++ b/src/query/boolean_query/boolean_weight.rs @@ -2,21 +2,21 @@ use std::collections::HashMap; use crate::docset::COLLECT_BLOCK_BUFFER_LEN; use crate::index::SegmentReader; -use crate::postings::FreqReadingOption; use crate::query::disjunction::Disjunction; use crate::query::explanation::does_not_match; use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner}; use 
crate::query::term_query::TermScorer; -use crate::query::weight::{for_each_docset_buffered, for_each_pruning_scorer, for_each_scorer}; +use crate::query::weight::for_each_docset_buffered; use crate::query::{ - intersect_scorers, AllScorer, BufferedUnionScorer, EmptyScorer, Exclude, Explanation, Occur, - RequiredOptionalScorer, Scorer, Weight, + box_scorer, intersect_scorers, AllScorer, BufferedUnionScorer, EmptyScorer, Exclude, + Explanation, Occur, RequiredOptionalScorer, Scorer, SumCombiner, Weight, }; use crate::{DocId, Score}; -enum SpecializedScorer { - TermUnion(Vec), - Other(Box), +#[derive(Copy, Clone)] +enum SumOrDoNothingCombiner { + Sum, + DoNothing, } fn scorer_disjunction( @@ -32,7 +32,7 @@ where if scorers.len() == 1 { return scorers.into_iter().next().unwrap(); // Safe unwrap. } - Box::new(Disjunction::new( + box_scorer(Disjunction::new( scorers, score_combiner, minimum_match_required, @@ -44,57 +44,60 @@ fn scorer_union( scorers: Vec>, score_combiner_fn: impl Fn() -> TScoreCombiner, num_docs: u32, -) -> SpecializedScorer +) -> Box where TScoreCombiner: ScoreCombiner, { - assert!(!scorers.is_empty()); - if scorers.len() == 1 { - return SpecializedScorer::Other(scorers.into_iter().next().unwrap()); //< we checked the size beforehand - } - - { - let is_all_term_queries = scorers.iter().all(|scorer| scorer.is::()); - if is_all_term_queries { - let scorers: Vec = scorers - .into_iter() - .map(|scorer| *(scorer.downcast::().map_err(|_| ()).unwrap())) - .collect(); - if scorers - .iter() - .all(|scorer| scorer.freq_reading_option() == FreqReadingOption::ReadFreq) + match scorers.len() { + 0 => box_scorer(EmptyScorer), + 1 => scorers.into_iter().next().unwrap(), + _ => { + let combiner_opt: Option = if std::any::TypeId::of::< + TScoreCombiner, + >() == std::any::TypeId::of::< + SumCombiner, + >() { + Some(SumOrDoNothingCombiner::Sum) + } else if std::any::TypeId::of::() + == std::any::TypeId::of::() { - // Block wand is only available if we read 
frequencies. - return SpecializedScorer::TermUnion(scorers); + Some(SumOrDoNothingCombiner::DoNothing) } else { - return SpecializedScorer::Other(Box::new(BufferedUnionScorer::build( - scorers, - score_combiner_fn, - num_docs, - ))); + None + }; + if let Some(combiner) = combiner_opt { + if scorers.iter().all(|scorer| scorer.is::()) { + let scorers: Vec = scorers + .into_iter() + .map(|scorer| { + *scorer.downcast::().ok().expect( + "downcast failed despite the fact we already checked the type", + ) + }) + .collect(); + return match combiner { + SumOrDoNothingCombiner::Sum => box_scorer(BufferedUnionScorer::build( + scorers, + SumCombiner::default, + num_docs, + )), + SumOrDoNothingCombiner::DoNothing => { + box_scorer(BufferedUnionScorer::build( + scorers, + DoNothingCombiner::default, + num_docs, + )) + } + }; + } } + box_scorer(BufferedUnionScorer::build( + scorers, + score_combiner_fn, + num_docs, + )) } } - SpecializedScorer::Other(Box::new(BufferedUnionScorer::build( - scorers, - score_combiner_fn, - num_docs, - ))) -} - -fn into_box_scorer( - scorer: SpecializedScorer, - score_combiner_fn: impl Fn() -> TScoreCombiner, - num_docs: u32, -) -> Box { - match scorer { - SpecializedScorer::TermUnion(term_scorers) => { - let union_scorer = - BufferedUnionScorer::build(term_scorers, score_combiner_fn, num_docs); - Box::new(union_scorer) - } - SpecializedScorer::Other(scorer) => scorer, - } } /// Returns the effective MUST scorer, accounting for removed AllScorers. @@ -110,7 +113,7 @@ fn effective_must_scorer( if must_scorers.is_empty() { if removed_all_scorer_count > 0 { // Had AllScorer(s) only - all docs match - Some(Box::new(AllScorer::new(max_doc))) + Some(box_scorer(AllScorer::new(max_doc))) } else { // No MUST constraint at all None @@ -128,28 +131,26 @@ fn effective_must_scorer( /// When `scoring_enabled` is false, we can just return AllScorer alone since /// we don't need score contributions from the should_scorer. 
fn effective_should_scorer_for_union( - should_scorer: SpecializedScorer, + should_scorer: Box, removed_all_scorer_count: usize, max_doc: DocId, num_docs: u32, score_combiner_fn: impl Fn() -> TScoreCombiner, scoring_enabled: bool, -) -> SpecializedScorer { +) -> Box { if removed_all_scorer_count > 0 { if scoring_enabled { // Need to union to get score contributions from both - let all_scorers: Vec> = vec![ - into_box_scorer(should_scorer, &score_combiner_fn, num_docs), - Box::new(AllScorer::new(max_doc)), - ]; - SpecializedScorer::Other(Box::new(BufferedUnionScorer::build( + let all_scorers: Vec> = + vec![should_scorer, box_scorer(AllScorer::new(max_doc))]; + box_scorer(BufferedUnionScorer::build( all_scorers, score_combiner_fn, num_docs, - ))) + )) } else { // Scoring disabled - AllScorer alone is sufficient - SpecializedScorer::Other(Box::new(AllScorer::new(max_doc))) + box_scorer(AllScorer::new(max_doc)) } } else { should_scorer @@ -160,9 +161,9 @@ enum ShouldScorersCombinationMethod { // Should scorers are irrelevant. Ignored, // Only contributes to final score. - Optional(SpecializedScorer), + Optional(Box), // Regardless of score, the should scorers may impact whether a document is matching or not. - Required(SpecializedScorer), + Required(Box), } /// Weight associated to the `BoolQuery`. 
@@ -205,7 +206,7 @@ impl BooleanWeight { fn per_occur_scorers( &self, - reader: &SegmentReader, + reader: &dyn SegmentReader, boost: Score, ) -> crate::Result>>> { let mut per_occur_scorers: HashMap>> = HashMap::new(); @@ -221,10 +222,10 @@ impl BooleanWeight { fn complex_scorer( &self, - reader: &SegmentReader, + reader: &dyn SegmentReader, boost: Score, score_combiner_fn: impl Fn() -> TComplexScoreCombiner, - ) -> crate::Result { + ) -> crate::Result> { let num_docs = reader.num_docs(); let mut per_occur_scorers = self.per_occur_scorers(reader, boost)?; @@ -234,7 +235,7 @@ impl BooleanWeight { let must_special_scorer_counts = remove_and_count_all_and_empty_scorers(&mut must_scorers); if must_special_scorer_counts.num_empty_scorers > 0 { - return Ok(SpecializedScorer::Other(Box::new(EmptyScorer))); + return Ok(box_scorer(EmptyScorer)); } let mut should_scorers = per_occur_scorers.remove(&Occur::Should).unwrap_or_default(); @@ -249,7 +250,7 @@ impl BooleanWeight { if exclude_special_scorer_counts.num_all_scorers > 0 { // We exclude all documents at one point. - return Ok(SpecializedScorer::Other(Box::new(EmptyScorer))); + return Ok(box_scorer(EmptyScorer)); } let effective_minimum_number_should_match = self @@ -261,7 +262,7 @@ impl BooleanWeight { if effective_minimum_number_should_match > num_of_should_scorers { // We don't have enough scorers to satisfy the minimum number of should matches. // The request will match no documents. 
- return Ok(SpecializedScorer::Other(Box::new(EmptyScorer))); + return Ok(box_scorer(EmptyScorer)); } match effective_minimum_number_should_match { 0 if num_of_should_scorers == 0 => ShouldScorersCombinationMethod::Ignored, @@ -281,12 +282,10 @@ impl BooleanWeight { must_scorers.append(&mut should_scorers); ShouldScorersCombinationMethod::Ignored } - _ => ShouldScorersCombinationMethod::Required(SpecializedScorer::Other( - scorer_disjunction( - should_scorers, - score_combiner_fn(), - effective_minimum_number_should_match, - ), + _ => ShouldScorersCombinationMethod::Required(scorer_disjunction( + should_scorers, + score_combiner_fn(), + effective_minimum_number_should_match, )), } }; @@ -303,8 +302,8 @@ impl BooleanWeight { reader.max_doc(), num_docs, ) - .unwrap_or_else(|| Box::new(EmptyScorer)); - SpecializedScorer::Other(boxed_scorer) + .unwrap_or_else(|| box_scorer(EmptyScorer)); + boxed_scorer } (ShouldScorersCombinationMethod::Optional(should_scorer), must_scorers) => { // Optional SHOULD: contributes to scoring but not required for matching. @@ -329,16 +328,12 @@ impl BooleanWeight { Some(must_scorer) => { // Has MUST constraint: SHOULD only affects scoring. if self.scoring_enabled { - SpecializedScorer::Other(Box::new(RequiredOptionalScorer::< - _, - _, - TScoreCombiner, - >::new( + box_scorer(RequiredOptionalScorer::<_, _, TScoreCombiner>::new( must_scorer, - into_box_scorer(should_scorer, &score_combiner_fn, num_docs), - ))) + should_scorer, + )) } else { - SpecializedScorer::Other(must_scorer) + must_scorer } } } @@ -358,12 +353,7 @@ impl BooleanWeight { } Some(must_scorer) => { // Has MUST constraint: intersect MUST with SHOULD. 
- let should_boxed = - into_box_scorer(should_scorer, &score_combiner_fn, num_docs); - SpecializedScorer::Other(intersect_scorers( - vec![must_scorer, should_boxed], - num_docs, - )) + intersect_scorers(vec![must_scorer, should_scorer], num_docs) } } } @@ -372,19 +362,18 @@ impl BooleanWeight { return Ok(include_scorer); } - let include_scorer_boxed = into_box_scorer(include_scorer, &score_combiner_fn, num_docs); let scorer: Box = if exclude_scorers.len() == 1 { let exclude_scorer = exclude_scorers.pop().unwrap(); match exclude_scorer.downcast::() { // Cast to TermScorer succeeded - Ok(exclude_scorer) => Box::new(Exclude::new(include_scorer_boxed, *exclude_scorer)), + Ok(exclude_scorer) => Box::new(Exclude::new(include_scorer, *exclude_scorer)), // We get back the original Box - Err(exclude_scorer) => Box::new(Exclude::new(include_scorer_boxed, exclude_scorer)), + Err(exclude_scorer) => Box::new(Exclude::new(include_scorer, exclude_scorer)), } } else { - Box::new(Exclude::new(include_scorer_boxed, exclude_scorers)) + Box::new(Exclude::new(include_scorer, exclude_scorers)) }; - Ok(SpecializedScorer::Other(scorer)) + Ok(scorer) } } @@ -413,8 +402,7 @@ fn remove_and_count_all_and_empty_scorers( } impl Weight for BooleanWeight { - fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { - let num_docs = reader.num_docs(); + fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result> { if self.weights.is_empty() { Ok(Box::new(EmptyScorer)) } else if self.weights.len() == 1 { @@ -426,18 +414,12 @@ impl Weight for BooleanWeight crate::Result { + fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result { let mut scorer = self.scorer(reader, 1.0)?; if scorer.seek(doc) != doc { return Err(does_not_match(doc)); @@ -459,47 +441,22 @@ impl Weight for BooleanWeight crate::Result<()> { - let scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn)?; - match scorer { - SpecializedScorer::TermUnion(term_scorers) => { - 
let mut union_scorer = BufferedUnionScorer::build( - term_scorers, - &self.score_combiner_fn, - reader.num_docs(), - ); - for_each_scorer(&mut union_scorer, callback); - } - SpecializedScorer::Other(mut scorer) => { - for_each_scorer(scorer.as_mut(), callback); - } - } + let mut scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn)?; + scorer.for_each(callback); Ok(()) } fn for_each_no_score( &self, - reader: &SegmentReader, + reader: &dyn SegmentReader, callback: &mut dyn FnMut(&[DocId]), ) -> crate::Result<()> { - let scorer = self.complex_scorer(reader, 1.0, || DoNothingCombiner)?; + let mut scorer = self.complex_scorer(reader, 1.0, || DoNothingCombiner)?; let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN]; - - match scorer { - SpecializedScorer::TermUnion(term_scorers) => { - let mut union_scorer = BufferedUnionScorer::build( - term_scorers, - &self.score_combiner_fn, - reader.num_docs(), - ); - for_each_docset_buffered(&mut union_scorer, &mut buffer, callback); - } - SpecializedScorer::Other(mut scorer) => { - for_each_docset_buffered(scorer.as_mut(), &mut buffer, callback); - } - } + for_each_docset_buffered(scorer.as_mut(), &mut buffer, callback); Ok(()) } @@ -516,18 +473,11 @@ impl Weight for BooleanWeight Score, ) -> crate::Result<()> { let scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn)?; - match scorer { - SpecializedScorer::TermUnion(term_scorers) => { - super::block_wand(term_scorers, threshold, callback); - } - SpecializedScorer::Other(mut scorer) => { - for_each_pruning_scorer(scorer.as_mut(), threshold, callback); - } - } + reader.for_each_pruning(threshold, scorer, callback); Ok(()) } } diff --git a/src/query/boolean_query/mod.rs b/src/query/boolean_query/mod.rs index 681881c11..f8d9297bd 100644 --- a/src/query/boolean_query/mod.rs +++ b/src/query/boolean_query/mod.rs @@ -1,8 +1,7 @@ -mod block_wand; +pub(crate) mod block_wand; mod boolean_query; mod boolean_weight; -pub(crate) use self::block_wand::{block_wand, 
block_wand_single_scorer}; pub use self::boolean_query::BooleanQuery; pub use self::boolean_weight::BooleanWeight; @@ -16,8 +15,8 @@ mod tests { use crate::collector::{Count, TopDocs}; use crate::query::term_query::TermScorer; use crate::query::{ - AllScorer, EmptyScorer, EnableScoring, Intersection, Occur, Query, QueryParser, RangeQuery, - RequiredOptionalScorer, Scorer, SumCombiner, TermQuery, + AllScorer, BufferedUnionScorer, EmptyScorer, EnableScoring, Intersection, Occur, Query, + QueryParser, RangeQuery, RequiredOptionalScorer, Scorer, SumCombiner, TermQuery, }; use crate::schema::*; use crate::{assert_nearly_equals, DocAddress, DocId, Index, IndexWriter, Score}; @@ -62,6 +61,19 @@ mod tests { Ok(()) } + #[test] + pub fn test_boolean_termonly_union_specialization() -> crate::Result<()> { + let (index, text_field) = aux_test_helper()?; + let query_parser = QueryParser::for_index(&index, vec![text_field]); + let query = query_parser.parse_query("a b")?; + let searcher = index.reader()?.searcher(); + let weight = query.weight(EnableScoring::enabled_from_searcher(&searcher))?; + let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?; + assert!(scorer.is::>()); + assert_eq!(query.count(&searcher)?, 4); + Ok(()) + } + #[test] pub fn test_boolean_termonly_intersection() -> crate::Result<()> { let (index, text_field) = aux_test_helper()?; diff --git a/src/query/boost_query.rs b/src/query/boost_query.rs index 69847d750..40d8d7bd4 100644 --- a/src/query/boost_query.rs +++ b/src/query/boost_query.rs @@ -67,11 +67,11 @@ impl BoostWeight { } impl Weight for BoostWeight { - fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { + fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result> { self.weight.scorer(reader, boost * self.boost) } - fn explain(&self, reader: &SegmentReader, doc: u32) -> crate::Result { + fn explain(&self, reader: &dyn SegmentReader, doc: u32) -> crate::Result { let underlying_explanation = 
self.weight.explain(reader, doc)?; let score = underlying_explanation.value() * self.boost; let mut explanation = @@ -80,7 +80,7 @@ impl Weight for BoostWeight { Ok(explanation) } - fn count(&self, reader: &SegmentReader) -> crate::Result { + fn count(&self, reader: &dyn SegmentReader) -> crate::Result { self.weight.count(reader) } } diff --git a/src/query/const_score_query.rs b/src/query/const_score_query.rs index d07e6a96f..6ecba4a45 100644 --- a/src/query/const_score_query.rs +++ b/src/query/const_score_query.rs @@ -1,7 +1,7 @@ use std::fmt; use crate::docset::COLLECT_BLOCK_BUFFER_LEN; -use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight}; +use crate::query::{box_scorer, EnableScoring, Explanation, Query, Scorer, Weight}; use crate::{DocId, DocSet, Score, SegmentReader, TantivyError, Term}; /// `ConstScoreQuery` is a wrapper over a query to provide a constant score. @@ -63,12 +63,15 @@ impl ConstWeight { } impl Weight for ConstWeight { - fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { + fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result> { let inner_scorer = self.weight.scorer(reader, boost)?; - Ok(Box::new(ConstScorer::new(inner_scorer, boost * self.score))) + Ok(box_scorer(ConstScorer::new( + inner_scorer, + boost * self.score, + ))) } - fn explain(&self, reader: &SegmentReader, doc: u32) -> crate::Result { + fn explain(&self, reader: &dyn SegmentReader, doc: u32) -> crate::Result { let mut scorer = self.scorer(reader, 1.0)?; if scorer.seek(doc) != doc { return Err(TantivyError::InvalidArgument(format!( @@ -81,7 +84,7 @@ impl Weight for ConstWeight { Ok(explanation) } - fn count(&self, reader: &SegmentReader) -> crate::Result { + fn count(&self, reader: &dyn SegmentReader) -> crate::Result { self.weight.count(reader) } } diff --git a/src/query/empty_query.rs b/src/query/empty_query.rs index 2fa1772bd..7728aa411 100644 --- a/src/query/empty_query.rs +++ b/src/query/empty_query.rs @@ -2,7 +2,7 
@@ use super::Scorer; use crate::docset::TERMINATED; use crate::index::SegmentReader; use crate::query::explanation::does_not_match; -use crate::query::{EnableScoring, Explanation, Query, Weight}; +use crate::query::{box_scorer, EnableScoring, Explanation, Query, Weight}; use crate::{DocId, DocSet, Score, Searcher}; /// `EmptyQuery` is a dummy `Query` in which no document matches. @@ -26,11 +26,11 @@ impl Query for EmptyQuery { /// It is useful for tests and handling edge cases. pub struct EmptyWeight; impl Weight for EmptyWeight { - fn scorer(&self, _reader: &SegmentReader, _boost: Score) -> crate::Result> { - Ok(Box::new(EmptyScorer)) + fn scorer(&self, _reader: &dyn SegmentReader, _boost: Score) -> crate::Result> { + Ok(box_scorer(EmptyScorer)) } - fn explain(&self, _reader: &SegmentReader, doc: DocId) -> crate::Result { + fn explain(&self, _reader: &dyn SegmentReader, doc: DocId) -> crate::Result { Err(does_not_match(doc)) } } diff --git a/src/query/exist_query.rs b/src/query/exist_query.rs index 7eb09722c..3b369df46 100644 --- a/src/query/exist_query.rs +++ b/src/query/exist_query.rs @@ -3,7 +3,7 @@ use core::fmt::Debug; use columnar::{ColumnIndex, DynamicColumn}; use common::BitSet; -use super::{ConstScorer, EmptyScorer}; +use super::{box_scorer, ConstScorer, EmptyScorer}; use crate::docset::{DocSet, TERMINATED}; use crate::index::SegmentReader; use crate::query::all_query::AllScorer; @@ -98,7 +98,7 @@ pub struct ExistsWeight { } impl Weight for ExistsWeight { - fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { + fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result> { let fast_field_reader = reader.fast_fields(); let mut column_handles = fast_field_reader.dynamic_column_handles(&self.field_name)?; if self.field_type == Type::Json && self.json_subpaths { @@ -117,7 +117,7 @@ impl Weight for ExistsWeight { } } if non_empty_columns.is_empty() { - return Ok(Box::new(EmptyScorer)); + return 
Ok(box_scorer(EmptyScorer)); } // If any column is full, all docs match. @@ -128,9 +128,9 @@ impl Weight for ExistsWeight { { let all_scorer = AllScorer::new(max_doc); if boost != 1.0f32 { - return Ok(Box::new(BoostScorer::new(all_scorer, boost))); + return Ok(box_scorer(BoostScorer::new(all_scorer, boost))); } else { - return Ok(Box::new(all_scorer)); + return Ok(box_scorer(all_scorer)); } } @@ -138,7 +138,7 @@ impl Weight for ExistsWeight { // NOTE: A lower number may be better for very sparse columns if non_empty_columns.len() < 4 { let docset = ExistsDocSet::new(non_empty_columns, reader.max_doc()); - return Ok(Box::new(ConstScorer::new(docset, boost))); + return Ok(box_scorer(ConstScorer::new(docset, boost))); } // If we have many dynamic columns, precompute a bitset of matching docs @@ -162,10 +162,10 @@ impl Weight for ExistsWeight { } } let docset = BitSetDocSet::from(doc_bitset); - Ok(Box::new(ConstScorer::new(docset, boost))) + Ok(box_scorer(ConstScorer::new(docset, boost))) } - fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result { + fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result { let mut scorer = self.scorer(reader, 1.0)?; if scorer.seek(doc) != doc { return Err(does_not_match(doc)); diff --git a/src/query/intersection.rs b/src/query/intersection.rs index 64fcf78dd..46547918a 100644 --- a/src/query/intersection.rs +++ b/src/query/intersection.rs @@ -1,7 +1,7 @@ use super::size_hint::estimate_intersection; use crate::docset::{DocSet, SeekDangerResult, TERMINATED}; use crate::query::term_query::TermScorer; -use crate::query::{EmptyScorer, Scorer}; +use crate::query::{box_scorer, EmptyScorer, Scorer}; use crate::{DocId, Score}; /// Returns the intersection scorer. 
@@ -20,7 +20,7 @@ pub fn intersect_scorers( num_docs_segment: u32, ) -> Box { if scorers.is_empty() { - return Box::new(EmptyScorer); + return box_scorer(EmptyScorer); } if scorers.len() == 1 { return scorers.pop().unwrap(); @@ -29,7 +29,7 @@ pub fn intersect_scorers( scorers.sort_by_key(|scorer| scorer.cost()); let doc = go_to_first_doc(&mut scorers[..]); if doc == TERMINATED { - return Box::new(EmptyScorer); + return box_scorer(EmptyScorer); } // We know that we have at least 2 elements. let left = scorers.remove(0); @@ -38,14 +38,14 @@ pub fn intersect_scorers( .iter() .all(|&scorer| scorer.is::()); if all_term_scorers { - return Box::new(Intersection { + return box_scorer(Intersection { left: *(left.downcast::().map_err(|_| ()).unwrap()), right: *(right.downcast::().map_err(|_| ()).unwrap()), others: scorers, num_docs: num_docs_segment, }); } - Box::new(Intersection { + box_scorer(Intersection { left, right, others: scorers, diff --git a/src/query/mod.rs b/src/query/mod.rs index e33768950..a0eb82b75 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -2,7 +2,7 @@ mod all_query; mod automaton_weight; mod bitset; mod bm25; -mod boolean_query; +pub(crate) mod boolean_query; mod boost_query; mod const_score_query; mod disjunction; @@ -24,7 +24,7 @@ mod reqopt_scorer; mod scorer; mod set_query; mod size_hint; -mod term_query; +pub(crate) mod term_query; mod union; mod weight; @@ -53,17 +53,17 @@ pub use self::intersection::{intersect_scorers, Intersection}; pub use self::more_like_this::{MoreLikeThisQuery, MoreLikeThisQueryBuilder}; pub use self::phrase_prefix_query::PhrasePrefixQuery; pub use self::phrase_query::regex_phrase_query::{wildcard_query_to_regex_str, RegexPhraseQuery}; -pub use self::phrase_query::PhraseQuery; +pub use self::phrase_query::{PhraseQuery, PhraseScorer}; pub use self::query::{EnableScoring, Query, QueryClone}; pub use self::query_parser::{QueryParser, QueryParserError}; pub use self::range_query::*; pub use self::regex_query::RegexQuery; 
pub use self::reqopt_scorer::RequiredOptionalScorer; pub use self::score_combiner::{DisjunctionMaxCombiner, ScoreCombiner, SumCombiner}; -pub use self::scorer::Scorer; +pub use self::scorer::{box_scorer, Scorer}; pub use self::set_query::TermSetQuery; -pub use self::term_query::TermQuery; -pub use self::union::BufferedUnionScorer; +pub use self::term_query::{BoxedTermScorer, TermQuery, TermScorer}; +pub use self::union::{BufferedUnionScorer, SimpleUnion}; #[cfg(test)] pub use self::vec_docset::VecDocSet; pub use self::weight::Weight; diff --git a/src/query/more_like_this/more_like_this.rs b/src/query/more_like_this/more_like_this.rs index 550abe5a3..680e0410c 100644 --- a/src/query/more_like_this/more_like_this.rs +++ b/src/query/more_like_this/more_like_this.rs @@ -8,7 +8,7 @@ use crate::query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery}; use crate::schema::document::{Document, Value}; use crate::schema::{Field, FieldType, IndexRecordOption, Term}; use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer}; -use crate::{DocAddress, Result, Searcher, TantivyDocument, TantivyError}; +use crate::{DocAddress, Result, Searcher, TantivyError}; #[derive(Debug, PartialEq)] struct ScoreTerm { @@ -129,7 +129,7 @@ impl MoreLikeThis { searcher: &Searcher, doc_address: DocAddress, ) -> Result> { - let doc = searcher.doc::(doc_address)?; + let doc = searcher.doc(doc_address)?; let field_to_values = doc.get_sorted_field_values(); self.retrieve_terms_from_doc_fields(searcher, &field_to_values) @@ -167,7 +167,7 @@ impl MoreLikeThis { term_frequencies: &mut HashMap, ) -> Result<()> { let schema = searcher.schema(); - let tokenizer_manager = searcher.index().tokenizers(); + let tokenizer_manager = searcher.tokenizers(); let field_entry = schema.get_field_entry(field); if !field_entry.is_indexed() { diff --git a/src/query/phrase_prefix_query/phrase_prefix_scorer.rs b/src/query/phrase_prefix_query/phrase_prefix_scorer.rs index f2df3433d..6499b124c 
100644 --- a/src/query/phrase_prefix_query/phrase_prefix_scorer.rs +++ b/src/query/phrase_prefix_query/phrase_prefix_scorer.rs @@ -2,7 +2,7 @@ use crate::docset::{DocSet, SeekDangerResult, TERMINATED}; use crate::fieldnorm::FieldNormReader; use crate::postings::Postings; use crate::query::bm25::Bm25Weight; -use crate::query::phrase_query::{intersection_count, PhraseScorer}; +use crate::query::phrase_query::{intersection_exists, PhraseScorer}; use crate::query::Scorer; use crate::{DocId, Score}; @@ -100,7 +100,6 @@ pub struct PhrasePrefixScorer { phrase_scorer: PhraseKind, suffixes: Vec, suffix_offset: u32, - phrase_count: u32, suffix_position_buffer: Vec, } @@ -144,7 +143,6 @@ impl PhrasePrefixScorer { phrase_scorer, suffixes, suffix_offset: (max_offset - suffix_pos) as u32, - phrase_count: 0, suffix_position_buffer: Vec::with_capacity(100), }; if phrase_prefix_scorer.doc() != TERMINATED && !phrase_prefix_scorer.matches_prefix() { @@ -153,12 +151,7 @@ impl PhrasePrefixScorer { phrase_prefix_scorer } - pub fn phrase_count(&self) -> u32 { - self.phrase_count - } - fn matches_prefix(&mut self) -> bool { - let mut count = 0; let current_doc = self.doc(); let pos_matching = self.phrase_scorer.get_intersection(); for suffix in &mut self.suffixes { @@ -168,11 +161,12 @@ impl PhrasePrefixScorer { let doc = suffix.seek(current_doc); if doc == current_doc { suffix.positions_with_offset(self.suffix_offset, &mut self.suffix_position_buffer); - count += intersection_count(pos_matching, &self.suffix_position_buffer); + if intersection_exists(pos_matching, &self.suffix_position_buffer) { + return true; + } } } - self.phrase_count = count as u32; - count != 0 + false } } diff --git a/src/query/phrase_prefix_query/phrase_prefix_weight.rs b/src/query/phrase_prefix_query/phrase_prefix_weight.rs index 546eb89e8..2128898d1 100644 --- a/src/query/phrase_prefix_query/phrase_prefix_weight.rs +++ b/src/query/phrase_prefix_query/phrase_prefix_weight.rs @@ -1,12 +1,11 @@ use 
super::{prefix_end, PhrasePrefixScorer}; use crate::fieldnorm::FieldNormReader; use crate::index::SegmentReader; -use crate::postings::SegmentPostings; +use crate::postings::Postings; use crate::query::bm25::Bm25Weight; -use crate::query::explanation::does_not_match; -use crate::query::{EmptyScorer, Explanation, Scorer, Weight}; +use crate::query::{box_scorer, EmptyScorer, Scorer, Weight}; use crate::schema::{IndexRecordOption, Term}; -use crate::{DocId, DocSet, Score}; +use crate::Score; pub struct PhrasePrefixWeight { phrase_terms: Vec<(usize, Term)>, @@ -32,10 +31,10 @@ impl PhrasePrefixWeight { } } - fn fieldnorm_reader(&self, reader: &SegmentReader) -> crate::Result { + fn fieldnorm_reader(&self, reader: &dyn SegmentReader) -> crate::Result { let field = self.phrase_terms[0].1.field(); if self.similarity_weight_opt.is_some() { - if let Some(fieldnorm_reader) = reader.fieldnorms_readers().get_field(field)? { + if let Ok(fieldnorm_reader) = reader.get_fieldnorms_reader(field) { return Ok(fieldnorm_reader); } } @@ -44,15 +43,15 @@ impl PhrasePrefixWeight { pub(crate) fn phrase_scorer( &self, - reader: &SegmentReader, + reader: &dyn SegmentReader, boost: Score, - ) -> crate::Result>> { + ) -> crate::Result>> { let similarity_weight_opt = self .similarity_weight_opt .as_ref() .map(|similarity_weight| similarity_weight.boost_by(boost)); let fieldnorm_reader = self.fieldnorm_reader(reader)?; - let mut term_postings_list = Vec::new(); + let mut term_postings_list: Vec<(usize, Box)> = Vec::new(); for &(offset, ref term) in &self.phrase_terms { if let Some(postings) = reader .inverted_index(term.field())? 
@@ -103,49 +102,32 @@ impl PhrasePrefixWeight { } } - Ok(Some(PhrasePrefixScorer::new( + Ok(Some(box_scorer(PhrasePrefixScorer::new( term_postings_list, similarity_weight_opt, fieldnorm_reader, suffixes, self.prefix.0, - ))) + )))) } } impl Weight for PhrasePrefixWeight { - fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { + fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result> { if let Some(scorer) = self.phrase_scorer(reader, boost)? { - Ok(Box::new(scorer)) + Ok(scorer) } else { - Ok(Box::new(EmptyScorer)) + Ok(box_scorer(EmptyScorer)) } } - - fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result { - let scorer_opt = self.phrase_scorer(reader, 1.0)?; - if scorer_opt.is_none() { - return Err(does_not_match(doc)); - } - let mut scorer = scorer_opt.unwrap(); - if scorer.seek(doc) != doc { - return Err(does_not_match(doc)); - } - let fieldnorm_reader = self.fieldnorm_reader(reader)?; - let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc); - let phrase_count = scorer.phrase_count(); - let mut explanation = Explanation::new("Phrase Prefix Scorer", scorer.score()); - if let Some(similarity_weight) = self.similarity_weight_opt.as_ref() { - explanation.add_detail(similarity_weight.explain(fieldnorm_id, phrase_count)); - } - Ok(explanation) - } } #[cfg(test)] mod tests { use crate::docset::TERMINATED; use crate::index::Index; + use crate::postings::Postings; + use crate::query::phrase_prefix_query::PhrasePrefixScorer; use crate::query::{EnableScoring, PhrasePrefixQuery, Query}; use crate::schema::{Schema, TEXT}; use crate::{DocSet, IndexWriter, Term}; @@ -186,14 +168,14 @@ mod tests { .phrase_prefix_query_weight(enable_scoring) .unwrap() .unwrap(); - let mut phrase_scorer = phrase_weight + let mut phrase_scorer_boxed = phrase_weight .phrase_scorer(searcher.segment_reader(0u32), 1.0)? 
.unwrap(); + let phrase_scorer: &mut PhrasePrefixScorer> = + phrase_scorer_boxed.as_any_mut().downcast_mut().unwrap(); assert_eq!(phrase_scorer.doc(), 1); - assert_eq!(phrase_scorer.phrase_count(), 2); assert_eq!(phrase_scorer.advance(), 2); assert_eq!(phrase_scorer.doc(), 2); - assert_eq!(phrase_scorer.phrase_count(), 1); assert_eq!(phrase_scorer.advance(), TERMINATED); Ok(()) } @@ -213,14 +195,15 @@ mod tests { .phrase_prefix_query_weight(enable_scoring) .unwrap() .unwrap(); - let mut phrase_scorer = phrase_weight + let mut phrase_scorer_boxed = phrase_weight .phrase_scorer(searcher.segment_reader(0u32), 1.0)? .unwrap(); + let phrase_scorer = phrase_scorer_boxed + .downcast_mut::>>() + .unwrap(); assert_eq!(phrase_scorer.doc(), 1); - assert_eq!(phrase_scorer.phrase_count(), 2); assert_eq!(phrase_scorer.advance(), 2); assert_eq!(phrase_scorer.doc(), 2); - assert_eq!(phrase_scorer.phrase_count(), 1); assert_eq!(phrase_scorer.advance(), TERMINATED); Ok(()) } diff --git a/src/query/phrase_query/mod.rs b/src/query/phrase_query/mod.rs index 938e34442..fe53e8887 100644 --- a/src/query/phrase_query/mod.rs +++ b/src/query/phrase_query/mod.rs @@ -5,7 +5,7 @@ pub mod regex_phrase_query; mod regex_phrase_weight; pub use self::phrase_query::PhraseQuery; -pub(crate) use self::phrase_scorer::intersection_count; +pub(crate) use self::phrase_scorer::intersection_exists; pub use self::phrase_scorer::PhraseScorer; pub use self::phrase_weight::PhraseWeight; diff --git a/src/query/phrase_query/phrase_query.rs b/src/query/phrase_query/phrase_query.rs index 11321173c..adc7ce53d 100644 --- a/src/query/phrase_query/phrase_query.rs +++ b/src/query/phrase_query/phrase_query.rs @@ -126,7 +126,7 @@ impl PhraseQuery { }; let mut weight = PhraseWeight::new(self.phrase_terms.clone(), bm25_weight_opt); if self.slop > 0 { - weight.slop(self.slop); + weight.set_slop(self.slop); } Ok(weight) } diff --git a/src/query/phrase_query/phrase_scorer.rs b/src/query/phrase_query/phrase_scorer.rs index 
ff7def917..d1f9f887e 100644 --- a/src/query/phrase_query/phrase_scorer.rs +++ b/src/query/phrase_query/phrase_scorer.rs @@ -2,9 +2,9 @@ use std::cmp::Ordering; use crate::docset::{DocSet, SeekDangerResult, TERMINATED}; use crate::fieldnorm::FieldNormReader; -use crate::postings::Postings; +use crate::postings::{Postings, SegmentPostings as StandardPostings}; use crate::query::bm25::Bm25Weight; -use crate::query::{Intersection, Scorer}; +use crate::query::{Explanation, Intersection, Scorer}; use crate::{DocId, Score}; struct PostingsWithOffset { @@ -43,7 +43,14 @@ impl DocSet for PostingsWithOffset { } } -pub struct PhraseScorer { +/// `PhraseScorer` is a `Scorer` that matches documents that match a phrase query, and scores them +/// based on the number of times the phrase appears in the document and the fieldnorm of the +/// document. +/// +/// It is implemented as an intersection of the postings of each term in the +/// phrase, where the intersection condition is that the positions of the terms are next to each +/// other (or within a certain slop). +pub struct PhraseScorer { intersection_docset: Intersection, PostingsWithOffset>, num_terms: usize, left_positions: Vec, @@ -58,7 +65,7 @@ pub struct PhraseScorer { } /// Returns true if and only if the two sorted arrays contain a common element -fn intersection_exists(left: &[u32], right: &[u32]) -> bool { +pub(crate) fn intersection_exists(left: &[u32], right: &[u32]) -> bool { let mut left_index = 0; let mut right_index = 0; while left_index < left.len() && right_index < right.len() { @@ -79,7 +86,7 @@ fn intersection_exists(left: &[u32], right: &[u32]) -> bool { false } -pub(crate) fn intersection_count(left: &[u32], right: &[u32]) -> usize { +fn intersection_count(left: &[u32], right: &[u32]) -> usize { let mut left_index = 0; let mut right_index = 0; let mut count = 0; @@ -346,6 +353,9 @@ fn intersection_count_with_carrying_slop( impl PhraseScorer { // If similarity_weight is None, then scoring is disabled. 
+ /// Creates a phrase scorer from term postings and phrase matching options. + /// + /// `slop` controls the maximum positional distance allowed between terms. pub fn new( term_postings: Vec<(usize, TPostings)>, similarity_weight_opt: Option, @@ -402,6 +412,7 @@ impl PhraseScorer { scorer } + /// Returns the number of phrases identified in the current matching doc. pub fn phrase_count(&self) -> u32 { self.phrase_count } @@ -584,6 +595,17 @@ impl Scorer for PhraseScorer { 1.0f32 } } + + fn explain(&mut self) -> Explanation { + let doc = self.doc(); + let phrase_count = self.phrase_count(); + let fieldnorm_id = self.fieldnorm_reader.fieldnorm_id(doc); + let mut explanation = Explanation::new("Phrase Scorer", self.score()); + if let Some(similarity_weight) = self.similarity_weight_opt.as_ref() { + explanation.add_detail(similarity_weight.explain(fieldnorm_id, phrase_count)); + } + explanation + } } #[cfg(test)] diff --git a/src/query/phrase_query/phrase_weight.rs b/src/query/phrase_query/phrase_weight.rs index 4118f79f6..cf9326bc9 100644 --- a/src/query/phrase_query/phrase_weight.rs +++ b/src/query/phrase_query/phrase_weight.rs @@ -1,13 +1,43 @@ -use super::PhraseScorer; +use std::io; + use crate::fieldnorm::FieldNormReader; -use crate::index::SegmentReader; -use crate::postings::SegmentPostings; +use crate::index::{ + try_downcast_and_call, InvertedIndexReader, SegmentReader, TypedInvertedIndexReaderCb, +}; +use crate::postings::TermInfo; use crate::query::bm25::Bm25Weight; use crate::query::explanation::does_not_match; -use crate::query::{EmptyScorer, Explanation, Scorer, Weight}; -use crate::schema::{IndexRecordOption, Term}; +use crate::query::{box_scorer, EmptyScorer, Explanation, Scorer, Weight}; +use crate::schema::Term; use crate::{DocId, DocSet, Score}; +struct BuildPhraseScorer<'a> { + term_infos: &'a [(usize, TermInfo)], + similarity_weight_opt: Option, + fieldnorm_reader: FieldNormReader, + slop: u32, +} + +impl TypedInvertedIndexReaderCb>> for 
BuildPhraseScorer<'_> { + fn call(&mut self, reader: &I) -> io::Result> { + let mut offset_and_term_postings = Vec::with_capacity(self.term_infos.len()); + for (offset, term_info) in self.term_infos { + let postings = reader.read_postings_from_terminfo( + term_info, + crate::schema::IndexRecordOption::WithFreqsAndPositions, + )?; + offset_and_term_postings.push((*offset, postings)); + } + let scorer = super::PhraseScorer::new( + offset_and_term_postings, + self.similarity_weight_opt.clone(), + self.fieldnorm_reader.clone(), + self.slop, + ); + Ok(box_scorer(scorer)) + } +} + pub struct PhraseWeight { phrase_terms: Vec<(usize, Term)>, similarity_weight_opt: Option, @@ -21,18 +51,17 @@ impl PhraseWeight { phrase_terms: Vec<(usize, Term)>, similarity_weight_opt: Option, ) -> PhraseWeight { - let slop = 0; PhraseWeight { phrase_terms, similarity_weight_opt, - slop, + slop: 0, } } - fn fieldnorm_reader(&self, reader: &SegmentReader) -> crate::Result { + fn fieldnorm_reader(&self, reader: &dyn SegmentReader) -> crate::Result { let field = self.phrase_terms[0].1.field(); if self.similarity_weight_opt.is_some() { - if let Some(fieldnorm_reader) = reader.fieldnorms_readers().get_field(field)? { + if let Ok(fieldnorm_reader) = reader.get_fieldnorms_reader(field) { return Ok(fieldnorm_reader); } } @@ -41,48 +70,69 @@ impl PhraseWeight { pub(crate) fn phrase_scorer( &self, - reader: &SegmentReader, + reader: &dyn SegmentReader, boost: Score, - ) -> crate::Result>> { + ) -> crate::Result>> { let similarity_weight_opt = self .similarity_weight_opt .as_ref() .map(|similarity_weight| similarity_weight.boost_by(boost)); let fieldnorm_reader = self.fieldnorm_reader(reader)?; - let mut term_postings_list = Vec::new(); - for &(offset, ref term) in &self.phrase_terms { - if let Some(postings) = reader - .inverted_index(term.field())? - .read_postings(term, IndexRecordOption::WithFreqsAndPositions)? 
- { - term_postings_list.push((offset, postings)); - } else { - return Ok(None); - } + + if self.phrase_terms.is_empty() { + return Ok(None); } - Ok(Some(PhraseScorer::new( - term_postings_list, + let field = self.phrase_terms[0].1.field(); + + if !self + .phrase_terms + .iter() + .all(|(_offset, term)| term.field() == field) + { + return Err(crate::TantivyError::InvalidArgument( + "All terms in a phrase query must belong to the same field".to_string(), + )); + } + + let inverted_index_reader = reader.inverted_index(field)?; + + let mut term_infos: Vec<(usize, TermInfo)> = Vec::with_capacity(self.phrase_terms.len()); + + for &(offset, ref term) in &self.phrase_terms { + let Some(term_info) = inverted_index_reader.get_term_info(term)? else { + return Ok(None); + }; + term_infos.push((offset, term_info)); + } + + let mut phrase_scorer_builder = BuildPhraseScorer { + term_infos: &term_infos, similarity_weight_opt, fieldnorm_reader, - self.slop, - ))) + slop: self.slop, + }; + let scorer = + try_downcast_and_call(inverted_index_reader.as_ref(), &mut phrase_scorer_builder)?; + + Ok(Some(scorer)) } - pub fn slop(&mut self, slop: u32) { + /// Sets the slop for the given PhraseWeight. + pub fn set_slop(&mut self, slop: u32) { self.slop = slop; } } impl Weight for PhraseWeight { - fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { + fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result> { if let Some(scorer) = self.phrase_scorer(reader, boost)? 
{ - Ok(Box::new(scorer)) + Ok(scorer) } else { - Ok(Box::new(EmptyScorer)) + Ok(box_scorer(EmptyScorer)) } } - fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result { + fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result { let scorer_opt = self.phrase_scorer(reader, 1.0)?; if scorer_opt.is_none() { return Err(does_not_match(doc)); @@ -91,14 +141,7 @@ impl Weight for PhraseWeight { if scorer.seek(doc) != doc { return Err(does_not_match(doc)); } - let fieldnorm_reader = self.fieldnorm_reader(reader)?; - let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc); - let phrase_count = scorer.phrase_count(); - let mut explanation = Explanation::new("Phrase Scorer", scorer.score()); - if let Some(similarity_weight) = self.similarity_weight_opt.as_ref() { - explanation.add_detail(similarity_weight.explain(fieldnorm_id, phrase_count)); - } - Ok(explanation) + Ok(scorer.explain()) } } @@ -106,7 +149,8 @@ impl Weight for PhraseWeight { mod tests { use super::super::tests::create_index; use crate::docset::TERMINATED; - use crate::query::{EnableScoring, PhraseQuery}; + use crate::query::phrase_query::PhraseScorer; + use crate::query::{EnableScoring, PhraseQuery, Scorer}; use crate::{DocSet, Term}; #[test] @@ -121,9 +165,11 @@ mod tests { ]); let enable_scoring = EnableScoring::enabled_from_searcher(&searcher); let phrase_weight = phrase_query.phrase_weight(enable_scoring).unwrap(); - let mut phrase_scorer = phrase_weight + let phrase_scorer_boxed: Box = phrase_weight .phrase_scorer(searcher.segment_reader(0u32), 1.0)? 
.unwrap(); + let mut phrase_scorer: Box = + phrase_scorer_boxed.downcast::().ok().unwrap(); assert_eq!(phrase_scorer.doc(), 1); assert_eq!(phrase_scorer.phrase_count(), 2); assert_eq!(phrase_scorer.advance(), 2); diff --git a/src/query/phrase_query/regex_phrase_weight.rs b/src/query/phrase_query/regex_phrase_weight.rs index 9cefc555a..9facb22a5 100644 --- a/src/query/phrase_query/regex_phrase_weight.rs +++ b/src/query/phrase_query/regex_phrase_weight.rs @@ -5,14 +5,16 @@ use tantivy_fst::Regex; use super::PhraseScorer; use crate::fieldnorm::FieldNormReader; -use crate::index::SegmentReader; -use crate::postings::{LoadedPostings, Postings, SegmentPostings, TermInfo}; +use crate::index::{InvertedIndexReader, SegmentReader}; +use crate::postings::{LoadedPostings, Postings, TermInfo}; use crate::query::bm25::Bm25Weight; use crate::query::explanation::does_not_match; use crate::query::union::{BitSetPostingUnion, SimpleUnion}; -use crate::query::{AutomatonWeight, BitSetDocSet, EmptyScorer, Explanation, Scorer, Weight}; +use crate::query::{ + box_scorer, AutomatonWeight, BitSetDocSet, EmptyScorer, Explanation, Scorer, Weight, +}; use crate::schema::{Field, IndexRecordOption}; -use crate::{DocId, DocSet, InvertedIndexReader, Score}; +use crate::{DocId, DocSet, DynInvertedIndexReader, Score}; type UnionType = SimpleUnion>; @@ -45,9 +47,9 @@ impl RegexPhraseWeight { } } - fn fieldnorm_reader(&self, reader: &SegmentReader) -> crate::Result { + fn fieldnorm_reader(&self, reader: &dyn SegmentReader) -> crate::Result { if self.similarity_weight_opt.is_some() { - if let Some(fieldnorm_reader) = reader.fieldnorms_readers().get_field(self.field)? 
{ + if let Ok(fieldnorm_reader) = reader.get_fieldnorms_reader(self.field) { return Ok(fieldnorm_reader); } } @@ -56,7 +58,7 @@ impl RegexPhraseWeight { pub(crate) fn phrase_scorer( &self, - reader: &SegmentReader, + reader: &dyn SegmentReader, boost: Score, ) -> crate::Result>> { let similarity_weight_opt = self @@ -84,7 +86,8 @@ impl RegexPhraseWeight { "Phrase query exceeded max expansions {num_terms}" ))); } - let union = Self::get_union_from_term_infos(&term_infos, reader, &inverted_index)?; + let union = + Self::get_union_from_term_infos(&term_infos, reader, inverted_index.as_ref())?; posting_lists.push((offset, union)); } @@ -99,22 +102,11 @@ impl RegexPhraseWeight { /// Add all docs of the term to the docset fn add_to_bitset( - inverted_index: &InvertedIndexReader, + inverted_index: &(impl InvertedIndexReader + ?Sized), term_info: &TermInfo, doc_bitset: &mut BitSet, ) -> crate::Result<()> { - let mut block_segment_postings = inverted_index - .read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic)?; - loop { - let docs = block_segment_postings.docs(); - if docs.is_empty() { - break; - } - for &doc in docs { - doc_bitset.insert(doc); - } - block_segment_postings.advance(); - } + inverted_index.fill_bitset_from_terminfo(term_info, doc_bitset)?; Ok(()) } @@ -174,8 +166,8 @@ impl RegexPhraseWeight { /// Use Roaring Bitmaps for sparse terms. The full bitvec is main memory consumer currently. 
pub(crate) fn get_union_from_term_infos( term_infos: &[TermInfo], - reader: &SegmentReader, - inverted_index: &InvertedIndexReader, + reader: &dyn SegmentReader, + inverted_index: &dyn DynInvertedIndexReader, ) -> crate::Result { let max_doc = reader.max_doc(); @@ -188,16 +180,19 @@ impl RegexPhraseWeight { // - Bucket 1: Terms appearing in 0.1% to 1% of documents // - Bucket 2: Terms appearing in 1% to 10% of documents // - Bucket 3: Terms appearing in more than 10% of documents - let mut buckets: Vec<(BitSet, Vec)> = (0..4) + let mut buckets: Vec<(BitSet, Vec>)> = (0..4) .map(|_| (BitSet::with_max_value(max_doc), Vec::new())) .collect(); const SPARSE_TERM_DOC_THRESHOLD: u32 = 100; for term_info in term_infos { - let mut term_posting = inverted_index - .read_postings_from_terminfo(term_info, IndexRecordOption::WithFreqsAndPositions)?; - let num_docs = term_posting.doc_freq(); + let mut term_posting = crate::index::load_postings_from_terminfo( + inverted_index, + term_info, + IndexRecordOption::WithFreqsAndPositions, + )?; + let num_docs = u32::from(term_posting.doc_freq()); if num_docs < SPARSE_TERM_DOC_THRESHOLD { let current_bucket = &mut sparse_buckets[0]; @@ -269,15 +264,15 @@ impl RegexPhraseWeight { } impl Weight for RegexPhraseWeight { - fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { + fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result> { if let Some(scorer) = self.phrase_scorer(reader, boost)? 
{ - Ok(Box::new(scorer)) + Ok(box_scorer(scorer)) } else { - Ok(Box::new(EmptyScorer)) + Ok(box_scorer(EmptyScorer)) } } - fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result { + fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result { let scorer_opt = self.phrase_scorer(reader, 1.0)?; if scorer_opt.is_none() { return Err(does_not_match(doc)); diff --git a/src/query/query.rs b/src/query/query.rs index 32f74536f..476887d24 100644 --- a/src/query/query.rs +++ b/src/query/query.rs @@ -146,7 +146,7 @@ pub trait Query: QueryClone + Send + Sync + downcast_rs::Downcast + fmt::Debug { let weight = self.weight(EnableScoring::disabled_from_searcher(searcher))?; let mut result = 0; for reader in searcher.segment_readers() { - result += weight.count(reader)? as usize; + result += weight.count(reader.as_ref())? as usize; } Ok(result) } diff --git a/src/query/range_query/range_query.rs b/src/query/range_query/range_query.rs index a597c8dca..ffdec13d8 100644 --- a/src/query/range_query/range_query.rs +++ b/src/query/range_query/range_query.rs @@ -5,13 +5,15 @@ use common::bounds::{map_bound, BoundsRange}; use common::BitSet; use super::range_query_fastfield::FastFieldRangeWeight; -use crate::index::SegmentReader; +use crate::index::{InvertedIndexReader as _, SegmentReader}; use crate::query::explanation::does_not_match; use crate::query::range_query::is_type_valid_for_fastfield_range_query; -use crate::query::{BitSetDocSet, ConstScorer, EnableScoring, Explanation, Query, Scorer, Weight}; -use crate::schema::{Field, IndexRecordOption, Term, Type}; +use crate::query::{ + box_scorer, BitSetDocSet, ConstScorer, EnableScoring, Explanation, Query, Scorer, Weight, +}; +use crate::schema::{Field, Term, Type}; use crate::termdict::{TermDictionary, TermStreamer}; -use crate::{DocId, Score}; +use crate::{DocId, DocSet, Score}; /// `RangeQuery` matches all documents that have at least one term within a defined range. 
/// @@ -212,7 +214,7 @@ impl InvertedIndexRangeWeight { } impl Weight for InvertedIndexRangeWeight { - fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { + fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result> { let max_doc = reader.max_doc(); let mut doc_bitset = BitSet::with_max_value(max_doc); @@ -228,24 +230,13 @@ impl Weight for InvertedIndexRangeWeight { } processed_count += 1; let term_info = term_range.value(); - let mut block_segment_postings = inverted_index - .read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic)?; - loop { - let docs = block_segment_postings.docs(); - if docs.is_empty() { - break; - } - for &doc in block_segment_postings.docs() { - doc_bitset.insert(doc); - } - block_segment_postings.advance(); - } + inverted_index.fill_bitset_from_terminfo(term_info, &mut doc_bitset)?; } let doc_bitset = BitSetDocSet::from(doc_bitset); - Ok(Box::new(ConstScorer::new(doc_bitset, boost))) + Ok(box_scorer(ConstScorer::new(doc_bitset, boost))) } - fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result { + fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result { let mut scorer = self.scorer(reader, 1.0)?; if scorer.seek(doc) != doc { return Err(does_not_match(doc)); @@ -686,7 +677,7 @@ mod tests { .weight(EnableScoring::disabled_from_schema(&schema)) .unwrap(); let range_scorer = range_weight - .scorer(&searcher.segment_readers()[0], 1.0f32) + .scorer(searcher.segment_readers()[0].as_ref(), 1.0f32) .unwrap(); range_scorer }; diff --git a/src/query/range_query/range_query_fastfield.rs b/src/query/range_query/range_query_fastfield.rs index 5ac715277..62d744a45 100644 --- a/src/query/range_query/range_query_fastfield.rs +++ b/src/query/range_query/range_query_fastfield.rs @@ -13,7 +13,8 @@ use common::bounds::{BoundsRange, TransformBound}; use super::fast_field_range_doc_set::RangeDocSet; use crate::query::{ - AllScorer, ConstScorer, EmptyScorer, EnableScoring, 
Explanation, Query, Scorer, Weight, + box_scorer, AllScorer, ConstScorer, EmptyScorer, EnableScoring, Explanation, Query, Scorer, + Weight, }; use crate::schema::{Type, ValueBytes}; use crate::{DocId, DocSet, Score, SegmentReader, TantivyError, Term}; @@ -52,10 +53,10 @@ impl FastFieldRangeWeight { } impl Weight for FastFieldRangeWeight { - fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { + fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result> { // Check if both bounds are Bound::Unbounded if self.bounds.is_unbounded() { - return Ok(Box::new(AllScorer::new(reader.max_doc()))); + return Ok(box_scorer(AllScorer::new(reader.max_doc()))); } let term = self @@ -95,7 +96,7 @@ impl Weight for FastFieldRangeWeight { let Some(str_dict_column): Option = reader.fast_fields().str(&field_name)? else { - return Ok(Box::new(EmptyScorer)); + return Ok(box_scorer(EmptyScorer)); }; let dict = str_dict_column.dictionary(); @@ -107,7 +108,7 @@ impl Weight for FastFieldRangeWeight { let Some((column, _col_type)) = fast_field_reader .u64_lenient_for_type(Some(&[ColumnType::Str]), &field_name)? else { - return Ok(Box::new(EmptyScorer)); + return Ok(box_scorer(EmptyScorer)); }; search_on_u64_ff(column, boost, BoundsRange::new(lower_bound, upper_bound)) } @@ -119,7 +120,7 @@ impl Weight for FastFieldRangeWeight { let Some((column, _col_type)) = fast_field_reader .u64_lenient_for_type(Some(&[ColumnType::DateTime]), &field_name)? else { - return Ok(Box::new(EmptyScorer)); + return Ok(box_scorer(EmptyScorer)); }; let bounds = bounds.map_bound(|term| term.as_date().unwrap().to_u64()); search_on_u64_ff( @@ -146,7 +147,7 @@ impl Weight for FastFieldRangeWeight { let Some(ip_addr_column): Option> = reader.fast_fields().column_opt(&field_name)? 
else { - return Ok(Box::new(EmptyScorer)); + return Ok(box_scorer(EmptyScorer)); }; let value_range = bound_range_inclusive_ip( &bounds.lower_bound, @@ -155,11 +156,11 @@ impl Weight for FastFieldRangeWeight { ip_addr_column.max_value(), ); let docset = RangeDocSet::new(value_range, ip_addr_column); - Ok(Box::new(ConstScorer::new(docset, boost))) + Ok(box_scorer(ConstScorer::new(docset, boost))) } else if field_type.is_str() { let Some(str_dict_column): Option = reader.fast_fields().str(&field_name)? else { - return Ok(Box::new(EmptyScorer)); + return Ok(box_scorer(EmptyScorer)); }; let dict = str_dict_column.dictionary(); @@ -171,7 +172,7 @@ impl Weight for FastFieldRangeWeight { let Some((column, _col_type)) = fast_field_reader.u64_lenient_for_type(None, &field_name)? else { - return Ok(Box::new(EmptyScorer)); + return Ok(box_scorer(EmptyScorer)); }; search_on_u64_ff(column, boost, BoundsRange::new(lower_bound, upper_bound)) } else if field_type.is_bytes() { @@ -228,7 +229,7 @@ impl Weight for FastFieldRangeWeight { &field_name, )? else { - return Ok(Box::new(EmptyScorer)); + return Ok(box_scorer(EmptyScorer)); }; search_on_u64_ff( column, @@ -238,7 +239,7 @@ impl Weight for FastFieldRangeWeight { } } - fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result { + fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result { let mut scorer = self.scorer(reader, 1.0)?; if scorer.seek(doc) != doc { return Err(TantivyError::InvalidArgument(format!( @@ -255,7 +256,7 @@ impl Weight for FastFieldRangeWeight { /// /// Convert into fast field value space and search. fn search_on_json_numerical_field( - reader: &SegmentReader, + reader: &dyn SegmentReader, field_name: &str, typ: Type, bounds: BoundsRange>>, @@ -269,7 +270,7 @@ fn search_on_json_numerical_field( let Some((column, col_type)) = fast_field_reader.u64_lenient_for_type(allowed_column_types, field_name)? 
else { - return Ok(Box::new(EmptyScorer)); + return Ok(box_scorer(EmptyScorer)); }; let actual_column_type: NumericalType = col_type .numerical_type() @@ -427,18 +428,18 @@ fn search_on_u64_ff( ) .unwrap_or(1..=0); // empty range if value_range.is_empty() { - return Ok(Box::new(EmptyScorer)); + return Ok(box_scorer(EmptyScorer)); } if col_min_value >= *value_range.start() && col_max_value <= *value_range.end() { // all values in the column are within the range. if column.index.get_cardinality() == Cardinality::Full { if boost != 1.0f32 { - return Ok(Box::new(ConstScorer::new( + return Ok(box_scorer(ConstScorer::new( AllScorer::new(column.num_docs()), boost, ))); } else { - return Ok(Box::new(AllScorer::new(column.num_docs()))); + return Ok(box_scorer(AllScorer::new(column.num_docs()))); } } else { // TODO Make it a field presence request for that specific column @@ -446,7 +447,7 @@ fn search_on_u64_ff( } let docset = RangeDocSet::new(value_range, column); - Ok(Box::new(ConstScorer::new(docset, boost))) + Ok(box_scorer(ConstScorer::new(docset, boost))) } /// Returns true if the type maps to a u64 fast field diff --git a/src/query/scorer.rs b/src/query/scorer.rs index e91fc2fbc..b4fcdfa47 100644 --- a/src/query/scorer.rs +++ b/src/query/scorer.rs @@ -1,9 +1,11 @@ +use std::mem::{transmute_copy, ManuallyDrop}; use std::ops::DerefMut; use downcast_rs::impl_downcast; use crate::docset::DocSet; -use crate::Score; +use crate::query::Explanation; +use crate::{DocId, Score, TERMINATED}; /// Scored set of documents matching a query within a specific segment. /// @@ -13,6 +15,53 @@ pub trait Scorer: downcast_rs::Downcast + DocSet + 'static { /// /// This method will perform a bit of computation and is not cached. fn score(&mut self) -> Score; + + /// Calls `callback` with all of the `(doc, score)` for which score + /// is exceeding a given threshold. + /// + /// This method is useful for the TopDocs collector. 
+ /// For all docsets, the blanket implementation has the benefit + /// of prefiltering (doc, score) pairs, avoiding the + /// virtual dispatch cost. + /// + /// More importantly, it makes it possible for scorers to implement + /// important optimization (e.g. BlockWAND for union). + fn for_each_pruning( + &mut self, + threshold: Score, + callback: &mut dyn FnMut(DocId, Score) -> Score, + ) { + for_each_pruning_scorer_default_impl(self, threshold, callback); + } + + /// Calls `callback` with all of the `(doc, score)` in the scorer. + fn for_each(&mut self, callback: &mut dyn FnMut(DocId, Score)) { + let mut doc = self.doc(); + while doc != TERMINATED { + callback(doc, self.score()); + doc = self.advance(); + } + } + + /// Returns an explanation for the score of the current document. + fn explain(&mut self) -> Explanation { + let score = self.score(); + let name = std::any::type_name_of_val(self); + Explanation::new(name, score) + } +} + +/// Boxes a scorer. Prefer this to Box::new as it avoids double boxing +/// when TScorer is already a Box. +pub fn box_scorer(scorer: TScorer) -> Box { + if std::any::TypeId::of::() == std::any::TypeId::of::>() { + unsafe { + let forget_me = ManuallyDrop::new(scorer); + transmute_copy::>(&forget_me) + } + } else { + Box::new(scorer) + } } impl_downcast!(Scorer); @@ -22,4 +71,41 @@ impl Scorer for Box { fn score(&mut self) -> Score { self.deref_mut().score() } + + fn for_each_pruning( + &mut self, + threshold: Score, + callback: &mut dyn FnMut(DocId, Score) -> Score, + ) { + self.deref_mut().for_each_pruning(threshold, callback); + } + + fn for_each(&mut self, callback: &mut dyn FnMut(DocId, Score)) { + self.deref_mut().for_each(callback); + } +} + +/// Calls `callback` with all of the `(doc, score)` for which score +/// is exceeding a given threshold. +/// +/// This method is useful for the [`TopDocs`](crate::collector::TopDocs) collector. 
+/// For all docsets, the blanket implementation has the benefit +/// of prefiltering (doc, score) pairs, avoiding the +/// virtual dispatch cost. +/// +/// More importantly, it makes it possible for scorers to implement +/// important optimization (e.g. BlockWAND for union). +pub(crate) fn for_each_pruning_scorer_default_impl( + scorer: &mut TScorer, + mut threshold: Score, + callback: &mut dyn FnMut(DocId, Score) -> Score, +) { + let mut doc = scorer.doc(); + while doc != TERMINATED { + let score = scorer.score(); + if score > threshold { + threshold = callback(doc, score); + } + doc = scorer.advance(); + } } diff --git a/src/query/term_query/mod.rs b/src/query/term_query/mod.rs index 0811725be..0f9978b62 100644 --- a/src/query/term_query/mod.rs +++ b/src/query/term_query/mod.rs @@ -3,10 +3,10 @@ mod term_scorer; mod term_weight; pub use self::term_query::TermQuery; -pub use self::term_scorer::TermScorer; +pub use self::term_scorer::{BoxedTermScorer, TermScorer}; + #[cfg(test)] mod tests { - use crate::collector::TopDocs; use crate::docset::DocSet; use crate::postings::compression::COMPRESSION_BLOCK_SIZE; diff --git a/src/query/term_query/term_scorer.rs b/src/query/term_query/term_scorer.rs index a75648348..8da548662 100644 --- a/src/query/term_query/term_scorer.rs +++ b/src/query/term_query/term_scorer.rs @@ -1,23 +1,44 @@ use crate::docset::DocSet; use crate::fieldnorm::FieldNormReader; -use crate::postings::{FreqReadingOption, Postings, SegmentPostings}; +use crate::postings::{Postings, PostingsWithBlockMax, SegmentPostings}; use crate::query::bm25::Bm25Weight; -use crate::query::{Explanation, Scorer}; +use crate::query::{box_scorer, Explanation, Scorer}; use crate::{DocId, Score}; +/// Type-erased term scorer guaranteed to wrap a Tantivy [`TermScorer`]. +pub struct BoxedTermScorer(Box); + +impl BoxedTermScorer { + /// Creates a boxed term scorer from a concrete Tantivy [`TermScorer`]. 
+ pub fn new(term_scorer: TermScorer) -> BoxedTermScorer { + BoxedTermScorer(box_scorer(term_scorer)) + } + + /// Converts this boxed term scorer into a generic boxed scorer. + pub fn into_boxed_scorer(self) -> Box { + self.0 + } +} + #[derive(Clone)] -pub struct TermScorer { - postings: SegmentPostings, +/// Scorer for a single term over a postings list. +/// +/// `TermScorer` combines postings data, fieldnorms, and BM25 term weight to +/// produce per-document scores. +pub struct TermScorer { + postings: TPostings, fieldnorm_reader: FieldNormReader, similarity_weight: Bm25Weight, } -impl TermScorer { +impl TermScorer { + /// Creates a new term scorer from postings, fieldnorm reader, and BM25 + /// term weight. pub fn new( - postings: SegmentPostings, + postings: TPostings, fieldnorm_reader: FieldNormReader, similarity_weight: Bm25Weight, - ) -> TermScorer { + ) -> TermScorer { TermScorer { postings, fieldnorm_reader, @@ -25,10 +46,38 @@ impl TermScorer { } } - pub(crate) fn seek_block(&mut self, target_doc: DocId) { - self.postings.block_cursor.seek_block(target_doc); + /// Returns the term frequency for the current document. + pub fn term_freq(&self) -> u32 { + self.postings.term_freq() } + /// Returns the fieldnorm id for the current document. + pub fn fieldnorm_id(&self) -> u8 { + self.fieldnorm_reader.fieldnorm_id(self.doc()) + } + + /// Returns the maximum score upper bound for this scorer. + pub fn max_score(&self) -> Score { + self.similarity_weight.max_score() + } +} + +impl TermScorer { + pub(crate) fn last_doc_in_block(&self) -> DocId { + self.postings.last_doc_in_block() + } + + /// Advances the term scorer to the block containing target_doc and returns + /// an upperbound for the score all of the documents in the block. + /// (BlockMax). This score is not guaranteed to be the + /// effective maximum score of the block. 
+ pub(crate) fn seek_block_max(&mut self, target_doc: DocId) -> Score { + self.postings + .seek_block_max(target_doc, &self.fieldnorm_reader, &self.similarity_weight) + } +} + +impl TermScorer { #[cfg(test)] pub fn create_for_test( doc_and_tfs: &[(DocId, u32)], @@ -49,55 +98,9 @@ impl TermScorer { let fieldnorm_reader = FieldNormReader::for_test(fieldnorms); TermScorer::new(segment_postings, fieldnorm_reader, similarity_weight) } - - /// See `FreqReadingOption`. - pub(crate) fn freq_reading_option(&self) -> FreqReadingOption { - self.postings.block_cursor.freq_reading_option() - } - - /// Returns the maximum score for the current block. - /// - /// In some rare case, the result may not be exact. In this case a lower value is returned, - /// (and may lead us to return a lesser document). - /// - /// At index time, we store the (fieldnorm_id, term frequency) pair that maximizes the - /// score assuming the average fieldnorm computed on this segment. - /// - /// Though extremely rare, it is theoretically possible that the actual average fieldnorm - /// is different enough from the current segment average fieldnorm that the maximum over a - /// specific is achieved on a different document. - /// - /// (The result is on the other hand guaranteed to be correct if there is only one segment). 
- pub fn block_max_score(&mut self) -> Score { - self.postings - .block_cursor - .block_max_score(&self.fieldnorm_reader, &self.similarity_weight) - } - - pub fn term_freq(&self) -> u32 { - self.postings.term_freq() - } - - pub fn fieldnorm_id(&self) -> u8 { - self.fieldnorm_reader.fieldnorm_id(self.doc()) - } - - pub fn explain(&self) -> Explanation { - let fieldnorm_id = self.fieldnorm_id(); - let term_freq = self.term_freq(); - self.similarity_weight.explain(fieldnorm_id, term_freq) - } - - pub fn max_score(&self) -> Score { - self.similarity_weight.max_score() - } - - pub fn last_doc_in_block(&self) -> DocId { - self.postings.block_cursor.skip_reader().last_doc_in_block() - } } -impl DocSet for TermScorer { +impl DocSet for TermScorer { #[inline] fn advance(&mut self) -> DocId { self.postings.advance() @@ -119,13 +122,19 @@ impl DocSet for TermScorer { } } -impl Scorer for TermScorer { +impl Scorer for TermScorer { #[inline] fn score(&mut self) -> Score { let fieldnorm_id = self.fieldnorm_id(); let term_freq = self.term_freq(); self.similarity_weight.score(fieldnorm_id, term_freq) } + + fn explain(&mut self) -> Explanation { + let fieldnorm_id = self.fieldnorm_id(); + let term_freq = self.term_freq(); + self.similarity_weight.explain(fieldnorm_id, term_freq) + } } #[cfg(test)] @@ -134,7 +143,7 @@ mod tests { use crate::index::SegmentId; use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN; - use crate::merge_policy::NoMergePolicy; + use crate::indexer::NoMergePolicy; use crate::postings::compression::COMPRESSION_BLOCK_SIZE; use crate::query::term_query::TermScorer; use crate::query::{Bm25Weight, EnableScoring, Scorer, TermQuery}; @@ -155,7 +164,7 @@ mod tests { crate::assert_nearly_equals!(max_scorer, 1.3990127); assert_eq!(term_scorer.doc(), 2); assert_eq!(term_scorer.term_freq(), 3); - assert_nearly_equals!(term_scorer.block_max_score(), 1.3676447); + assert_nearly_equals!(term_scorer.seek_block_max(2), 1.3676447); 
assert_nearly_equals!(term_scorer.score(), 1.0892314); assert_eq!(term_scorer.advance(), 3); assert_eq!(term_scorer.doc(), 3); @@ -170,9 +179,9 @@ mod tests { } #[test] - fn test_term_scorer_shallow_advance() -> crate::Result<()> { + fn test_term_scorer_shallow_advance() { let bm25_weight = Bm25Weight::for_one_term(300, 1024, 10.0); - let mut doc_and_tfs = vec![]; + let mut doc_and_tfs = Vec::new(); for i in 0u32..300u32 { let doc = i * 10; doc_and_tfs.push((doc, 1u32 + doc % 3u32)); @@ -180,11 +189,10 @@ mod tests { let fieldnorms: Vec = std::iter::repeat_n(10u32, 3_000).collect(); let mut term_scorer = TermScorer::create_for_test(&doc_and_tfs, &fieldnorms, bm25_weight); assert_eq!(term_scorer.doc(), 0u32); - term_scorer.seek_block(1289); + term_scorer.seek_block_max(1289); assert_eq!(term_scorer.doc(), 0u32); term_scorer.seek(1289); assert_eq!(term_scorer.doc(), 1290); - Ok(()) } proptest! { @@ -218,7 +226,7 @@ mod tests { let docs: Vec = (0..term_doc_freq).map(|doc| doc as DocId).collect(); for block in docs.chunks(COMPRESSION_BLOCK_SIZE) { - let block_max_score: Score = term_scorer.block_max_score(); + let block_max_score: Score = term_scorer.seek_block_max(0); let mut block_max_score_computed: Score = 0.0; for &doc in block { assert_eq!(term_scorer.doc(), doc); @@ -246,25 +254,26 @@ mod tests { let fieldnorms: Vec = std::iter::repeat_n(20u32, 300).collect(); let bm25_weight = Bm25Weight::for_one_term(10, 129, 20.0); let mut docs = TermScorer::create_for_test(&doc_tfs[..], &fieldnorms[..], bm25_weight); - assert_nearly_equals!(docs.block_max_score(), 2.5161593); - docs.seek_block(135); - assert_nearly_equals!(docs.block_max_score(), 3.4597192); - docs.seek_block(256); + assert_nearly_equals!(docs.seek_block_max(0), 2.5161593); + assert_nearly_equals!(docs.seek_block_max(135), 3.4597192); // the block is not loaded yet. 
- assert_nearly_equals!(docs.block_max_score(), 5.2971773); + assert_nearly_equals!(docs.seek_block_max(256), 5.2971773); assert_eq!(256, docs.seek(256)); - assert_nearly_equals!(docs.block_max_score(), 3.9539647); + assert_nearly_equals!(docs.seek_block_max(256), 3.9539647); } - fn test_block_wand_aux(term_query: &TermQuery, searcher: &Searcher) -> crate::Result<()> { - let term_weight = - term_query.specialized_weight(EnableScoring::enabled_from_searcher(searcher))?; + fn test_block_wand_aux(term_query: &TermQuery, searcher: &Searcher) { + let term_weight = term_query + .specialized_weight(EnableScoring::enabled_from_searcher(searcher)) + .unwrap(); for reader in searcher.segment_readers() { let mut block_max_scores = vec![]; let mut block_max_scores_b = vec![]; let mut docs = vec![]; { - let mut term_scorer = term_weight.term_scorer_for_test(reader, 1.0)?.unwrap(); + let mut term_scorer = term_weight + .term_scorer_for_test(reader.as_ref(), 1.0) + .unwrap(); while term_scorer.doc() != TERMINATED { let mut score = term_scorer.score(); docs.push(term_scorer.doc()); @@ -278,10 +287,12 @@ mod tests { } } { - let mut term_scorer = term_weight.term_scorer_for_test(reader, 1.0)?.unwrap(); + let mut term_scorer = term_weight + .term_scorer_for_test(reader.as_ref(), 1.0) + .unwrap(); for d in docs { - term_scorer.seek_block(d); - block_max_scores_b.push(term_scorer.block_max_score()); + let block_max_score = term_scorer.seek_block_max(d); + block_max_scores_b.push(block_max_score); } } for (l, r) in block_max_scores @@ -292,18 +303,18 @@ mod tests { assert_nearly_equals!(l, r); } } - Ok(()) } #[ignore] #[test] - fn test_block_wand_long_test() -> crate::Result<()> { + fn test_block_wand_long_test() { let mut schema_builder = Schema::builder(); let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); - let mut writer: IndexWriter = - index.writer_with_num_threads(3, 3 * 
MEMORY_BUDGET_NUM_BYTES_MIN)?; + let mut writer: IndexWriter = index + .writer_with_num_threads(3, 3 * MEMORY_BUDGET_NUM_BYTES_MIN) + .unwrap(); use rand::Rng; let mut rng = rand::rng(); writer.set_merge_policy(Box::new(NoMergePolicy)); @@ -311,15 +322,15 @@ mod tests { let term_freq = rng.random_range(1..10000); let words: Vec<&str> = std::iter::repeat_n("bbbb", term_freq).collect(); let text = words.join(" "); - writer.add_document(doc!(text_field=>text))?; + writer.add_document(doc!(text_field=>text)).unwrap(); } - writer.commit()?; + writer.commit().unwrap(); let term_query = TermQuery::new( Term::from_field_text(text_field, "bbbb"), IndexRecordOption::WithFreqs, ); let segment_ids: Vec; - let reader = index.reader()?; + let reader = index.reader().unwrap(); { let searcher = reader.searcher(); segment_ids = searcher @@ -327,15 +338,14 @@ mod tests { .iter() .map(|segment| segment.segment_id()) .collect(); - test_block_wand_aux(&term_query, &searcher)?; + test_block_wand_aux(&term_query, &searcher); } writer.merge(&segment_ids[..]).wait().unwrap(); { - reader.reload()?; + reader.reload().unwrap(); let searcher = reader.searcher(); assert_eq!(searcher.segment_readers().len(), 1); - test_block_wand_aux(&term_query, &searcher)?; + test_block_wand_aux(&term_query, &searcher); } - Ok(()) } } diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index 89b527cca..3dccdb7a2 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -1,12 +1,17 @@ -use super::term_scorer::TermScorer; +use std::io; + use crate::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN}; use crate::fieldnorm::FieldNormReader; -use crate::index::SegmentReader; -use crate::postings::SegmentPostings; +use crate::index::{ + try_downcast_and_call, InvertedIndexReader, SegmentReader, TypedInvertedIndexReaderCb, +}; +use crate::postings::TermInfo; use crate::query::bm25::Bm25Weight; use crate::query::explanation::does_not_match; -use 
crate::query::weight::{for_each_docset_buffered, for_each_scorer}; -use crate::query::{AllScorer, AllWeight, EmptyScorer, Explanation, Scorer, Weight}; +use crate::query::weight::for_each_docset_buffered; +use crate::query::{ + box_scorer, AllScorer, AllWeight, BoxedTermScorer, EmptyScorer, Explanation, Scorer, Weight, +}; use crate::schema::IndexRecordOption; use crate::{DocId, Score, TantivyError, Term}; @@ -18,29 +23,58 @@ pub struct TermWeight { } enum TermOrEmptyOrAllScorer { - TermScorer(Box), + TermScorer(BoxedTermScorer), Empty, AllMatch(AllScorer), } +struct BuildTermScorer<'a> { + term_info: &'a TermInfo, + option: IndexRecordOption, + fieldnorm_reader: FieldNormReader, + similarity_weight: Bm25Weight, +} + +impl TypedInvertedIndexReaderCb> for BuildTermScorer<'_> { + fn call(&mut self, reader: &I) -> io::Result { + let postings = reader.read_postings_from_terminfo(self.term_info, self.option)?; + self.build_scorer(postings) + } +} + +impl BuildTermScorer<'_> { + fn build_scorer( + &self, + postings: TPostings, + ) -> io::Result { + let term_scorer = super::TermScorer::new( + postings, + self.fieldnorm_reader.clone(), + self.similarity_weight.clone(), + ); + Ok(BoxedTermScorer::new(term_scorer)) + } +} + impl TermOrEmptyOrAllScorer { pub fn into_boxed_scorer(self) -> Box { match self { - TermOrEmptyOrAllScorer::TermScorer(scorer) => scorer, - TermOrEmptyOrAllScorer::Empty => Box::new(EmptyScorer), - TermOrEmptyOrAllScorer::AllMatch(scorer) => Box::new(scorer), + TermOrEmptyOrAllScorer::TermScorer(scorer) => scorer.into_boxed_scorer(), + TermOrEmptyOrAllScorer::Empty => box_scorer(EmptyScorer), + TermOrEmptyOrAllScorer::AllMatch(scorer) => box_scorer(scorer), } } } impl Weight for TermWeight { - fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { + fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result> { Ok(self.specialized_scorer(reader, boost)?.into_boxed_scorer()) } - fn explain(&self, reader: &SegmentReader, 
doc: DocId) -> crate::Result { + fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result { match self.specialized_scorer(reader, 1.0)? { - TermOrEmptyOrAllScorer::TermScorer(mut term_scorer) => { + TermOrEmptyOrAllScorer::TermScorer(term_scorer) => { + let mut term_scorer = term_scorer.into_boxed_scorer(); if term_scorer.doc() > doc || term_scorer.seek(doc) != doc { return Err(does_not_match(doc)); } @@ -53,7 +87,7 @@ impl Weight for TermWeight { } } - fn count(&self, reader: &SegmentReader) -> crate::Result { + fn count(&self, reader: &dyn SegmentReader) -> crate::Result { if let Some(alive_bitset) = reader.alive_bitset() { Ok(self.scorer(reader, 1.0)?.count(alive_bitset)) } else { @@ -68,16 +102,17 @@ impl Weight for TermWeight { /// `DocSet` and push the scored documents to the collector. fn for_each( &self, - reader: &SegmentReader, + reader: &dyn SegmentReader, callback: &mut dyn FnMut(DocId, Score), ) -> crate::Result<()> { match self.specialized_scorer(reader, 1.0)? { - TermOrEmptyOrAllScorer::TermScorer(mut term_scorer) => { - for_each_scorer(&mut *term_scorer, callback); + TermOrEmptyOrAllScorer::TermScorer(term_scorer) => { + let mut term_scorer = term_scorer.into_boxed_scorer(); + term_scorer.for_each(callback); } TermOrEmptyOrAllScorer::Empty => {} TermOrEmptyOrAllScorer::AllMatch(mut all_scorer) => { - for_each_scorer(&mut all_scorer, callback); + all_scorer.for_each(callback); } } Ok(()) @@ -87,11 +122,12 @@ impl Weight for TermWeight { /// `DocSet` and push the scored documents to the collector. fn for_each_no_score( &self, - reader: &SegmentReader, + reader: &dyn SegmentReader, callback: &mut dyn FnMut(&[DocId]), ) -> crate::Result<()> { match self.specialized_scorer(reader, 1.0)? 
{ - TermOrEmptyOrAllScorer::TermScorer(mut term_scorer) => { + TermOrEmptyOrAllScorer::TermScorer(term_scorer) => { + let mut term_scorer = term_scorer.into_boxed_scorer(); let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN]; for_each_docset_buffered(&mut term_scorer, &mut buffer, callback); } @@ -118,17 +154,13 @@ impl Weight for TermWeight { fn for_each_pruning( &self, threshold: Score, - reader: &SegmentReader, + reader: &dyn SegmentReader, callback: &mut dyn FnMut(DocId, Score) -> Score, ) -> crate::Result<()> { let specialized_scorer = self.specialized_scorer(reader, 1.0)?; match specialized_scorer { TermOrEmptyOrAllScorer::TermScorer(term_scorer) => { - crate::query::boolean_query::block_wand_single_scorer( - *term_scorer, - threshold, - callback, - ); + reader.for_each_pruning(threshold, term_scorer.into_boxed_scorer(), callback); } TermOrEmptyOrAllScorer::Empty => {} TermOrEmptyOrAllScorer::AllMatch(_) => { @@ -166,19 +198,25 @@ impl TermWeight { #[cfg(test)] pub(crate) fn term_scorer_for_test( &self, - reader: &SegmentReader, + reader: &dyn SegmentReader, boost: Score, - ) -> crate::Result> { - let scorer = self.specialized_scorer(reader, boost)?; - Ok(match scorer { - TermOrEmptyOrAllScorer::TermScorer(scorer) => Some(*scorer), + ) -> Option { + let scorer = self.specialized_scorer(reader, boost).unwrap(); + match scorer { + TermOrEmptyOrAllScorer::TermScorer(term_scorer) => { + let term_scorer = term_scorer + .into_boxed_scorer() + .downcast::() + .ok()?; + Some(*term_scorer) + } _ => None, - }) + } } fn specialized_scorer( &self, - reader: &SegmentReader, + reader: &dyn SegmentReader, boost: Score, ) -> crate::Result { let field = self.term.field(); @@ -196,22 +234,25 @@ impl TermWeight { ))); } - let segment_postings: SegmentPostings = - inverted_index.read_postings_from_terminfo(&term_info, self.index_record_option)?; - let fieldnorm_reader = self.fieldnorm_reader(reader)?; let similarity_weight = self.similarity_weight.boost_by(boost); - 
Ok(TermOrEmptyOrAllScorer::TermScorer(Box::new( - TermScorer::new(segment_postings, fieldnorm_reader, similarity_weight), - ))) + let mut term_scorer_builder = BuildTermScorer { + term_info: &term_info, + option: self.index_record_option, + fieldnorm_reader, + similarity_weight, + }; + let term_scorer = try_downcast_and_call(inverted_index.as_ref(), &mut term_scorer_builder)?; + + Ok(TermOrEmptyOrAllScorer::TermScorer(term_scorer)) } - fn fieldnorm_reader(&self, segment_reader: &SegmentReader) -> crate::Result { + fn fieldnorm_reader( + &self, + segment_reader: &dyn SegmentReader, + ) -> crate::Result { if self.scoring_enabled { - if let Some(field_norm_reader) = segment_reader - .fieldnorms_readers() - .get_field(self.term.field())? - { + if let Ok(field_norm_reader) = segment_reader.get_fieldnorms_reader(self.term.field()) { return Ok(field_norm_reader); } } diff --git a/src/query/union/bitset_union.rs b/src/query/union/bitset_union.rs index 8af1703ee..a7a326a98 100644 --- a/src/query/union/bitset_union.rs +++ b/src/query/union/bitset_union.rs @@ -1,7 +1,7 @@ use std::cell::RefCell; use crate::docset::DocSet; -use crate::postings::Postings; +use crate::postings::{DocFreq, Postings}; use crate::query::BitSetDocSet; use crate::DocId; @@ -16,6 +16,9 @@ pub struct BitSetPostingUnion { docsets: RefCell>, /// The already unionized BitSet of the docsets bitset: BitSetDocSet, + /// The total number of documents in the union (regardless of the position we are in the + /// bitset). 
+ doc_freq: u32, } impl BitSetPostingUnion { @@ -23,9 +26,11 @@ impl BitSetPostingUnion { docsets: Vec, bitset: BitSetDocSet, ) -> BitSetPostingUnion { + let doc_freq = bitset.doc_freq(); BitSetPostingUnion { docsets: RefCell::new(docsets), bitset, + doc_freq, } } } @@ -46,6 +51,10 @@ impl Postings for BitSetPostingUnion { term_freq } + fn has_freq(&self) -> bool { + true + } + fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec) { let curr_doc = self.bitset.doc(); let mut docsets = self.docsets.borrow_mut(); @@ -64,6 +73,10 @@ impl Postings for BitSetPostingUnion { output.sort_unstable(); output.dedup(); } + + fn doc_freq(&self) -> DocFreq { + DocFreq::Exact(self.doc_freq) + } } impl DocSet for BitSetPostingUnion { diff --git a/src/query/union/buffered_union.rs b/src/query/union/buffered_union.rs index e4cfe0ba3..b64612e94 100644 --- a/src/query/union/buffered_union.rs +++ b/src/query/union/buffered_union.rs @@ -31,7 +31,7 @@ where P: FnMut(&mut T) -> bool { /// Creates a `DocSet` that iterate through the union of two or more `DocSet`s. pub struct BufferedUnionScorer { /// Active scorers (already filtered of `TERMINATED`). - docsets: Vec, + scorers: Vec, /// Sliding window presence map for upcoming docs. /// /// There are `HORIZON_NUM_TINYBITSETS` buckets, each covering @@ -46,6 +46,8 @@ pub struct BufferedUnionScorer { /// hit the same doc within the buffered window. scores: Box<[TScoreCombiner; HORIZON as usize]>, /// Start doc ID (inclusive) of the current sliding window. + /// None if the window is not loaded yet. This is true for a freshly created + /// BufferedUnionScorer. window_start_doc: DocId, /// Current doc ID of the union. doc: DocId, @@ -81,51 +83,81 @@ fn refill( } impl BufferedUnionScorer { + /// Returns the underlying scorers in the union. + pub fn into_scorers(self) -> Vec { + self.scorers + } + + /// Accessor for the underlying scorers in the union. + pub fn scorers(&self) -> &[TScorer] { + &self.scorers[..] 
+ } + /// num_docs is the number of documents in the segment. pub(crate) fn build( docsets: Vec, score_combiner_fn: impl FnOnce() -> TScoreCombiner, num_docs: u32, ) -> BufferedUnionScorer { - let non_empty_docsets: Vec = docsets + let score_combiner = score_combiner_fn(); + let mut non_empty_docsets: Vec = docsets .into_iter() .filter(|docset| docset.doc() != TERMINATED) .collect(); - let mut union = BufferedUnionScorer { - docsets: non_empty_docsets, - bitsets: Box::new([TinySet::empty(); HORIZON_NUM_TINYBITSETS]), - scores: Box::new([score_combiner_fn(); HORIZON as usize]), - bucket_idx: HORIZON_NUM_TINYBITSETS, - window_start_doc: 0, - doc: 0, - score: 0.0, - num_docs, - }; - if union.refill() { - union.advance(); - } else { - union.doc = TERMINATED; + + let first_doc: DocId = non_empty_docsets + .iter() + .map(|docset| docset.doc()) + .min() + .unwrap_or(TERMINATED); + let mut score_combiner_cloned = score_combiner; + let mut i = 0; + while i < non_empty_docsets.len() { + let should_remove_docset: bool = { + let non_empty_docset = &mut non_empty_docsets[i]; + if non_empty_docset.doc() != first_doc { + false + } else { + score_combiner_cloned.update(non_empty_docset); + non_empty_docsets[i].advance() == TERMINATED + } + }; + if should_remove_docset { + non_empty_docsets.swap_remove(i); + } else { + i += 1; + } + } + let first_score: Score = score_combiner_cloned.score(); + BufferedUnionScorer { + scorers: non_empty_docsets, + bitsets: Box::new([TinySet::empty(); HORIZON_NUM_TINYBITSETS]), + scores: Box::new([score_combiner; HORIZON as usize]), + bucket_idx: HORIZON_NUM_TINYBITSETS, + // That way we will be detected as outside the window, + window_start_doc: u32::MAX - HORIZON, + doc: first_doc, + score: first_score, + num_docs, } - union } fn refill(&mut self) -> bool { - if let Some(min_doc) = self.docsets.iter().map(DocSet::doc).min() { - // Reset the sliding window to start at the smallest doc - // across all scorers and prebuffer within the horizon. 
- self.window_start_doc = min_doc; - self.bucket_idx = 0; - self.doc = min_doc; - refill( - &mut self.docsets, - &mut self.bitsets, - &mut self.scores, - min_doc, - ); - true - } else { - false - } + let Some(min_doc) = self.scorers.iter().map(DocSet::doc).min() else { + return false; + }; + // Reset the sliding window to start at the smallest doc + // across all scorers and prebuffer within the horizon. + self.window_start_doc = min_doc; + self.bucket_idx = 0; + self.doc = min_doc; + refill( + &mut self.scorers, + &mut self.bitsets, + &mut self.scores, + min_doc, + ); + true } #[inline] @@ -147,6 +179,7 @@ impl BufferedUnionScorer bool { // wrapping_sub, because target may be < window_start_doc + // in particular during initialization. let gap = target.wrapping_sub(self.window_start_doc); gap < HORIZON } @@ -216,11 +249,10 @@ where if self.doc >= target { return self.doc; } - let gap = target - self.window_start_doc; - if gap < HORIZON { + if self.is_in_horizon(target) { // Our value is within the buffered horizon. - // Skipping to corresponding bucket. + let gap = target.wrapping_sub(self.window_start_doc); let new_bucket_idx = gap as usize / 64; for obsolete_tinyset in &mut self.bitsets[self.bucket_idx..new_bucket_idx] { obsolete_tinyset.clear(); @@ -239,16 +271,14 @@ where doc } else { // clear the buffered info. - for obsolete_tinyset in self.bitsets.iter_mut() { - *obsolete_tinyset = TinySet::empty(); - } + self.bitsets.fill(TinySet::empty()); for score_combiner in self.scores.iter_mut() { score_combiner.clear(); } // The target is outside of the buffered horizon. // advance all docsets to a doc >= to the target. 
- unordered_drain_filter(&mut self.docsets, |docset| { + unordered_drain_filter(&mut self.scorers, |docset| { if docset.doc() < target { docset.seek(target); } @@ -285,7 +315,7 @@ where let mut is_hit = false; let mut min_new_target = TERMINATED; - for docset in self.docsets.iter_mut() { + for docset in self.scorers.iter_mut() { match docset.seek_danger(target) { SeekDangerResult::Found => { is_hit = true; @@ -315,11 +345,11 @@ where } fn size_hint(&self) -> u32 { - estimate_union(self.docsets.iter().map(DocSet::size_hint), self.num_docs) + estimate_union(self.scorers.iter().map(DocSet::size_hint), self.num_docs) } fn cost(&self) -> u64 { - self.docsets.iter().map(|docset| docset.cost()).sum() + self.scorers.iter().map(|docset| docset.cost()).sum() } // TODO Also implement `count` with deletes efficiently. @@ -327,21 +357,17 @@ where if self.doc == TERMINATED { return 0; } - let mut count = self.bitsets[self.bucket_idx..HORIZON_NUM_TINYBITSETS] + let mut count = 1 + self.bitsets[self.bucket_idx..HORIZON_NUM_TINYBITSETS] .iter() - .map(|bitset| bitset.len()) - .sum::() - + 1; - for bitset in self.bitsets.iter_mut() { - bitset.clear(); - } + .copied() + .map(TinySet::len) + .sum::(); while self.refill() { - count += self.bitsets.iter().map(|bitset| bitset.len()).sum::(); - for bitset in self.bitsets.iter_mut() { - bitset.clear(); - } + count += self.bitsets.iter().copied().map(TinySet::len).sum::(); + self.bitsets.fill(TinySet::empty()); } self.bucket_idx = HORIZON_NUM_TINYBITSETS; + self.doc = TERMINATED; count } } diff --git a/src/query/union/simple_union.rs b/src/query/union/simple_union.rs index b153a7f22..0457cae43 100644 --- a/src/query/union/simple_union.rs +++ b/src/query/union/simple_union.rs @@ -1,5 +1,5 @@ use crate::docset::{DocSet, TERMINATED}; -use crate::postings::Postings; +use crate::postings::{DocFreq, Postings}; use crate::DocId; /// A `SimpleUnion` is a `DocSet` that is the union of multiple `DocSet`. 
@@ -12,7 +12,11 @@ pub struct SimpleUnion { } impl SimpleUnion { - pub(crate) fn build(mut docsets: Vec) -> SimpleUnion { + /// Builds a `SimpleUnion` from multiple docsets. + /// + /// Exhausted docsets are filtered out, and the union is initialized at the + /// smallest current doc id across remaining docsets. + pub fn build(mut docsets: Vec) -> SimpleUnion { docsets.retain(|docset| docset.doc() != TERMINATED); let mut docset = SimpleUnion { docsets, doc: 0 }; @@ -56,6 +60,22 @@ impl Postings for SimpleUnion { term_freq } + fn has_freq(&self) -> bool { + true + } + + /// We do not know the actual document frequency, so we return + /// the maximum document frequency of the docsets. + fn doc_freq(&self) -> DocFreq { + let approximate_doc_freq = self + .docsets + .iter() + .map(|docset| u32::from(docset.doc_freq())) + .max() + .unwrap_or(0u32); + DocFreq::Approximate(approximate_doc_freq) + } + fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec) { for docset in &mut self.docsets { let doc = docset.doc(); diff --git a/src/query/weight.rs b/src/query/weight.rs index 23ff55c04..2ad2d822e 100644 --- a/src/query/weight.rs +++ b/src/query/weight.rs @@ -1,21 +1,9 @@ use super::Scorer; use crate::docset::COLLECT_BLOCK_BUFFER_LEN; use crate::index::SegmentReader; +use crate::query::explanation::does_not_match; use crate::query::Explanation; -use crate::{DocId, DocSet, Score, TERMINATED}; - -/// Iterates through all of the documents and scores matched by the DocSet -/// `DocSet`. -pub(crate) fn for_each_scorer( - scorer: &mut TScorer, - callback: &mut dyn FnMut(DocId, Score), -) { - let mut doc = scorer.doc(); - while doc != TERMINATED { - callback(doc, scorer.score()); - doc = scorer.advance(); - } -} +use crate::{DocId, DocSet, Score}; /// Iterates through all of the documents matched by the DocSet /// `DocSet`. 
@@ -34,31 +22,6 @@ pub(crate) fn for_each_docset_buffered( } } -/// Calls `callback` with all of the `(doc, score)` for which score -/// is exceeding a given threshold. -/// -/// This method is useful for the [`TopDocs`](crate::collector::TopDocs) collector. -/// For all docsets, the blanket implementation has the benefit -/// of prefiltering (doc, score) pairs, avoiding the -/// virtual dispatch cost. -/// -/// More importantly, it makes it possible for scorers to implement -/// important optimization (e.g. BlockWAND for union). -pub(crate) fn for_each_pruning_scorer( - scorer: &mut TScorer, - mut threshold: Score, - callback: &mut dyn FnMut(DocId, Score) -> Score, -) { - let mut doc = scorer.doc(); - while doc != TERMINATED { - let score = scorer.score(); - if score > threshold { - threshold = callback(doc, score); - } - doc = scorer.advance(); - } -} - /// A Weight is the specialization of a `Query` /// for a given set of segments. /// @@ -69,13 +32,19 @@ pub trait Weight: Send + Sync + 'static { /// `boost` is a multiplier to apply to the score. /// /// See [`Query`](crate::query::Query). - fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result>; + fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result>; /// Returns an [`Explanation`] for the given document. - fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result; + fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result { + let mut scorer = self.scorer(reader, 1.0)?; + if scorer.doc() > doc || scorer.seek(doc) != doc { + return Err(does_not_match(doc)); + } + Ok(scorer.explain()) + } /// Returns the number documents within the given [`SegmentReader`]. 
- fn count(&self, reader: &SegmentReader) -> crate::Result { + fn count(&self, reader: &dyn SegmentReader) -> crate::Result { let mut scorer = self.scorer(reader, 1.0)?; if let Some(alive_bitset) = reader.alive_bitset() { Ok(scorer.count(alive_bitset)) @@ -88,11 +57,11 @@ pub trait Weight: Send + Sync + 'static { /// `DocSet` and push the scored documents to the collector. fn for_each( &self, - reader: &SegmentReader, + reader: &dyn SegmentReader, callback: &mut dyn FnMut(DocId, Score), ) -> crate::Result<()> { let mut scorer = self.scorer(reader, 1.0)?; - for_each_scorer(scorer.as_mut(), callback); + scorer.for_each(callback); Ok(()) } @@ -100,7 +69,7 @@ pub trait Weight: Send + Sync + 'static { /// `DocSet` and push the scored documents to the collector. fn for_each_no_score( &self, - reader: &SegmentReader, + reader: &dyn SegmentReader, callback: &mut dyn FnMut(&[DocId]), ) -> crate::Result<()> { let mut docset = self.scorer(reader, 1.0)?; @@ -123,11 +92,11 @@ pub trait Weight: Send + Sync + 'static { fn for_each_pruning( &self, threshold: Score, - reader: &SegmentReader, + reader: &dyn SegmentReader, callback: &mut dyn FnMut(DocId, Score) -> Score, ) -> crate::Result<()> { let mut scorer = self.scorer(reader, 1.0)?; - for_each_pruning_scorer(scorer.as_mut(), threshold, callback); + scorer.for_each_pruning(threshold, callback); Ok(()) } } diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 157e237d8..170653724 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -7,7 +7,7 @@ use arc_swap::ArcSwap; pub use warming::Warmer; use self::warming::WarmingState; -use crate::core::searcher::{SearcherGeneration, SearcherInner}; +use crate::core::searcher::{SearcherContext, SearcherGeneration, SearcherInner}; use crate::directory::{Directory, WatchCallback, WatchHandle, META_LOCK}; use crate::store::DOCSTORE_CACHE_CAPACITY; use crate::{Index, Inventory, Searcher, SegmentReader, TrackedObject}; @@ -189,19 +189,28 @@ impl InnerIndexReader { /// /// This function 
acquires a lock to prevent GC from removing files /// as we are opening our index. - fn open_segment_readers(index: &Index) -> crate::Result> { + fn open_segment_readers(index: &Index) -> crate::Result>> { // Prevents segment files from getting deleted while we are in the process of opening them let _meta_lock = index.directory().acquire_lock(&META_LOCK)?; let searchable_segments = index.searchable_segments()?; let segment_readers = searchable_segments .iter() - .map(SegmentReader::open) + .map(|segment| { + let reader = + crate::TantivySegmentReader::open_with_custom_alive_set_from_directory( + segment.index().directory(), + segment.meta(), + segment.schema(), + None, + )?; + Ok(Arc::new(reader) as Arc) + }) .collect::>()?; Ok(segment_readers) } fn track_segment_readers_in_inventory( - segment_readers: &[SegmentReader], + segment_readers: &[Arc], searcher_generation_counter: &Arc, searcher_generation_inventory: &Inventory, ) -> TrackedObject { @@ -225,10 +234,9 @@ impl InnerIndexReader { searcher_generation_inventory, ); - let schema = index.schema(); + let context = SearcherContext::from_index(index); let searcher = Arc::new(SearcherInner::new( - schema, - index.clone(), + context, segment_readers, searcher_generation, doc_store_cache_num_blocks, diff --git a/src/schema/document/default_document.rs b/src/schema/document/default_document.rs index 915b685aa..c66dafe24 100644 --- a/src/schema/document/default_document.rs +++ b/src/schema/document/default_document.rs @@ -755,10 +755,9 @@ mod tests { doc.add_object(json_field, json_val); let schema = schema_builder.build(); - let json = doc.to_json(&schema); - let actual_json: serde_json::Value = serde_json::from_str(&json).unwrap(); + let actual_json = doc.to_json(&schema); let expected_json: serde_json::Value = serde_json::from_str(json_str).unwrap(); - assert_eq!(actual_json["json"][0], expected_json); + assert_eq!(actual_json["json"], expected_json); } // TODO: Should this be re-added with the serialize method 
diff --git a/src/schema/document/mod.rs b/src/schema/document/mod.rs index 8168ee811..eeab563ac 100644 --- a/src/schema/document/mod.rs +++ b/src/schema/document/mod.rs @@ -247,10 +247,35 @@ pub trait Document: Send + Sync + 'static { /// Encode the doc in JSON. /// /// Encoding a document cannot fail. - fn to_json(&self, schema: &Schema) -> String { + fn to_serialized_json(&self, schema: &Schema) -> String { serde_json::to_string(&self.to_named_doc(schema)) .expect("doc encoding failed. This is a bug") } + + /// Encode the doc in JSON. + /// + /// Encoding a document cannot fail. + /// + /// It will automatically flatten arrays of length 1 to just the value, and it will + /// automatically flatten objects of length 1 to just the value. + fn to_json(&self, schema: &Schema) -> serde_json::Value { + let mut json_value = serde_json::Value::Object(serde_json::Map::new()); + for (field, field_values) in self.get_sorted_field_values() { + let field_name = schema.get_field_name(field); + let values: Vec = field_values + .into_iter() + .map(|val| OwnedValue::from(val.as_value())) + .collect(); + if values.len() == 1 { + json_value[field_name] = + serde_json::to_value(&values[0]).expect("doc encoding failed. This is a bug"); + } else { + json_value[field_name] = + serde_json::to_value(&values).expect("doc encoding failed. 
This is a bug"); + } + } + json_value + } } pub(crate) mod type_codes { diff --git a/src/schema/document/owned_value.rs b/src/schema/document/owned_value.rs index 49a6b1ac7..f4f95bafa 100644 --- a/src/schema/document/owned_value.rs +++ b/src/schema/document/owned_value.rs @@ -475,8 +475,11 @@ mod tests { let schema = schema_builder.build(); let mut doc = TantivyDocument::default(); doc.add_bytes(bytes_field, "this is a test".as_bytes()); - let json_string = doc.to_json(&schema); - assert_eq!(json_string, r#"{"my_bytes":["dGhpcyBpcyBhIHRlc3Q="]}"#); + let json_value = doc.to_json(&schema); + assert_eq!( + json_value, + serde_json::json!({"my_bytes": "dGhpcyBpcyBhIHRlc3Q="}) + ); } #[test] @@ -487,9 +490,8 @@ mod tests { let schema = schema_builder.build(); let mut doc = TantivyDocument::default(); doc.add_bytes(bytes_field, "".as_bytes()); - let json_string = doc.to_json(&schema); - - assert_eq!(json_string, r#"{"my_bytes":[""]}"#); + let json_value = doc.to_json(&schema); + assert_eq!(json_value, serde_json::json!({"my_bytes": ""})); } #[test] @@ -503,10 +505,12 @@ mod tests { bytes_field, "A bigger test I guess\nspanning on multiple lines\nhoping this will work".as_bytes(), ); - let json_string = doc.to_json(&schema); + let json_value = doc.to_json(&schema); assert_eq!( - json_string, - r#"{"my_bytes":["QSBiaWdnZXIgdGVzdCBJIGd1ZXNzCnNwYW5uaW5nIG9uIG11bHRpcGxlIGxpbmVzCmhvcGluZyB0aGlzIHdpbGwgd29yaw=="]}"# + json_value, + serde_json::json!({ + "my_bytes": "QSBiaWdnZXIgdGVzdCBJIGd1ZXNzCnNwYW5uaW5nIG9uIG11bHRpcGxlIGxpbmVzCmhvcGluZyB0aGlzIHdpbGwgd29yaw==" + }) ); } diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index 5c54c956c..8e1251b0f 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -702,7 +702,10 @@ mod tests { let date_time = PrimitiveDateTime::new(naive_date, naive_time); doc.add_date(date_field, DateTime::from_primitive(date_time)); let doc_json = doc.to_json(&schema); - assert_eq!(doc_json, 
r#"{"date":["1982-09-17T13:20:00Z"]}"#); + assert_eq!( + doc_json, + serde_json::json!({"date": "1982-09-17T13:20:00Z"}) + ); } #[test] diff --git a/src/schema/index_record_option.rs b/src/schema/index_record_option.rs index d3adf85a3..8b39dde64 100644 --- a/src/schema/index_record_option.rs +++ b/src/schema/index_record_option.rs @@ -8,7 +8,7 @@ use serde::{Deserialize, Serialize}; /// * describe in the schema the amount of information that should be retained during indexing (See /// [`TextFieldIndexing::set_index_option()`](crate::schema::TextFieldIndexing::set_index_option)) /// * request that a given amount of information to be decoded as one goes through a posting list. -/// (See [`InvertedIndexReader::read_postings()`](crate::InvertedIndexReader::read_postings)) +/// (See [`DynInvertedIndexReader::read_postings()`](crate::DynInvertedIndexReader::read_postings)) #[derive( Clone, Copy, Debug, PartialEq, PartialOrd, Ord, Eq, Hash, Serialize, Deserialize, Default, )] diff --git a/src/schema/schema.rs b/src/schema/schema.rs index c1d22c0ba..c9468f075 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -590,7 +590,8 @@ mod tests { }"#; let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap(); - let doc_serdeser = TantivyDocument::parse_json(&schema, &doc.to_json(&schema)).unwrap(); + let doc_serdeser = + TantivyDocument::parse_json(&schema, &doc.to_serialized_json(&schema)).unwrap(); assert_eq!(doc, doc_serdeser); } @@ -605,8 +606,8 @@ mod tests { "ip": "127.0.0.1" }"#; let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap(); - let value: serde_json::Value = serde_json::from_str(&doc.to_json(&schema)).unwrap(); - assert_eq!(value["ip"][0], "127.0.0.1"); + let value = doc.to_json(&schema); + assert_eq!(value["ip"], "127.0.0.1"); // Special case IpV6 loopback. 
We don't want to map that to IPv4 let doc_json = r#"{ @@ -614,8 +615,8 @@ mod tests { }"#; let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap(); - let value: serde_json::Value = serde_json::from_str(&doc.to_json(&schema)).unwrap(); - assert_eq!(value["ip"][0], "::1"); + let value = doc.to_json(&schema); + assert_eq!(value["ip"], "::1"); // testing ip address of every router in the world let doc_json = r#"{ @@ -623,8 +624,8 @@ mod tests { }"#; let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap(); - let value: serde_json::Value = serde_json::from_str(&doc.to_json(&schema)).unwrap(); - assert_eq!(value["ip"][0], "192.168.0.1"); + let value = doc.to_json(&schema); + assert_eq!(value["ip"], "192.168.0.1"); } #[test] diff --git a/src/schema/term.rs b/src/schema/term.rs index e1e4f02e4..24a53844b 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -135,7 +135,7 @@ impl Term { /// Use `clear_with_field_and_type` in that case. /// /// Sets field and the type. - pub(crate) fn set_field_and_type(&mut self, field: Field, typ: Type) { + pub fn set_field_and_type(&mut self, field: Field, typ: Type) { assert!(self.is_empty()); self.field = field; self.serialized_value_bytes[0] = typ.to_code(); diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index ee61b534a..4b5867bc9 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -422,7 +422,7 @@ impl SnippetGenerator { terms_text.insert(term_str.to_string(), score); } } - let tokenizer = searcher.index().tokenizer_for_field(field)?; + let tokenizer = searcher.tokenizer_for_field(field)?; Ok(SnippetGenerator { terms_text, tokenizer, diff --git a/src/space_usage/mod.rs b/src/space_usage/mod.rs index 4c1f9c76a..3bd3e70c4 100644 --- a/src/space_usage/mod.rs +++ b/src/space_usage/mod.rs @@ -34,7 +34,8 @@ pub struct SearcherSpaceUsage { } impl SearcherSpaceUsage { - pub(crate) fn new() -> SearcherSpaceUsage { + /// Creates an empty searcher space-usage accumulator. 
+ pub fn new() -> SearcherSpaceUsage { SearcherSpaceUsage { segments: Vec::new(), total: Default::default(), @@ -80,7 +81,8 @@ pub struct SegmentSpaceUsage { impl SegmentSpaceUsage { #[expect(clippy::too_many_arguments)] - pub(crate) fn new( + /// Creates a segment space-usage summary from all major segment components. + pub fn new( num_docs: u32, termdict: PerFieldSpaceUsage, postings: PerFieldSpaceUsage, @@ -210,7 +212,7 @@ impl StoreSpaceUsage { /// /// A field can appear with a single index (typically 0) or with multiple indexes. /// Multiple indexes are used to handle variable length things, where -#[derive(Clone, Debug, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize, Default)] pub struct PerFieldSpaceUsage { fields: BTreeMap, total: ByteCount, diff --git a/src/store/index/mod.rs b/src/store/index/mod.rs index 13c252e92..80e15b8e9 100644 --- a/src/store/index/mod.rs +++ b/src/store/index/mod.rs @@ -48,7 +48,7 @@ mod tests { use crate::indexer::NoMergePolicy; use crate::schema::{SchemaBuilder, STORED, TEXT}; use crate::store::index::Checkpoint; - use crate::{DocAddress, DocId, Index, IndexWriter, TantivyDocument, Term}; + use crate::{DocAddress, DocId, Index, IndexWriter, Term}; #[test] fn test_skip_index_empty() -> io::Result<()> { @@ -149,7 +149,7 @@ mod tests { let searcher = reader.searcher(); assert_eq!(searcher.num_docs(), 30); for i in 0..searcher.num_docs() as u32 { - let _doc = searcher.doc::(DocAddress::new(0u32, i))?; + let _doc = searcher.doc(DocAddress::new(0u32, i))?; } Ok(()) } diff --git a/src/store/mod.rs b/src/store/mod.rs index cccf4d8f9..15dafb926 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -22,7 +22,7 @@ //! //! # Usage //! -//! Most users should not access the `StoreReader` directly +//! Most users should not access the `TantivyStoreReader` directly //! and should rely on either //! //! 
- at the segment level, the [`SegmentReader`'s `doc` @@ -38,7 +38,7 @@ mod writer; pub use self::compressors::{Compressor, ZstdCompressor}; pub use self::decompressors::Decompressor; -pub use self::reader::{CacheStats, StoreReader}; +pub use self::reader::{CacheStats, StoreReader, TantivyStoreReader}; pub(crate) use self::reader::{DocStoreVersion, DOCSTORE_CACHE_CAPACITY}; pub use self::writer::StoreWriter; mod store_compressor; @@ -117,11 +117,11 @@ pub(crate) mod tests { write_lorem_ipsum_store(store_wrt, NUM_DOCS, Compressor::default(), BLOCK_SIZE, true); let field_title = schema.get_field("title").unwrap(); let store_file = directory.open_read(path)?; - let store = StoreReader::open(store_file, 10)?; + let store = TantivyStoreReader::open(store_file, 10)?; for i in 0..NUM_DOCS as u32 { assert_eq!( store - .get::(i)? + .get(i)? .get_first(field_title) .unwrap() .as_value() @@ -169,11 +169,11 @@ pub(crate) mod tests { write_lorem_ipsum_store(store_wrt, NUM_DOCS, compressor, blocksize, separate_thread); let field_title = schema.get_field("title").unwrap(); let store_file = directory.open_read(path)?; - let store = StoreReader::open(store_file, 10)?; + let store = TantivyStoreReader::open(store_file, 10)?; for i in 0..NUM_DOCS as u32 { assert_eq!( *store - .get::(i)? + .get(i)? 
.get_first(field_title) .unwrap() .as_str() @@ -247,9 +247,10 @@ pub(crate) mod tests { let searcher = index.reader()?.searcher(); let reader = searcher.segment_reader(0); let store = reader.get_store_reader(10)?; - for doc in store.iter::(reader.alive_bitset()) { + for doc_id in reader.doc_ids_alive() { + let doc = store.get(doc_id)?; assert_eq!( - *doc?.get_first(text_field).unwrap().as_str().unwrap(), + *doc.get_first(text_field).unwrap().as_str().unwrap(), "deletemenot".to_string() ); } @@ -280,13 +281,6 @@ pub(crate) mod tests { } assert!(index_writer.commit().is_ok()); } - assert_eq!( - index.reader().unwrap().searcher().segment_readers()[0] - .get_store_reader(10) - .unwrap() - .decompressor(), - Decompressor::Lz4 - ); // Change compressor, this disables stacking on merging let index_settings = index.settings_mut(); index_settings.docstore_compression = Compressor::Zstd(Default::default()); @@ -305,17 +299,13 @@ pub(crate) mod tests { let reader = searcher.segment_readers().iter().last().unwrap(); let store = reader.get_store_reader(10).unwrap(); - for doc in store - .iter::(reader.alive_bitset()) - .take(50) - { + for doc_id in reader.doc_ids_alive().take(50) { + let doc = store.get(doc_id)?; assert_eq!( - *doc?.get_first(text_field).and_then(|v| v.as_str()).unwrap(), + *doc.get_first(text_field).and_then(|v| v.as_str()).unwrap(), LOREM.to_string() ); } - assert_eq!(store.decompressor(), Decompressor::Zstd); - Ok(()) } @@ -354,7 +344,12 @@ pub(crate) mod tests { assert_eq!(searcher.segment_readers().len(), 1); let reader = searcher.segment_readers().iter().last().unwrap(); let store = reader.get_store_reader(10)?; - assert_eq!(store.block_checkpoints().count(), 1); + let mut num_docs = 0; + for doc_id in reader.doc_ids_alive() { + store.get(doc_id)?; + num_docs += 1; + } + assert_eq!(num_docs, 5); Ok(()) } } @@ -368,7 +363,7 @@ mod bench { use super::tests::write_lorem_ipsum_store; use crate::directory::{Directory, RamDirectory}; - use 
crate::store::{Compressor, StoreReader}; + use crate::store::{Compressor, TantivyStoreReader}; use crate::TantivyDocument; #[bench] @@ -400,7 +395,7 @@ mod bench { true, ); let store_file = directory.open_read(path).unwrap(); - let store = StoreReader::open(store_file, 10).unwrap(); + let store = TantivyStoreReader::open(store_file, 10).unwrap(); b.iter(|| store.iter::(None).collect::>()); } } diff --git a/src/store/reader.rs b/src/store/reader.rs index a4105abec..75824d980 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -5,17 +5,20 @@ use std::num::NonZeroUsize; use std::ops::{AddAssign, Range}; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, Mutex}; +#[cfg(feature = "quickwit")] +use std::{future::Future, pin::Pin}; use common::{BinarySerializable, OwnedBytes}; use lru::LruCache; use super::footer::DocStoreFooter; use super::index::SkipIndex; -use super::Decompressor; +use super::{Compressor, Decompressor, StoreWriter}; use crate::directory::FileSlice; use crate::error::DataCorruption; use crate::fastfield::AliveBitSet; use crate::schema::document::{BinaryDocumentDeserializer, DocumentDeserialize}; +use crate::schema::TantivyDocument; use crate::space_usage::StoreSpaceUsage; use crate::store::index::Checkpoint; use crate::DocId; @@ -26,9 +29,33 @@ pub(crate) const DOCSTORE_CACHE_CAPACITY: usize = 100; type Block = OwnedBytes; +/// Object-safe API for reading documents from the store. +pub trait StoreReader: Send + Sync { + /// Reads and deserializes a given document. + fn get(&self, doc_id: DocId) -> crate::Result; + + /// Returns the cache hit and miss statistics of this reader. + fn cache_stats(&self) -> CacheStats; + + /// Merges this store into `store_writer`, filtering deletes via `alive_bitset`. + fn merge_into( + &self, + store_writer: &mut StoreWriter, + alive_bitset: Option<&AliveBitSet>, + ) -> crate::Result<()>; + + /// Fetches a document asynchronously. 
+ #[cfg(feature = "quickwit")] + fn get_async<'a>( + &'a self, + doc_id: DocId, + executor: &'a Executor, + ) -> Pin> + 'a>>; +} + /// The format version of the document store. #[derive(Clone, Copy, Debug, PartialEq, PartialOrd)] -pub(crate) enum DocStoreVersion { +pub enum DocStoreVersion { V1 = 1, V2 = 2, } @@ -60,7 +87,7 @@ impl BinarySerializable for DocStoreVersion { } /// Reads document off tantivy's [`Store`](./index.html) -pub struct StoreReader { +pub struct TantivyStoreReader { decompressor: Decompressor, doc_store_version: DocStoreVersion, data: FileSlice, @@ -119,7 +146,7 @@ impl BlockCache { } #[derive(Debug, Default)] -/// CacheStats for the `StoreReader`. +/// CacheStats for the `TantivyStoreReader`. pub struct CacheStats { /// The number of entries in the cache pub num_entries: usize, @@ -149,12 +176,12 @@ impl Sum for CacheStats { } } -impl StoreReader { +impl TantivyStoreReader { /// Opens a store reader /// /// `cache_num_blocks` sets the number of decompressed blocks to be cached in an LRU. 
/// The size of blocks is configurable, this should be reflexted in the - pub fn open(store_file: FileSlice, cache_num_blocks: usize) -> io::Result { + pub fn open(store_file: FileSlice, cache_num_blocks: usize) -> io::Result { let (footer, data_and_offset) = DocStoreFooter::extract_footer(store_file)?; let (data_file, offset_index_file) = data_and_offset.split(footer.offset as usize); @@ -162,7 +189,7 @@ impl StoreReader { let space_usage = StoreSpaceUsage::new(data_file.num_bytes(), offset_index_file.num_bytes()); let skip_index = SkipIndex::open(index_data); - Ok(StoreReader { + Ok(TantivyStoreReader { decompressor: footer.decompressor, doc_store_version: footer.doc_store_version, data: data_file, @@ -177,14 +204,10 @@ impl StoreReader { }) } - pub(crate) fn block_checkpoints(&self) -> impl Iterator + '_ { + fn block_checkpoints(&self) -> impl Iterator + '_ { self.skip_index.checkpoints() } - pub(crate) fn decompressor(&self) -> Decompressor { - self.decompressor - } - /// Returns the cache hit and miss statistics of the store reader. pub(crate) fn cache_stats(&self) -> CacheStats { self.cache.stats() @@ -204,6 +227,26 @@ impl StoreReader { self.data.read_bytes() } + fn can_stack_for_merge(&self, target_compressor: Compressor) -> bool { + // If there is not enough data in the store, we avoid stacking in order to + // avoid creating many small blocks in the doc store. 
+ // https://github.com/quickwit-oss/tantivy/issues/1053 + const MIN_BLOCKS_REQUIRED: usize = 6; + self.decompressor == target_compressor.into() + && self + .block_checkpoints() + // to not count all blocks in the store + .take(MIN_BLOCKS_REQUIRED + 1) + .count() + >= MIN_BLOCKS_REQUIRED + } + + fn block_ranges(&self) -> Vec<(Range, Range)> { + self.block_checkpoints() + .map(|checkpoint| (checkpoint.doc_range, checkpoint.byte_range)) + .collect() + } + fn get_compressed_block(&self, checkpoint: &Checkpoint) -> io::Result { self.data.slice(checkpoint.byte_range.clone()).read_bytes() } @@ -236,25 +279,15 @@ impl StoreReader { /// /// It should not be called to score documents /// for instance. - pub fn get(&self, doc_id: DocId) -> crate::Result { - let mut doc_bytes = self.get_document_bytes(doc_id)?; + pub fn get(&self, doc_id: DocId) -> crate::Result { + let checkpoint = self.block_checkpoint(doc_id)?; + let block = self.read_block(&checkpoint)?; + let mut doc_bytes = Self::get_document_bytes_from_block(block, doc_id, &checkpoint)?; let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes, self.doc_store_version) .map_err(crate::TantivyError::from)?; - D::deserialize(deserializer).map_err(crate::TantivyError::from) - } - - /// Returns raw bytes of a given document. - /// - /// Calling `.get(doc)` is relatively costly as it requires - /// decompressing a compressed block. The store utilizes a LRU cache, - /// so accessing docs from the same compressed block should be faster. - /// For that reason a store reader should be kept and reused. - pub fn get_document_bytes(&self, doc_id: DocId) -> crate::Result { - let checkpoint = self.block_checkpoint(doc_id)?; - let block = self.read_block(&checkpoint)?; - Self::get_document_bytes_from_block(block, doc_id, &checkpoint) + TantivyDocument::deserialize(deserializer).map_err(crate::TantivyError::from) } /// Advanced API. 
@@ -354,6 +387,44 @@ impl StoreReader { } } +impl StoreReader for TantivyStoreReader { + fn get(&self, doc_id: DocId) -> crate::Result { + TantivyStoreReader::get(self, doc_id) + } + + fn cache_stats(&self) -> CacheStats { + TantivyStoreReader::cache_stats(self) + } + + fn merge_into( + &self, + store_writer: &mut StoreWriter, + alive_bitset: Option<&AliveBitSet>, + ) -> crate::Result<()> { + if alive_bitset.is_some() || !self.can_stack_for_merge(store_writer.compressor()) { + for doc_bytes_res in self.iter_raw(alive_bitset) { + let doc_bytes = doc_bytes_res?; + store_writer.store_bytes(&doc_bytes)?; + } + Ok(()) + } else { + let block_data = self.block_data()?; + let block_ranges = self.block_ranges(); + store_writer.stack_parts(block_data, block_ranges)?; + Ok(()) + } + } + + #[cfg(feature = "quickwit")] + fn get_async<'a>( + &'a self, + doc_id: DocId, + executor: &'a Executor, + ) -> Pin> + 'a>> { + Box::pin(TantivyStoreReader::get_async(self, doc_id, executor)) + } +} + fn block_read_index(block: &[u8], doc_pos: u32) -> crate::Result> { let doc_pos = doc_pos as usize; let size_of_u32 = std::mem::size_of::(); @@ -377,7 +448,7 @@ fn block_read_index(block: &[u8], doc_pos: u32) -> crate::Result> { } #[cfg(feature = "quickwit")] -impl StoreReader { +impl TantivyStoreReader { /// Advanced API. /// /// In most cases use [`get_async`](Self::get_async) @@ -413,7 +484,7 @@ impl StoreReader { } /// Reads raw bytes of a given document asynchronously. - pub async fn get_document_bytes_async( + async fn get_document_bytes_async( &self, doc_id: DocId, executor: &Executor, @@ -424,17 +495,17 @@ impl StoreReader { } /// Fetches a document asynchronously. Async version of [`get`](Self::get). 
- pub async fn get_async( + pub async fn get_async( &self, doc_id: DocId, executor: &Executor, - ) -> crate::Result { + ) -> crate::Result { let mut doc_bytes = self.get_document_bytes_async(doc_id, executor).await?; let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes, self.doc_store_version) .map_err(crate::TantivyError::from)?; - D::deserialize(deserializer).map_err(crate::TantivyError::from) + TantivyDocument::deserialize(deserializer).map_err(crate::TantivyError::from) } } @@ -468,7 +539,7 @@ mod tests { let schema = write_lorem_ipsum_store(writer, 500, Compressor::None, BLOCK_SIZE, true); let title = schema.get_field("title").unwrap(); let store_file = directory.open_read(path)?; - let store = StoreReader::open(store_file, DOCSTORE_CACHE_CAPACITY)?; + let store = TantivyStoreReader::open(store_file, DOCSTORE_CACHE_CAPACITY)?; assert_eq!(store.cache.len(), 0); assert_eq!(store.cache_stats().cache_hits, 0); diff --git a/src/store/store_compressor.rs b/src/store/store_compressor.rs index 20211b25a..10f78f1ea 100644 --- a/src/store/store_compressor.rs +++ b/src/store/store_compressor.rs @@ -1,15 +1,16 @@ use std::io::Write; +use std::ops::Range; use std::sync::mpsc::{sync_channel, Receiver, SyncSender}; use std::thread::JoinHandle; use std::{io, thread}; -use common::{BinarySerializable, CountingWriter, TerminatingWrite}; +use common::{BinarySerializable, CountingWriter, OwnedBytes, TerminatingWrite}; use super::DOC_STORE_VERSION; use crate::directory::WritePtr; use crate::store::footer::DocStoreFooter; use crate::store::index::{Checkpoint, SkipIndexBuilder}; -use crate::store::{Compressor, Decompressor, StoreReader}; +use crate::store::{Compressor, Decompressor}; use crate::DocId; pub struct BlockCompressor(BlockCompressorVariants); @@ -54,16 +55,19 @@ impl BlockCompressor { Ok(()) } - pub fn stack_reader(&mut self, store_reader: StoreReader) -> io::Result<()> { + pub fn stack_parts( + &mut self, + block_data: OwnedBytes, + block_ranges: 
Vec<(Range, Range)>, + ) -> io::Result<()> { match &mut self.0 { BlockCompressorVariants::SameThread(block_compressor) => { - block_compressor.stack(store_reader)?; + block_compressor.stack_parts(block_data, block_ranges) } BlockCompressorVariants::DedicatedThread(different_thread_block_compressor) => { - different_thread_block_compressor.stack_reader(store_reader)?; + different_thread_block_compressor.stack_parts(block_data, block_ranges) } } - Ok(()) } pub fn close(self) -> io::Result<()> { @@ -122,22 +126,24 @@ impl BlockCompressorImpl { /// This method is an optimization compared to iterating over the documents /// in the store and adding them one by one, as the store's data will /// not be decompressed and then recompressed. - fn stack(&mut self, store_reader: StoreReader) -> io::Result<()> { + fn stack_parts( + &mut self, + block_data: OwnedBytes, + block_ranges: Vec<(Range, Range)>, + ) -> io::Result<()> { let doc_shift = self.first_doc_in_block; let start_shift = self.writer.written_bytes() as usize; // just bulk write all of the block of the given reader. - self.writer - .write_all(store_reader.block_data()?.as_slice())?; + self.writer.write_all(block_data.as_slice())?; // concatenate the index of the `store_reader`, after translating // its start doc id and its start file offset. 
- for mut checkpoint in store_reader.block_checkpoints() { - checkpoint.doc_range.start += doc_shift; - checkpoint.doc_range.end += doc_shift; - checkpoint.byte_range.start += start_shift; - checkpoint.byte_range.end += start_shift; - self.register_checkpoint(checkpoint); + for (doc_range, byte_range) in block_ranges { + self.register_checkpoint(Checkpoint { + doc_range: (doc_range.start + doc_shift)..(doc_range.end + doc_shift), + byte_range: (byte_range.start + start_shift)..(byte_range.end + start_shift), + }); } Ok(()) } @@ -161,7 +167,10 @@ enum BlockCompressorMessage { block_data: Vec, num_docs_in_block: u32, }, - Stack(StoreReader), + Stack { + block_data: OwnedBytes, + block_ranges: Vec<(Range, Range)>, + }, } struct DedicatedThreadBlockCompressorImpl { @@ -187,8 +196,11 @@ impl DedicatedThreadBlockCompressorImpl { block_compressor .compress_block_and_write(&block_data[..], num_docs_in_block)?; } - BlockCompressorMessage::Stack(store_reader) => { - block_compressor.stack(store_reader)?; + BlockCompressorMessage::Stack { + block_data, + block_ranges, + } => { + block_compressor.stack_parts(block_data, block_ranges)?; } } } @@ -208,8 +220,15 @@ impl DedicatedThreadBlockCompressorImpl { }) } - fn stack_reader(&mut self, store_reader: StoreReader) -> io::Result<()> { - self.send(BlockCompressorMessage::Stack(store_reader)) + fn stack_parts( + &mut self, + block_data: OwnedBytes, + block_ranges: Vec<(Range, Range)>, + ) -> io::Result<()> { + self.send(BlockCompressorMessage::Stack { + block_data, + block_ranges, + }) } fn send(&mut self, msg: BlockCompressorMessage) -> io::Result<()> { diff --git a/src/store/writer.rs b/src/store/writer.rs index ef514accc..bb4be2484 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -1,10 +1,12 @@ use std::io; +use std::ops::Range; +use std::sync::Arc; -use common::BinarySerializable; +use common::{BinarySerializable, OwnedBytes}; use super::compressors::Compressor; -use super::StoreReader; use 
crate::directory::WritePtr; +use crate::index::SegmentReader; use crate::schema::document::{BinaryDocumentSerializer, Document}; use crate::schema::Schema; use crate::store::store_compressor::BlockCompressor; @@ -119,14 +121,24 @@ impl StoreWriter { Ok(()) } - /// Stacks a store reader on top of the documents written so far. - /// This method is an optimization compared to iterating over the documents - /// in the store and adding them one by one, as the store's data will - /// not be decompressed and then recompressed. - pub fn stack(&mut self, store_reader: StoreReader) -> io::Result<()> { - // We flush the current block first before stacking + pub(crate) fn stack_parts( + &mut self, + block_data: OwnedBytes, + block_ranges: Vec<(Range, Range)>, + ) -> io::Result<()> { self.send_current_block_to_compressor()?; - self.block_compressor.stack_reader(store_reader)?; + self.block_compressor.stack_parts(block_data, block_ranges) + } + + pub(crate) fn merge_segment_readers( + &mut self, + segment_readers: &[Arc], + ) -> crate::Result<()> { + const MERGE_DOCSTORE_CACHE_NUM_BLOCKS: usize = 1; + for segment_reader in segment_readers { + let store_reader = segment_reader.get_store_reader(MERGE_DOCSTORE_CACHE_NUM_BLOCKS)?; + store_reader.merge_into(self, segment_reader.alive_bitset())?; + } Ok(()) }