Compare commits

..

6 Commits

Author SHA1 Message Date
trinity-1686a
9b619998bd Merge pull request #2816 from evance-br/fix-closing-paren-elastic-range 2026-01-27 17:00:08 +01:00
Evance Soumaoro
765c448945 uncomment commented code when testing 2026-01-27 13:19:41 +00:00
Evance Soumaoro
943594ebaa uncomment commented code when testing 2026-01-27 13:08:38 +00:00
Evance Soumaoro
df17daae0d fix closing parenthesis error on elastic range queries for lenient parser 2026-01-27 13:01:14 +00:00
Paul Masurel
0ae94baef5 Remove temp file (#2815)
Co-authored-by: Paul Masurel <paul.masurel@datadoghq.com>
2026-01-27 09:22:11 +01:00
Paul Masurel
3f448ecf79 Bugfix on intersection. (#2812)
The intersection algorithm made it possible for .seek(..) to be called with
values lower than the current doc id, breaking the DocSet contract.

The fix removes the optimization that caused left.seek(..) to be replaced
by a simpler left.advance(..).

Simply doing so led to a performance regression.
I therefore integrated that idea within SegmentPostings.seek.

We now attempt to check the next doc systematically on seek,
PROVIDED the block is already loaded.

Closes #2811

Co-authored-by: Paul Masurel <paul.masurel@datadoghq.com>
2026-01-27 09:21:09 +01:00
66 changed files with 512 additions and 655 deletions

View File

@@ -70,7 +70,7 @@ impl Collector for StatsCollector {
fn for_segment(
&self,
_segment_local_id: u32,
segment_reader: &dyn SegmentReader,
segment_reader: &SegmentReader,
) -> tantivy::Result<StatsSegmentCollector> {
let fast_field_reader = segment_reader.fast_fields().u64(&self.field)?;
Ok(StatsSegmentCollector {

View File

@@ -65,7 +65,7 @@ fn main() -> tantivy::Result<()> {
);
let top_docs_by_custom_score =
// Call TopDocs with a custom tweak score
TopDocs::with_limit(2).tweak_score(move |segment_reader: &dyn SegmentReader| {
TopDocs::with_limit(2).tweak_score(move |segment_reader: &SegmentReader| {
let ingredient_reader = segment_reader.facet_reader("ingredient").unwrap();
let facet_dict = ingredient_reader.facet_dict();

View File

@@ -43,7 +43,7 @@ impl DynamicPriceColumn {
}
}
pub fn price_for_segment(&self, segment_reader: &dyn SegmentReader) -> Option<Arc<Vec<Price>>> {
pub fn price_for_segment(&self, segment_reader: &SegmentReader) -> Option<Arc<Vec<Price>>> {
let segment_key = (segment_reader.segment_id(), segment_reader.delete_opstamp());
self.price_cache.read().unwrap().get(&segment_key).cloned()
}
@@ -157,7 +157,7 @@ fn main() -> tantivy::Result<()> {
let query = query_parser.parse_query("cooking")?;
let searcher = reader.searcher();
let score_by_price = move |segment_reader: &dyn SegmentReader| {
let score_by_price = move |segment_reader: &SegmentReader| {
let price = price_dynamic_column
.price_for_segment(segment_reader)
.unwrap();

View File

@@ -560,7 +560,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
(
(
value((), tag(">=")),
map(word_infallible("", false), |(bound, err)| {
map(word_infallible(")", false), |(bound, err)| {
(
(
bound
@@ -574,7 +574,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
),
(
value((), tag("<=")),
map(word_infallible("", false), |(bound, err)| {
map(word_infallible(")", false), |(bound, err)| {
(
(
UserInputBound::Unbounded,
@@ -588,7 +588,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
),
(
value((), tag(">")),
map(word_infallible("", false), |(bound, err)| {
map(word_infallible(")", false), |(bound, err)| {
(
(
bound
@@ -602,7 +602,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
),
(
value((), tag("<")),
map(word_infallible("", false), |(bound, err)| {
map(word_infallible(")", false), |(bound, err)| {
(
(
UserInputBound::Unbounded,
@@ -1323,6 +1323,14 @@ mod test {
test_parse_query_to_ast_helper("<a", "{\"*\" TO \"a\"}");
test_parse_query_to_ast_helper("<=a", "{\"*\" TO \"a\"]");
test_parse_query_to_ast_helper("<=bsd", "{\"*\" TO \"bsd\"]");
test_parse_query_to_ast_helper("(<=42)", "{\"*\" TO \"42\"]");
test_parse_query_to_ast_helper("(<=42 )", "{\"*\" TO \"42\"]");
test_parse_query_to_ast_helper("(age:>5)", "\"age\":{\"5\" TO \"*\"}");
test_parse_query_to_ast_helper(
"(title:bar AND age:>12)",
"(+\"title\":bar +\"age\":{\"12\" TO \"*\"})",
);
}
#[test]

View File

@@ -57,7 +57,7 @@ pub(crate) fn get_numeric_or_date_column_types() -> &'static [ColumnType] {
/// Get fast field reader or empty as default.
pub(crate) fn get_ff_reader(
reader: &dyn SegmentReader,
reader: &SegmentReader,
field_name: &str,
allowed_column_types: Option<&[ColumnType]>,
) -> crate::Result<(columnar::Column<u64>, ColumnType)> {
@@ -74,7 +74,7 @@ pub(crate) fn get_ff_reader(
}
pub(crate) fn get_dynamic_columns(
reader: &dyn SegmentReader,
reader: &SegmentReader,
field_name: &str,
) -> crate::Result<Vec<columnar::DynamicColumn>> {
let ff_fields = reader.fast_fields().dynamic_column_handles(field_name)?;
@@ -90,7 +90,7 @@ pub(crate) fn get_dynamic_columns(
///
/// Is guaranteed to return at least one column.
pub(crate) fn get_all_ff_reader_or_empty(
reader: &dyn SegmentReader,
reader: &SegmentReader,
field_name: &str,
allowed_column_types: Option<&[ColumnType]>,
fallback_type: ColumnType,

View File

@@ -469,7 +469,7 @@ impl AggKind {
/// Build AggregationsData by walking the request tree.
pub(crate) fn build_aggregations_data_from_req(
aggs: &Aggregations,
reader: &dyn SegmentReader,
reader: &SegmentReader,
segment_ordinal: SegmentOrdinal,
context: AggContextParams,
) -> crate::Result<AggregationsSegmentCtx> {
@@ -489,7 +489,7 @@ pub(crate) fn build_aggregations_data_from_req(
fn build_nodes(
agg_name: &str,
req: &Aggregation,
reader: &dyn SegmentReader,
reader: &SegmentReader,
segment_ordinal: SegmentOrdinal,
data: &mut AggregationsSegmentCtx,
is_top_level: bool,
@@ -728,6 +728,7 @@ fn build_nodes(
let idx_in_req_data = data.push_filter_req_data(FilterAggReqData {
name: agg_name.to_string(),
req: filter_req.clone(),
segment_reader: reader.clone(),
evaluator,
matching_docs_buffer,
is_top_level,
@@ -744,7 +745,7 @@ fn build_nodes(
fn build_children(
aggs: &Aggregations,
reader: &dyn SegmentReader,
reader: &SegmentReader,
segment_ordinal: SegmentOrdinal,
data: &mut AggregationsSegmentCtx,
) -> crate::Result<Vec<AggRefNode>> {
@@ -763,7 +764,7 @@ fn build_children(
}
fn get_term_agg_accessors(
reader: &dyn SegmentReader,
reader: &SegmentReader,
field_name: &str,
missing: &Option<Key>,
) -> crate::Result<Vec<(Column<u64>, ColumnType)>> {
@@ -816,7 +817,7 @@ fn build_terms_or_cardinality_nodes(
agg_name: &str,
field_name: &str,
missing: &Option<Key>,
reader: &dyn SegmentReader,
reader: &SegmentReader,
segment_ordinal: SegmentOrdinal,
data: &mut AggregationsSegmentCtx,
sub_aggs: &Aggregations,

View File

@@ -401,6 +401,8 @@ pub struct FilterAggReqData {
pub name: String,
/// The filter aggregation
pub req: FilterAggregation,
/// The segment reader
pub segment_reader: SegmentReader,
/// Document evaluator for the filter query (precomputed BitSet)
/// This is built once when the request data is created
pub evaluator: DocumentQueryEvaluator,
@@ -412,8 +414,9 @@ pub struct FilterAggReqData {
impl FilterAggReqData {
pub(crate) fn get_memory_consumption(&self) -> usize {
// Estimate: name + bitset + buffer capacity
// Estimate: name + segment reader reference + bitset + buffer capacity
self.name.len()
+ std::mem::size_of::<SegmentReader>()
+ self.evaluator.bitset.len() / 8 // BitSet memory (bits to bytes)
+ self.matching_docs_buffer.capacity() * std::mem::size_of::<DocId>()
+ std::mem::size_of::<bool>()
@@ -435,7 +438,7 @@ impl DocumentQueryEvaluator {
pub(crate) fn new(
query: Box<dyn Query>,
schema: Schema,
segment_reader: &dyn SegmentReader,
segment_reader: &SegmentReader,
) -> crate::Result<Self> {
let max_doc = segment_reader.max_doc();

View File

@@ -66,7 +66,7 @@ impl Collector for DistributedAggregationCollector {
fn for_segment(
&self,
segment_local_id: crate::SegmentOrdinal,
reader: &dyn SegmentReader,
reader: &crate::SegmentReader,
) -> crate::Result<Self::Child> {
AggregationSegmentCollector::from_agg_req_and_reader(
&self.agg,
@@ -96,7 +96,7 @@ impl Collector for AggregationCollector {
fn for_segment(
&self,
segment_local_id: crate::SegmentOrdinal,
reader: &dyn SegmentReader,
reader: &crate::SegmentReader,
) -> crate::Result<Self::Child> {
AggregationSegmentCollector::from_agg_req_and_reader(
&self.agg,
@@ -145,7 +145,7 @@ impl AggregationSegmentCollector {
/// reader. Also includes validation, e.g. checking field types and existence.
pub fn from_agg_req_and_reader(
agg: &Aggregations,
reader: &dyn SegmentReader,
reader: &SegmentReader,
segment_ordinal: SegmentOrdinal,
context: &AggContextParams,
) -> crate::Result<Self> {

View File

@@ -43,7 +43,7 @@ impl Collector for Count {
fn for_segment(
&self,
_: SegmentOrdinal,
_: &dyn SegmentReader,
_: &SegmentReader,
) -> crate::Result<SegmentCountCollector> {
Ok(SegmentCountCollector::default())
}

View File

@@ -1,7 +1,7 @@
use std::collections::HashSet;
use super::{Collector, SegmentCollector};
use crate::{DocAddress, DocId, Score, SegmentReader};
use crate::{DocAddress, DocId, Score};
/// Collector that returns the set of DocAddress that match the query.
///
@@ -15,7 +15,7 @@ impl Collector for DocSetCollector {
fn for_segment(
&self,
segment_local_id: crate::SegmentOrdinal,
_segment: &dyn SegmentReader,
_segment: &crate::SegmentReader,
) -> crate::Result<Self::Child> {
Ok(DocSetChildCollector {
segment_local_id,

View File

@@ -265,7 +265,7 @@ impl Collector for FacetCollector {
fn for_segment(
&self,
_: SegmentOrdinal,
reader: &dyn SegmentReader,
reader: &SegmentReader,
) -> crate::Result<FacetSegmentCollector> {
let facet_reader = reader.facet_reader(&self.field_name)?;
let facet_dict = facet_reader.facet_dict();

View File

@@ -113,7 +113,7 @@ where
fn for_segment(
&self,
segment_local_id: u32,
segment_reader: &dyn SegmentReader,
segment_reader: &SegmentReader,
) -> crate::Result<Self::Child> {
let column_opt = segment_reader.fast_fields().column_opt(&self.field)?;
@@ -287,7 +287,7 @@ where
fn for_segment(
&self,
segment_local_id: u32,
segment_reader: &dyn SegmentReader,
segment_reader: &SegmentReader,
) -> crate::Result<Self::Child> {
let column_opt = segment_reader.fast_fields().bytes(&self.field)?;

View File

@@ -6,7 +6,7 @@ use fastdivide::DividerU64;
use crate::collector::{Collector, SegmentCollector};
use crate::fastfield::{FastFieldNotAvailableError, FastValue};
use crate::schema::Type;
use crate::{DocId, Score, SegmentReader};
use crate::{DocId, Score};
/// Histogram builds a histogram of the values of a fastfield for the
/// collected DocSet.
@@ -110,7 +110,7 @@ impl Collector for HistogramCollector {
fn for_segment(
&self,
_segment_local_id: crate::SegmentOrdinal,
segment: &dyn SegmentReader,
segment: &crate::SegmentReader,
) -> crate::Result<Self::Child> {
let column_opt = segment.fast_fields().u64_lenient(&self.field)?;
let (column, _column_type) = column_opt.ok_or_else(|| FastFieldNotAvailableError {

View File

@@ -156,7 +156,7 @@ pub trait Collector: Sync + Send {
fn for_segment(
&self,
segment_local_id: SegmentOrdinal,
segment: &dyn SegmentReader,
segment: &SegmentReader,
) -> crate::Result<Self::Child>;
/// Returns true iff the collector requires to compute scores for documents.
@@ -174,7 +174,7 @@ pub trait Collector: Sync + Send {
&self,
weight: &dyn Weight,
segment_ord: u32,
reader: &dyn SegmentReader,
reader: &SegmentReader,
) -> crate::Result<<Self::Child as SegmentCollector>::Fruit> {
let with_scoring = self.requires_scoring();
let mut segment_collector = self.for_segment(segment_ord, reader)?;
@@ -186,7 +186,7 @@ pub trait Collector: Sync + Send {
pub(crate) fn default_collect_segment_impl<TSegmentCollector: SegmentCollector>(
segment_collector: &mut TSegmentCollector,
weight: &dyn Weight,
reader: &dyn SegmentReader,
reader: &SegmentReader,
with_scoring: bool,
) -> crate::Result<()> {
match (reader.alive_bitset(), with_scoring) {
@@ -255,7 +255,7 @@ impl<TCollector: Collector> Collector for Option<TCollector> {
fn for_segment(
&self,
segment_local_id: SegmentOrdinal,
segment: &dyn SegmentReader,
segment: &SegmentReader,
) -> crate::Result<Self::Child> {
Ok(if let Some(inner) = self {
let inner_segment_collector = inner.for_segment(segment_local_id, segment)?;
@@ -336,7 +336,7 @@ where
fn for_segment(
&self,
segment_local_id: u32,
segment: &dyn SegmentReader,
segment: &SegmentReader,
) -> crate::Result<Self::Child> {
let left = self.0.for_segment(segment_local_id, segment)?;
let right = self.1.for_segment(segment_local_id, segment)?;
@@ -407,7 +407,7 @@ where
fn for_segment(
&self,
segment_local_id: u32,
segment: &dyn SegmentReader,
segment: &SegmentReader,
) -> crate::Result<Self::Child> {
let one = self.0.for_segment(segment_local_id, segment)?;
let two = self.1.for_segment(segment_local_id, segment)?;
@@ -487,7 +487,7 @@ where
fn for_segment(
&self,
segment_local_id: u32,
segment: &dyn SegmentReader,
segment: &SegmentReader,
) -> crate::Result<Self::Child> {
let one = self.0.for_segment(segment_local_id, segment)?;
let two = self.1.for_segment(segment_local_id, segment)?;

View File

@@ -24,7 +24,7 @@ impl<TCollector: Collector> Collector for CollectorWrapper<TCollector> {
fn for_segment(
&self,
segment_local_id: u32,
reader: &dyn SegmentReader,
reader: &SegmentReader,
) -> crate::Result<Box<dyn BoxableSegmentCollector>> {
let child = self.0.for_segment(segment_local_id, reader)?;
Ok(Box::new(SegmentCollectorWrapper(child)))
@@ -209,7 +209,7 @@ impl Collector for MultiCollector<'_> {
fn for_segment(
&self,
segment_local_id: SegmentOrdinal,
segment: &dyn SegmentReader,
segment: &SegmentReader,
) -> crate::Result<MultiCollectorChild> {
let children = self
.collector_wrappers

View File

@@ -5,7 +5,7 @@ use serde::{Deserialize, Serialize};
use crate::collector::{SegmentSortKeyComputer, SortKeyComputer};
use crate::schema::{OwnedValue, Schema};
use crate::{DocId, Order, Score, SegmentReader};
use crate::{DocId, Order, Score};
fn compare_owned_value<const NULLS_FIRST: bool>(lhs: &OwnedValue, rhs: &OwnedValue) -> Ordering {
match (lhs, rhs) {
@@ -430,7 +430,7 @@ where
fn segment_sort_key_computer(
&self,
segment_reader: &dyn SegmentReader,
segment_reader: &crate::SegmentReader,
) -> crate::Result<Self::Child> {
let child = self.0.segment_sort_key_computer(segment_reader)?;
Ok(SegmentSortKeyComputerWithComparator {
@@ -468,7 +468,7 @@ where
fn segment_sort_key_computer(
&self,
segment_reader: &dyn SegmentReader,
segment_reader: &crate::SegmentReader,
) -> crate::Result<Self::Child> {
let child = self.0.segment_sort_key_computer(segment_reader)?;
Ok(SegmentSortKeyComputerWithComparator {

View File

@@ -6,7 +6,7 @@ use crate::collector::sort_key::{
use crate::collector::{SegmentSortKeyComputer, SortKeyComputer};
use crate::fastfield::FastFieldNotAvailableError;
use crate::schema::OwnedValue;
use crate::{DateTime, DocId, Score, SegmentReader};
use crate::{DateTime, DocId, Score};
/// Sort by the boxed / OwnedValue representation of either a fast field, or of the score.
///
@@ -86,7 +86,7 @@ impl SortKeyComputer for SortByErasedType {
fn segment_sort_key_computer(
&self,
segment_reader: &dyn SegmentReader,
segment_reader: &crate::SegmentReader,
) -> crate::Result<Self::Child> {
let inner: Box<dyn ErasedSegmentSortKeyComputer> = match self {
Self::Field(column_name) => {

View File

@@ -1,6 +1,6 @@
use crate::collector::sort_key::NaturalComparator;
use crate::collector::{SegmentSortKeyComputer, SortKeyComputer, TopNComputer};
use crate::{DocAddress, DocId, Score, SegmentReader};
use crate::{DocAddress, DocId, Score};
/// Sort by similarity score.
#[derive(Clone, Debug, Copy)]
@@ -19,7 +19,7 @@ impl SortKeyComputer for SortBySimilarityScore {
fn segment_sort_key_computer(
&self,
_segment_reader: &dyn SegmentReader,
_segment_reader: &crate::SegmentReader,
) -> crate::Result<Self::Child> {
Ok(SortBySimilarityScore)
}
@@ -29,7 +29,7 @@ impl SortKeyComputer for SortBySimilarityScore {
&self,
k: usize,
weight: &dyn crate::query::Weight,
reader: &dyn SegmentReader,
reader: &crate::SegmentReader,
segment_ord: u32,
) -> crate::Result<Vec<(Self::SortKey, DocAddress)>> {
let mut top_n: TopNComputer<Score, DocId, Self::Comparator> =

View File

@@ -61,7 +61,7 @@ impl<T: FastValue> SortKeyComputer for SortByStaticFastValue<T> {
fn segment_sort_key_computer(
&self,
segment_reader: &dyn SegmentReader,
segment_reader: &SegmentReader,
) -> crate::Result<Self::Child> {
let sort_column_opt = segment_reader.fast_fields().u64_lenient(&self.field)?;
let (sort_column, _sort_column_type) =

View File

@@ -3,7 +3,7 @@ use columnar::StrColumn;
use crate::collector::sort_key::NaturalComparator;
use crate::collector::{SegmentSortKeyComputer, SortKeyComputer};
use crate::termdict::TermOrdinal;
use crate::{DocId, Score, SegmentReader};
use crate::{DocId, Score};
/// Sort by the first value of a string column.
///
@@ -35,7 +35,7 @@ impl SortKeyComputer for SortByString {
fn segment_sort_key_computer(
&self,
segment_reader: &dyn SegmentReader,
segment_reader: &crate::SegmentReader,
) -> crate::Result<Self::Child> {
let str_column_opt = segment_reader.fast_fields().str(&self.column_name)?;
Ok(ByStringColumnSegmentSortKeyComputer { str_column_opt })

View File

@@ -119,7 +119,7 @@ pub trait SortKeyComputer: Sync {
&self,
k: usize,
weight: &dyn crate::query::Weight,
reader: &dyn SegmentReader,
reader: &crate::SegmentReader,
segment_ord: u32,
) -> crate::Result<Vec<(Self::SortKey, DocAddress)>> {
let with_scoring = self.requires_scoring();
@@ -135,7 +135,7 @@ pub trait SortKeyComputer: Sync {
}
/// Builds a child sort key computer for a specific segment.
fn segment_sort_key_computer(&self, segment_reader: &dyn SegmentReader) -> Result<Self::Child>;
fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result<Self::Child>;
}
impl<HeadSortKeyComputer, TailSortKeyComputer> SortKeyComputer
@@ -156,7 +156,7 @@ where
(self.0.comparator(), self.1.comparator())
}
fn segment_sort_key_computer(&self, segment_reader: &dyn SegmentReader) -> Result<Self::Child> {
fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result<Self::Child> {
Ok((
self.0.segment_sort_key_computer(segment_reader)?,
self.1.segment_sort_key_computer(segment_reader)?,
@@ -357,7 +357,7 @@ where
)
}
fn segment_sort_key_computer(&self, segment_reader: &dyn SegmentReader) -> Result<Self::Child> {
fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result<Self::Child> {
let sort_key_computer1 = self.0.segment_sort_key_computer(segment_reader)?;
let sort_key_computer2 = self.1.segment_sort_key_computer(segment_reader)?;
let sort_key_computer3 = self.2.segment_sort_key_computer(segment_reader)?;
@@ -420,7 +420,7 @@ where
SortKeyComputer4::Comparator,
);
fn segment_sort_key_computer(&self, segment_reader: &dyn SegmentReader) -> Result<Self::Child> {
fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result<Self::Child> {
let sort_key_computer1 = self.0.segment_sort_key_computer(segment_reader)?;
let sort_key_computer2 = self.1.segment_sort_key_computer(segment_reader)?;
let sort_key_computer3 = self.2.segment_sort_key_computer(segment_reader)?;
@@ -454,7 +454,7 @@ where
impl<F, SegmentF, TSortKey> SortKeyComputer for F
where
F: 'static + Send + Sync + Fn(&dyn SegmentReader) -> SegmentF,
F: 'static + Send + Sync + Fn(&SegmentReader) -> SegmentF,
SegmentF: 'static + FnMut(DocId) -> TSortKey,
TSortKey: 'static + PartialOrd + Clone + Send + Sync + std::fmt::Debug,
{
@@ -462,7 +462,7 @@ where
type Child = SegmentF;
type Comparator = NaturalComparator;
fn segment_sort_key_computer(&self, segment_reader: &dyn SegmentReader) -> Result<Self::Child> {
fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result<Self::Child> {
Ok((self)(segment_reader))
}
}
@@ -509,10 +509,10 @@ mod tests {
#[test]
fn test_lazy_score_computer() {
let score_computer_primary = |_segment_reader: &dyn SegmentReader| |_doc: DocId| 200u32;
let score_computer_primary = |_segment_reader: &SegmentReader| |_doc: DocId| 200u32;
let call_count = Arc::new(AtomicUsize::new(0));
let call_count_clone = call_count.clone();
let score_computer_secondary = move |_segment_reader: &dyn SegmentReader| {
let score_computer_secondary = move |_segment_reader: &SegmentReader| {
let call_count_new_clone = call_count_clone.clone();
move |_doc: DocId| {
call_count_new_clone.fetch_add(1, AtomicOrdering::SeqCst);
@@ -572,10 +572,10 @@ mod tests {
#[test]
fn test_lazy_score_computer_dynamic_ordering() {
let score_computer_primary = |_segment_reader: &dyn SegmentReader| |_doc: DocId| 200u32;
let score_computer_primary = |_segment_reader: &SegmentReader| |_doc: DocId| 200u32;
let call_count = Arc::new(AtomicUsize::new(0));
let call_count_clone = call_count.clone();
let score_computer_secondary = move |_segment_reader: &dyn SegmentReader| {
let score_computer_secondary = move |_segment_reader: &SegmentReader| {
let call_count_new_clone = call_count_clone.clone();
move |_doc: DocId| {
call_count_new_clone.fetch_add(1, AtomicOrdering::SeqCst);

View File

@@ -32,11 +32,7 @@ where TSortKeyComputer: SortKeyComputer + Send + Sync + 'static
self.sort_key_computer.check_schema(schema)
}
fn for_segment(
&self,
segment_ord: u32,
segment_reader: &dyn SegmentReader,
) -> Result<Self::Child> {
fn for_segment(&self, segment_ord: u32, segment_reader: &SegmentReader) -> Result<Self::Child> {
let segment_sort_key_computer = self
.sort_key_computer
.segment_sort_key_computer(segment_reader)?;
@@ -67,7 +63,7 @@ where TSortKeyComputer: SortKeyComputer + Send + Sync + 'static
&self,
weight: &dyn Weight,
segment_ord: u32,
reader: &dyn SegmentReader,
reader: &SegmentReader,
) -> crate::Result<Vec<(TSortKeyComputer::SortKey, DocAddress)>> {
let k = self.doc_range.end;
let docs = self

View File

@@ -5,7 +5,7 @@ use crate::query::{AllQuery, QueryParser};
use crate::schema::{Schema, FAST, TEXT};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
use crate::{DateTime, DocAddress, Index, Searcher, SegmentReader, TantivyDocument};
use crate::{DateTime, DocAddress, Index, Searcher, TantivyDocument};
pub const TEST_COLLECTOR_WITH_SCORE: TestCollector = TestCollector {
compute_score: true,
@@ -109,7 +109,7 @@ impl Collector for TestCollector {
fn for_segment(
&self,
segment_id: SegmentOrdinal,
_reader: &dyn SegmentReader,
_reader: &SegmentReader,
) -> crate::Result<TestSegmentCollector> {
Ok(TestSegmentCollector {
segment_id,
@@ -180,7 +180,7 @@ impl Collector for FastFieldTestCollector {
fn for_segment(
&self,
_: SegmentOrdinal,
segment_reader: &dyn SegmentReader,
segment_reader: &SegmentReader,
) -> crate::Result<FastFieldSegmentCollector> {
let reader = segment_reader
.fast_fields()
@@ -243,7 +243,7 @@ impl Collector for BytesFastFieldTestCollector {
fn for_segment(
&self,
_segment_local_id: u32,
segment_reader: &dyn SegmentReader,
segment_reader: &SegmentReader,
) -> crate::Result<BytesFastFieldSegmentCollector> {
let column_opt = segment_reader.fast_fields().bytes(&self.field)?;
Ok(BytesFastFieldSegmentCollector {

View File

@@ -393,7 +393,7 @@ impl TopDocs {
/// // This is where we build our collector with our custom score.
/// let top_docs_by_custom_score = TopDocs
/// ::with_limit(10)
/// .tweak_score(move |segment_reader: &dyn SegmentReader| {
/// .tweak_score(move |segment_reader: &SegmentReader| {
/// // The argument is a function that returns our scoring
/// // function.
/// //
@@ -442,7 +442,7 @@ pub struct TweakScoreFn<F>(F);
impl<F, TTweakScoreSortKeyFn, TSortKey> SortKeyComputer for TweakScoreFn<F>
where
F: 'static + Send + Sync + Fn(&dyn SegmentReader) -> TTweakScoreSortKeyFn,
F: 'static + Send + Sync + Fn(&SegmentReader) -> TTweakScoreSortKeyFn,
TTweakScoreSortKeyFn: 'static + Fn(DocId, Score) -> TSortKey,
TweakScoreSegmentSortKeyComputer<TTweakScoreSortKeyFn>:
SegmentSortKeyComputer<SortKey = TSortKey, SegmentSortKey = TSortKey>,
@@ -458,7 +458,7 @@ where
fn segment_sort_key_computer(
&self,
segment_reader: &dyn SegmentReader,
segment_reader: &SegmentReader,
) -> crate::Result<Self::Child> {
Ok({
TweakScoreSegmentSortKeyComputer {
@@ -1525,7 +1525,7 @@ mod tests {
let text_query = query_parser.parse_query("droopy tax")?;
let collector = TopDocs::with_limit(2)
.and_offset(1)
.order_by(move |_segment_reader: &dyn SegmentReader| move |doc: DocId| doc);
.order_by(move |_segment_reader: &SegmentReader| move |doc: DocId| doc);
let score_docs: Vec<(u32, DocAddress)> =
index.reader()?.searcher().search(&text_query, &collector)?;
assert_eq!(
@@ -1543,7 +1543,7 @@ mod tests {
let text_query = query_parser.parse_query("droopy tax").unwrap();
let collector = TopDocs::with_limit(2)
.and_offset(1)
.order_by(move |_segment_reader: &dyn SegmentReader| move |doc: DocId| doc);
.order_by(move |_segment_reader: &SegmentReader| move |doc: DocId| doc);
let score_docs: Vec<(u32, DocAddress)> = index
.reader()
.unwrap()

View File

@@ -4,7 +4,7 @@ use std::{fmt, io};
use crate::collector::Collector;
use crate::core::Executor;
use crate::index::{ArcSegmentReader, SegmentId, SegmentReader};
use crate::index::{SegmentId, SegmentReader};
use crate::query::{Bm25StatisticsProvider, EnableScoring, Query};
use crate::schema::document::DocumentDeserialize;
use crate::schema::{Schema, Term};
@@ -36,7 +36,7 @@ pub struct SearcherGeneration {
impl SearcherGeneration {
pub(crate) fn from_segment_readers(
segment_readers: &[ArcSegmentReader],
segment_readers: &[SegmentReader],
generation_id: u64,
) -> Self {
let mut segment_id_to_del_opstamp = BTreeMap::new();
@@ -133,7 +133,7 @@ impl Searcher {
pub fn doc_freq(&self, term: &Term) -> crate::Result<u64> {
let mut total_doc_freq = 0;
for segment_reader in &self.inner.segment_readers {
let inverted_index = segment_reader.as_ref().inverted_index(term.field())?;
let inverted_index = segment_reader.inverted_index(term.field())?;
let doc_freq = inverted_index.doc_freq(term)?;
total_doc_freq += u64::from(doc_freq);
}
@@ -146,7 +146,7 @@ impl Searcher {
pub async fn doc_freq_async(&self, term: &Term) -> crate::Result<u64> {
let mut total_doc_freq = 0;
for segment_reader in &self.inner.segment_readers {
let inverted_index = segment_reader.as_ref().inverted_index(term.field())?;
let inverted_index = segment_reader.inverted_index(term.field())?;
let doc_freq = inverted_index.doc_freq_async(term).await?;
total_doc_freq += u64::from(doc_freq);
}
@@ -154,13 +154,13 @@ impl Searcher {
}
/// Return the list of segment readers
pub fn segment_readers(&self) -> &[ArcSegmentReader] {
pub fn segment_readers(&self) -> &[SegmentReader] {
&self.inner.segment_readers
}
/// Returns the segment_reader associated with the given segment_ord
pub fn segment_reader(&self, segment_ord: u32) -> &dyn SegmentReader {
self.inner.segment_readers[segment_ord as usize].as_ref()
pub fn segment_reader(&self, segment_ord: u32) -> &SegmentReader {
&self.inner.segment_readers[segment_ord as usize]
}
/// Runs a query on the segment readers wrapped by the searcher.
@@ -229,11 +229,7 @@ impl Searcher {
let segment_readers = self.segment_readers();
let fruits = executor.map(
|(segment_ord, segment_reader)| {
collector.collect_segment(
weight.as_ref(),
segment_ord as u32,
segment_reader.as_ref(),
)
collector.collect_segment(weight.as_ref(), segment_ord as u32, segment_reader)
},
segment_readers.iter().enumerate(),
)?;
@@ -263,7 +259,7 @@ impl From<Arc<SearcherInner>> for Searcher {
pub(crate) struct SearcherInner {
schema: Schema,
index: Index,
segment_readers: Vec<ArcSegmentReader>,
segment_readers: Vec<SegmentReader>,
store_readers: Vec<StoreReader>,
generation: TrackedObject<SearcherGeneration>,
}
@@ -273,7 +269,7 @@ impl SearcherInner {
pub(crate) fn new(
schema: Schema,
index: Index,
segment_readers: Vec<ArcSegmentReader>,
segment_readers: Vec<SegmentReader>,
generation: TrackedObject<SearcherGeneration>,
doc_store_cache_num_blocks: usize,
) -> io::Result<SearcherInner> {
@@ -305,7 +301,7 @@ impl fmt::Debug for Searcher {
let segment_ids = self
.segment_readers()
.iter()
.map(|segment_reader| segment_reader.segment_id())
.map(SegmentReader::segment_id)
.collect::<Vec<_>>();
write!(f, "Searcher({segment_ids:?})")
}

View File

@@ -676,7 +676,7 @@ mod tests {
let num_segments = reader.searcher().segment_readers().len();
assert!(num_segments <= 4);
let num_components_except_deletes_and_tempstore =
crate::index::SegmentComponent::iterator().len() - 2;
crate::index::SegmentComponent::iterator().len() - 1;
let max_num_mmapped = num_components_except_deletes_and_tempstore * num_segments;
assert_eventually(|| {
let num_mmapped = mmap_directory.get_cache_info().mmapped.len();

View File

@@ -65,8 +65,8 @@ pub trait DocSet: Send {
/// `seek_danger(..)` until it returns `Found`, and get back to a valid state.
///
/// `seek_lower_bound` can be any `DocId` (in the docset or not) as long as it is in
/// `(target .. seek_result]` where `seek_result` is the first document in the docset greater
/// than to `target`.
/// `(target .. seek_result] U {TERMINATED}` where `seek_result` is the first document in the
/// docset greater than `target`.
///
/// `seek_danger` may return `SeekLowerBound(TERMINATED)`.
///
@@ -98,7 +98,7 @@ pub trait DocSet: Send {
if doc == target {
SeekDangerResult::Found
} else {
SeekDangerResult::SeekLowerBound(self.doc())
SeekDangerResult::SeekLowerBound(doc)
}
}

View File

@@ -96,7 +96,7 @@ mod tests {
};
use crate::time::OffsetDateTime;
use crate::tokenizer::{LowerCaser, RawTokenizer, TextAnalyzer, TokenizerManager};
use crate::{Index, IndexWriter};
use crate::{Index, IndexWriter, SegmentReader};
pub static SCHEMA: Lazy<Schema> = Lazy::new(|| {
let mut schema_builder = Schema::builder();
@@ -430,7 +430,7 @@ mod tests {
.searcher()
.segment_readers()
.iter()
.map(|segment_reader| segment_reader.segment_id())
.map(SegmentReader::segment_id)
.collect();
assert_eq!(segment_ids.len(), 2);
index_writer.merge(&segment_ids[..]).wait().unwrap();

View File

@@ -14,7 +14,7 @@ use crate::directory::error::OpenReadError;
use crate::directory::MmapDirectory;
use crate::directory::{Directory, ManagedDirectory, RamDirectory, INDEX_WRITER_LOCK};
use crate::error::{DataCorruption, TantivyError};
use crate::index::{IndexMeta, SegmentId, SegmentMeta, SegmentMetaInventory, SegmentReader};
use crate::index::{IndexMeta, SegmentId, SegmentMeta, SegmentMetaInventory};
use crate::indexer::index_writer::{
IndexWriterOptions, MAX_NUM_THREAD, MEMORY_BUDGET_NUM_BYTES_MIN,
};
@@ -24,7 +24,7 @@ use crate::reader::{IndexReader, IndexReaderBuilder};
use crate::schema::document::Document;
use crate::schema::{Field, FieldType, Schema};
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::TantivySegmentReader;
use crate::SegmentReader;
fn load_metas(
directory: &dyn Directory,
@@ -492,7 +492,7 @@ impl Index {
let segments = self.searchable_segments()?;
let fields_metadata: Vec<Vec<FieldMetadata>> = segments
.into_iter()
.map(|segment| TantivySegmentReader::open(&segment)?.fields_metadata())
.map(|segment| SegmentReader::open(&segment)?.fields_metadata())
.collect::<Result<_, _>>()?;
Ok(merge_field_meta_data(fields_metadata))
}

View File

@@ -1,8 +1,6 @@
use std::collections::HashSet;
use std::fmt;
use std::path::PathBuf;
use std::sync::atomic::AtomicBool;
use std::sync::Arc;
use serde::{Deserialize, Serialize};
@@ -37,7 +35,6 @@ impl SegmentMetaInventory {
let inner = InnerSegmentMeta {
segment_id,
max_doc,
include_temp_doc_store: Arc::new(AtomicBool::new(true)),
deletes: None,
};
SegmentMeta::from(self.inventory.track(inner))
@@ -85,15 +82,6 @@ impl SegmentMeta {
self.tracked.segment_id
}
/// Removes the Component::TempStore from the alive list and
/// therefore marks the temp docstore file to be deleted by
/// the garbage collection.
pub fn untrack_temp_docstore(&self) {
self.tracked
.include_temp_doc_store
.store(false, std::sync::atomic::Ordering::Relaxed);
}
/// Returns the number of deleted documents.
pub fn num_deleted_docs(&self) -> u32 {
self.tracked
@@ -111,20 +99,9 @@ impl SegmentMeta {
/// is by removing all files that have been created by tantivy
/// and are not used by any segment anymore.
pub fn list_files(&self) -> HashSet<PathBuf> {
if self
.tracked
.include_temp_doc_store
.load(std::sync::atomic::Ordering::Relaxed)
{
SegmentComponent::iterator()
.map(|component| self.relative_path(*component))
.collect::<HashSet<PathBuf>>()
} else {
SegmentComponent::iterator()
.filter(|comp| *comp != &SegmentComponent::TempStore)
.map(|component| self.relative_path(*component))
.collect::<HashSet<PathBuf>>()
}
SegmentComponent::iterator()
.map(|component| self.relative_path(*component))
.collect::<HashSet<PathBuf>>()
}
/// Returns the relative path of a component of our segment.
@@ -138,7 +115,6 @@ impl SegmentMeta {
SegmentComponent::Positions => ".pos".to_string(),
SegmentComponent::Terms => ".term".to_string(),
SegmentComponent::Store => ".store".to_string(),
SegmentComponent::TempStore => ".store.temp".to_string(),
SegmentComponent::FastFields => ".fast".to_string(),
SegmentComponent::FieldNorms => ".fieldnorm".to_string(),
SegmentComponent::Delete => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
@@ -183,7 +159,6 @@ impl SegmentMeta {
segment_id: inner_meta.segment_id,
max_doc,
deletes: None,
include_temp_doc_store: Arc::new(AtomicBool::new(true)),
});
SegmentMeta { tracked }
}
@@ -202,7 +177,6 @@ impl SegmentMeta {
let tracked = self.tracked.map(move |inner_meta| InnerSegmentMeta {
segment_id: inner_meta.segment_id,
max_doc: inner_meta.max_doc,
include_temp_doc_store: Arc::new(AtomicBool::new(true)),
deletes: Some(delete_meta),
});
SegmentMeta { tracked }
@@ -214,14 +188,6 @@ struct InnerSegmentMeta {
segment_id: SegmentId,
max_doc: u32,
pub deletes: Option<DeleteMeta>,
/// If you want to avoid the SegmentComponent::TempStore file to be covered by
/// garbage collection and deleted, set this to true. This is used during merge.
#[serde(skip)]
#[serde(default = "default_temp_store")]
pub(crate) include_temp_doc_store: Arc<AtomicBool>,
}
fn default_temp_store() -> Arc<AtomicBool> {
Arc::new(AtomicBool::new(false))
}
impl InnerSegmentMeta {

View File

@@ -1,9 +1,4 @@
#[cfg(feature = "quickwit")]
use std::future::Future;
use std::io;
#[cfg(feature = "quickwit")]
use std::pin::Pin;
use std::sync::Arc;
use common::json_path_writer::JSON_END_OF_PATH;
use common::{BinarySerializable, ByteCount};
@@ -32,102 +27,7 @@ use crate::termdict::TermDictionary;
///
/// `InvertedIndexReader` are created by calling
/// [`SegmentReader::inverted_index()`](crate::SegmentReader::inverted_index).
pub trait InvertedIndexReader: Send + Sync {
/// Returns the term info associated with the term.
fn get_term_info(&self, term: &Term) -> io::Result<Option<TermInfo>>;
/// Return the term dictionary datastructure.
fn terms(&self) -> &TermDictionary;
/// Return the fields and types encoded in the dictionary in lexicographic order.
/// Only valid on JSON fields.
///
/// Notice: This requires a full scan and therefore **very expensive**.
/// TODO: Move to sstable to use the index.
#[doc(hidden)]
fn list_encoded_json_fields(&self) -> io::Result<Vec<InvertedIndexFieldSpace>>;
/// Returns a block postings given a `Term`.
/// This method is for an advanced usage only.
///
/// Most users should prefer using [`Self::read_postings()`] instead.
fn read_block_postings(
&self,
term: &Term,
option: IndexRecordOption,
) -> io::Result<Option<BlockSegmentPostings>>;
/// Returns a block postings given a `term_info`.
/// This method is for an advanced usage only.
///
/// Most users should prefer using [`Self::read_postings()`] instead.
fn read_block_postings_from_terminfo(
&self,
term_info: &TermInfo,
requested_option: IndexRecordOption,
) -> io::Result<BlockSegmentPostings>;
/// Returns a posting object given a `term_info`.
/// This method is for an advanced usage only.
///
/// Most users should prefer using [`Self::read_postings()`] instead.
fn read_postings_from_terminfo(
&self,
term_info: &TermInfo,
option: IndexRecordOption,
) -> io::Result<SegmentPostings>;
/// Returns the total number of tokens recorded for all documents
/// (including deleted documents).
fn total_num_tokens(&self) -> u64;
/// Returns the segment postings associated with the term, and with the given option,
/// or `None` if the term has never been encountered and indexed.
fn read_postings(
&self,
term: &Term,
option: IndexRecordOption,
) -> io::Result<Option<SegmentPostings>>;
/// Returns the number of documents containing the term.
fn doc_freq(&self, term: &Term) -> io::Result<u32>;
/// Returns the number of documents containing the term asynchronously.
#[cfg(feature = "quickwit")]
fn doc_freq_async<'a>(&'a self, term: &'a Term) -> BoxFuture<'a, io::Result<u32>>;
/// Warmup a block postings given a `Term`.
/// This method is for an advanced usage only.
///
/// returns a boolean, whether the term was found in the dictionary
#[cfg(feature = "quickwit")]
fn warm_postings<'a>(
&'a self,
term: &'a Term,
with_positions: bool,
) -> BoxFuture<'a, io::Result<bool>>;
/// Warmup the block postings for all terms.
/// This method is for an advanced usage only.
///
/// If you know which terms to pre-load, prefer using [`Self::warm_postings`] instead.
#[cfg(feature = "quickwit")]
fn warm_postings_full<'a>(&'a self, with_positions: bool) -> BoxFuture<'a, io::Result<()>>;
}
/// Convenient alias for an atomically reference counted inverted index reader handle.
pub type ArcInvertedIndexReader = Arc<dyn InvertedIndexReader>;
#[cfg(feature = "quickwit")]
/// Boxed future used by async inverted index reader methods.
pub type BoxFuture<'a, T> = Pin<Box<dyn Future<Output = T> + Send + 'a>>;
/// The tantivy inverted index reader is in charge of accessing
/// the inverted index associated with a specific field.
///
/// This is the default implementation of [`InvertedIndexReader`].
pub struct TantivyInvertedIndexReader {
pub struct InvertedIndexReader {
termdict: TermDictionary,
postings_file_slice: FileSlice,
positions_file_slice: FileSlice,
@@ -136,16 +36,11 @@ pub struct TantivyInvertedIndexReader {
}
/// Object that records the amount of space used by a field in an inverted index.
pub struct InvertedIndexFieldSpace {
/// The JSON field name (without the parent field).
pub(crate) struct InvertedIndexFieldSpace {
pub field_name: String,
/// The field type encoded in the term dictionary.
pub field_type: Type,
/// Total postings size for this field.
pub postings_size: ByteCount,
/// Total positions size for this field.
pub positions_size: ByteCount,
/// Number of terms for this field.
pub num_terms: u64,
}
@@ -167,16 +62,16 @@ impl InvertedIndexFieldSpace {
}
}
impl TantivyInvertedIndexReader {
impl InvertedIndexReader {
pub(crate) fn new(
termdict: TermDictionary,
postings_file_slice: FileSlice,
positions_file_slice: FileSlice,
record_option: IndexRecordOption,
) -> io::Result<TantivyInvertedIndexReader> {
) -> io::Result<InvertedIndexReader> {
let (total_num_tokens_slice, postings_body) = postings_file_slice.split(8);
let total_num_tokens = u64::deserialize(&mut total_num_tokens_slice.read_bytes()?)?;
Ok(TantivyInvertedIndexReader {
Ok(InvertedIndexReader {
termdict,
postings_file_slice: postings_body,
positions_file_slice,
@@ -187,8 +82,8 @@ impl TantivyInvertedIndexReader {
/// Creates an empty `InvertedIndexReader` object, which
/// contains no terms at all.
pub fn empty(record_option: IndexRecordOption) -> TantivyInvertedIndexReader {
TantivyInvertedIndexReader {
pub fn empty(record_option: IndexRecordOption) -> InvertedIndexReader {
InvertedIndexReader {
termdict: TermDictionary::empty(),
postings_file_slice: FileSlice::empty(),
positions_file_slice: FileSlice::empty(),
@@ -265,6 +160,29 @@ impl TantivyInvertedIndexReader {
Ok(fields)
}
/// Re-points an existing [`BlockSegmentPostings`] at the posting list
/// described by `term_info`.
///
/// The bytes covering `term_info.postings_range` are read from the postings
/// file and handed to `block_postings.reset(..)` together with the term's
/// document frequency. Reusing one [`BlockSegmentPostings`] this way is handy
/// when walking over many terms, since no new object has to be allocated for
/// each posting list.
///
/// # Warning
///
/// This does not reset the positions list.
///
/// # Errors
///
/// Returns an error if reading the postings bytes fails or if
/// `block_postings.reset(..)` reports one.
pub fn reset_block_postings_from_terminfo(
    &self,
    term_info: &TermInfo,
    block_postings: &mut BlockSegmentPostings,
) -> io::Result<()> {
    let postings_range = term_info.postings_range.clone();
    let postings_bytes = self
        .postings_file_slice
        .slice(postings_range)
        .read_bytes()?;
    block_postings.reset(term_info.doc_freq, postings_bytes)?;
    Ok(())
}
/// Returns a block postings given a `Term`.
/// This method is for an advanced usage only.
///
@@ -364,7 +282,7 @@ impl TantivyInvertedIndexReader {
}
#[cfg(feature = "quickwit")]
impl TantivyInvertedIndexReader {
impl InvertedIndexReader {
pub(crate) async fn get_term_info_async(&self, term: &Term) -> io::Result<Option<TermInfo>> {
self.termdict.get_async(term.serialized_value_bytes()).await
}
@@ -574,84 +492,3 @@ impl TantivyInvertedIndexReader {
.unwrap_or(0u32))
}
}
impl InvertedIndexReader for TantivyInvertedIndexReader {
fn get_term_info(&self, term: &Term) -> io::Result<Option<TermInfo>> {
TantivyInvertedIndexReader::get_term_info(self, term)
}
fn terms(&self) -> &TermDictionary {
TantivyInvertedIndexReader::terms(self)
}
fn list_encoded_json_fields(&self) -> io::Result<Vec<InvertedIndexFieldSpace>> {
TantivyInvertedIndexReader::list_encoded_json_fields(self)
}
fn read_block_postings(
&self,
term: &Term,
option: IndexRecordOption,
) -> io::Result<Option<BlockSegmentPostings>> {
TantivyInvertedIndexReader::read_block_postings(self, term, option)
}
fn read_block_postings_from_terminfo(
&self,
term_info: &TermInfo,
requested_option: IndexRecordOption,
) -> io::Result<BlockSegmentPostings> {
TantivyInvertedIndexReader::read_block_postings_from_terminfo(
self,
term_info,
requested_option,
)
}
fn read_postings_from_terminfo(
&self,
term_info: &TermInfo,
option: IndexRecordOption,
) -> io::Result<SegmentPostings> {
TantivyInvertedIndexReader::read_postings_from_terminfo(self, term_info, option)
}
fn total_num_tokens(&self) -> u64 {
TantivyInvertedIndexReader::total_num_tokens(self)
}
fn read_postings(
&self,
term: &Term,
option: IndexRecordOption,
) -> io::Result<Option<SegmentPostings>> {
TantivyInvertedIndexReader::read_postings(self, term, option)
}
fn doc_freq(&self, term: &Term) -> io::Result<u32> {
TantivyInvertedIndexReader::doc_freq(self, term)
}
#[cfg(feature = "quickwit")]
fn doc_freq_async<'a>(&'a self, term: &'a Term) -> BoxFuture<'a, io::Result<u32>> {
Box::pin(async move { TantivyInvertedIndexReader::doc_freq_async(self, term).await })
}
#[cfg(feature = "quickwit")]
fn warm_postings<'a>(
&'a self,
term: &'a Term,
with_positions: bool,
) -> BoxFuture<'a, io::Result<bool>> {
Box::pin(async move {
TantivyInvertedIndexReader::warm_postings(self, term, with_positions).await
})
}
#[cfg(feature = "quickwit")]
fn warm_postings_full<'a>(&'a self, with_positions: bool) -> BoxFuture<'a, io::Result<()>> {
Box::pin(async move {
TantivyInvertedIndexReader::warm_postings_full(self, with_positions).await
})
}
}

View File

@@ -13,13 +13,8 @@ mod segment_reader;
pub use self::index::{Index, IndexBuilder};
pub(crate) use self::index_meta::SegmentMetaInventory;
pub use self::index_meta::{IndexMeta, IndexSettings, Order, SegmentMeta};
pub use self::inverted_index_reader::{
ArcInvertedIndexReader, InvertedIndexFieldSpace, InvertedIndexReader,
TantivyInvertedIndexReader,
};
pub use self::inverted_index_reader::InvertedIndexReader;
pub use self::segment::Segment;
pub use self::segment_component::SegmentComponent;
pub use self::segment_id::SegmentId;
pub use self::segment_reader::{
ArcSegmentReader, FieldMetadata, SegmentReader, TantivySegmentReader,
};
pub use self::segment_reader::{FieldMetadata, SegmentReader};

View File

@@ -23,8 +23,6 @@ pub enum SegmentComponent {
/// Accessing a document from the store is relatively slow, as it
/// requires to decompress the entire block it belongs to.
Store,
/// Temporary storage of the documents, before streamed to `Store`.
TempStore,
/// Bitset describing which document of the segment is alive.
/// (It was representing deleted docs but changed to represent alive docs from v0.17)
Delete,
@@ -33,14 +31,13 @@ pub enum SegmentComponent {
impl SegmentComponent {
/// Iterates through the components.
pub fn iterator() -> slice::Iter<'static, SegmentComponent> {
static SEGMENT_COMPONENTS: [SegmentComponent; 8] = [
static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [
SegmentComponent::Postings,
SegmentComponent::Positions,
SegmentComponent::FastFields,
SegmentComponent::FieldNorms,
SegmentComponent::Terms,
SegmentComponent::Store,
SegmentComponent::TempStore,
SegmentComponent::Delete,
];
SEGMENT_COMPONENTS.iter()

View File

@@ -9,10 +9,8 @@ use itertools::Itertools;
use crate::directory::{CompositeFile, FileSlice};
use crate::error::DataCorruption;
use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FacetReader, FastFieldReaders};
use crate::fieldnorm::FieldNormReaders;
use crate::index::{
ArcInvertedIndexReader, Segment, SegmentComponent, SegmentId, TantivyInvertedIndexReader,
};
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
use crate::index::{InvertedIndexReader, Segment, SegmentComponent, SegmentId};
use crate::json_utils::json_path_sep_to_dot;
use crate::schema::{Field, IndexRecordOption, Schema, Type};
use crate::space_usage::SegmentSpaceUsage;
@@ -20,93 +18,6 @@ use crate::store::StoreReader;
use crate::termdict::TermDictionary;
use crate::{DocId, Opstamp};
/// Abstraction over a segment reader for accessing all data structures of a segment.
///
/// This trait exists to decouple the query layer from the concrete on-disk layout. Alternative
/// codecs can implement it to expose their own segment representation.
pub trait SegmentReader: Send + Sync {
/// Highest document id ever attributed in this segment + 1.
fn max_doc(&self) -> DocId;
/// Number of alive documents. Deleted documents are not counted.
fn num_docs(&self) -> DocId;
/// Returns the schema of the index this segment belongs to.
fn schema(&self) -> &Schema;
/// Return the number of documents that have been deleted in the segment.
fn num_deleted_docs(&self) -> DocId {
self.max_doc() - self.num_docs()
}
/// Returns true if some of the documents of the segment have been deleted.
fn has_deletes(&self) -> bool {
self.num_deleted_docs() > 0
}
/// Accessor to a segment's fast field reader.
fn fast_fields(&self) -> &FastFieldReaders;
/// Accessor to the `FacetReader` associated with a given `Field`.
fn facet_reader(&self, field_name: &str) -> crate::Result<FacetReader> {
let schema = self.schema();
let field = schema.get_field(field_name)?;
let field_entry = schema.get_field_entry(field);
if field_entry.field_type().value_type() != Type::Facet {
return Err(crate::TantivyError::SchemaError(format!(
"`{field_name}` is not a facet field.`"
)));
}
let Some(facet_column) = self.fast_fields().str(field_name)? else {
panic!("Facet Field `{field_name}` is missing. This should not happen");
};
Ok(FacetReader::new(facet_column))
}
/// Accessor to the segment's field norms readers container.
fn fieldnorms_readers(&self) -> &FieldNormReaders;
/// Accessor to the segment's [`StoreReader`](crate::store::StoreReader).
fn get_store_reader(&self, cache_num_blocks: usize) -> io::Result<StoreReader>;
/// Returns a field reader associated with the field given in argument.
fn inverted_index(&self, field: Field) -> crate::Result<ArcInvertedIndexReader>;
/// Returns the list of fields that have been indexed in the segment.
fn fields_metadata(&self) -> crate::Result<Vec<FieldMetadata>>;
/// Returns the segment id
fn segment_id(&self) -> SegmentId;
/// Returns the delete opstamp
fn delete_opstamp(&self) -> Option<Opstamp>;
/// Returns the bitset representing the alive `DocId`s.
fn alive_bitset(&self) -> Option<&AliveBitSet>;
/// Returns true if the `doc` is marked as deleted.
fn is_deleted(&self, doc: DocId) -> bool {
self.alive_bitset()
.map(|alive_bitset| alive_bitset.is_deleted(doc))
.unwrap_or(false)
}
/// Returns an iterator that will iterate over the alive document ids
fn doc_ids_alive(&self) -> Box<dyn Iterator<Item = DocId> + Send + '_> {
if let Some(alive_bitset) = &self.alive_bitset() {
Box::new(alive_bitset.iter_alive())
} else {
Box::new(0u32..self.max_doc())
}
}
/// Summarize total space usage of this segment.
fn space_usage(&self) -> io::Result<SegmentSpaceUsage>;
}
/// Convenient alias for an atomically reference counted segment reader handle.
pub type ArcSegmentReader = Arc<dyn SegmentReader>;
/// Entry point to access all of the datastructures of the `Segment`
///
/// - term dictionary
@@ -118,8 +29,8 @@ pub type ArcSegmentReader = Arc<dyn SegmentReader>;
/// The segment reader has a very low memory footprint,
/// as close to all of the memory data is mmapped.
#[derive(Clone)]
pub struct TantivySegmentReader {
inv_idx_reader_cache: Arc<RwLock<HashMap<Field, ArcInvertedIndexReader>>>,
pub struct SegmentReader {
inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>,
segment_id: SegmentId,
delete_opstamp: Option<Opstamp>,
@@ -138,9 +49,98 @@ pub struct TantivySegmentReader {
schema: Schema,
}
impl TantivySegmentReader {
impl SegmentReader {
/// Returns the highest document id ever attributed in
/// this segment + 1.
pub fn max_doc(&self) -> DocId {
self.max_doc
}
/// Returns the number of alive documents.
/// Deleted documents are not counted.
pub fn num_docs(&self) -> DocId {
self.num_docs
}
/// Returns the schema of the index this segment belongs to.
pub fn schema(&self) -> &Schema {
&self.schema
}
/// Return the number of documents that have been
/// deleted in the segment.
pub fn num_deleted_docs(&self) -> DocId {
self.max_doc - self.num_docs
}
/// Returns true if some of the documents of the segment have been deleted.
pub fn has_deletes(&self) -> bool {
self.num_deleted_docs() > 0
}
/// Accessor to the segment's fast field readers.
///
/// Returns a reference to the [`FastFieldReaders`] container held by this
/// segment reader. Typed column accessors (for instance `u64(..)` or
/// `str(..)`) are called on the returned value; this method itself takes no
/// field and performs no lookup.
pub fn fast_fields(&self) -> &FastFieldReaders {
&self.fast_fields_readers
}
/// Builds the `FacetReader` for the facet field named `field_name`.
///
/// # Errors
///
/// Fails if `field_name` is not part of the schema, if the field's value
/// type is not `Type::Facet`, or if looking up the string column fails.
///
/// # Panics
///
/// Panics if the field is declared as a facet but no string column exists
/// for it in the fast fields.
pub fn facet_reader(&self, field_name: &str) -> crate::Result<FacetReader> {
    let schema = self.schema();
    let field = schema.get_field(field_name)?;
    let value_type = schema.get_field_entry(field).field_type().value_type();
    if value_type != Type::Facet {
        let err_msg = format!("`{field_name}` is not a facet field.`");
        return Err(crate::TantivyError::SchemaError(err_msg));
    }
    match self.fast_fields().str(field_name)? {
        Some(facet_column) => Ok(FacetReader::new(facet_column)),
        None => panic!("Facet Field `{field_name}` is missing. This should not happen"),
    }
}
/// Returns the field norm reader of `field`.
///
/// Field norms are the length (in tokens) of the fields.
/// It is used in the computation of the [TfIdf](https://fulmicoton.gitbooks.io/tantivy-doc/content/tfidf.html).
///
/// They are simply stored as a fast field, serialized in
/// the `.fieldnorm` file of the segment.
///
/// # Errors
///
/// Propagates any error raised while fetching the reader, and returns a
/// `SchemaError` naming the field when no field norm is available for it.
pub fn get_fieldnorms_reader(&self, field: Field) -> crate::Result<FieldNormReader> {
    if let Some(fieldnorm_reader) = self.fieldnorm_readers.get_field(field)? {
        return Ok(fieldnorm_reader);
    }
    let field_name = self.schema.get_field_name(field);
    Err(crate::TantivyError::SchemaError(format!(
        "Field norm not found for field {field_name:?}. Was the field set to record norm \
         during indexing?"
    )))
}
/// Accessor to the container of field norm readers held by this segment reader.
#[doc(hidden)]
pub fn fieldnorms_readers(&self) -> &FieldNormReaders {
&self.fieldnorm_readers
}
/// Accessor to the segment's [`StoreReader`](crate::store::StoreReader).
///
/// `cache_num_blocks` sets the number of decompressed blocks to be cached in an LRU.
/// The size of blocks is configurable; take it into account when choosing
/// `cache_num_blocks`.
pub fn get_store_reader(&self, cache_num_blocks: usize) -> io::Result<StoreReader> {
StoreReader::open(self.store_file.clone(), cache_num_blocks)
}
/// Open a new segment for reading.
pub fn open(segment: &Segment) -> crate::Result<TantivySegmentReader> {
pub fn open(segment: &Segment) -> crate::Result<SegmentReader> {
Self::open_with_custom_alive_set(segment, None)
}
@@ -148,7 +148,7 @@ impl TantivySegmentReader {
pub fn open_with_custom_alive_set(
segment: &Segment,
custom_bitset: Option<AliveBitSet>,
) -> crate::Result<TantivySegmentReader> {
) -> crate::Result<SegmentReader> {
let termdict_file = segment.open_read(SegmentComponent::Terms)?;
let termdict_composite = CompositeFile::open(&termdict_file)?;
@@ -190,7 +190,7 @@ impl TantivySegmentReader {
.map(|alive_bitset| alive_bitset.num_alive_docs() as u32)
.unwrap_or(max_doc);
Ok(TantivySegmentReader {
Ok(SegmentReader {
inv_idx_reader_cache: Default::default(),
num_docs,
max_doc,
@@ -206,52 +206,6 @@ impl TantivySegmentReader {
schema,
})
}
}
impl SegmentReader for TantivySegmentReader {
/// Returns the highest document id ever attributed in
/// this segment + 1.
fn max_doc(&self) -> DocId {
self.max_doc
}
/// Returns the number of alive documents.
/// Deleted documents are not counted.
fn num_docs(&self) -> DocId {
self.num_docs
}
/// Returns the schema of the index this segment belongs to.
fn schema(&self) -> &Schema {
&self.schema
}
/// Accessor to a segment's fast field reader given a field.
///
/// Returns the u64 fast value reader if the field
/// is a u64 field indexed as "fast".
///
/// Return a FastFieldNotAvailableError if the field is not
/// declared as a fast field in the schema.
///
/// # Panics
/// May panic if the index is corrupted.
fn fast_fields(&self) -> &FastFieldReaders {
&self.fast_fields_readers
}
#[doc(hidden)]
fn fieldnorms_readers(&self) -> &FieldNormReaders {
&self.fieldnorm_readers
}
/// Accessor to the segment's [`StoreReader`](crate::store::StoreReader).
///
/// `cache_num_blocks` sets the number of decompressed blocks to be cached in an LRU.
/// The size of blocks is configurable, this should be reflexted in the
fn get_store_reader(&self, cache_num_blocks: usize) -> io::Result<StoreReader> {
StoreReader::open(self.store_file.clone(), cache_num_blocks)
}
/// Returns a field reader associated with the field given in argument.
/// If the field was not present in the index during indexing time,
@@ -265,7 +219,7 @@ impl SegmentReader for TantivySegmentReader {
/// is returned.
/// Similarly, if the field is marked as indexed but no term has been indexed for the given
/// index, an empty `InvertedIndexReader` is returned (but no warning is logged).
fn inverted_index(&self, field: Field) -> crate::Result<ArcInvertedIndexReader> {
pub fn inverted_index(&self, field: Field) -> crate::Result<Arc<InvertedIndexReader>> {
if let Some(inv_idx_reader) = self
.inv_idx_reader_cache
.read()
@@ -290,7 +244,7 @@ impl SegmentReader for TantivySegmentReader {
//
// Returns an empty inverted index.
let record_option = record_option_opt.unwrap_or(IndexRecordOption::Basic);
return Ok(Arc::new(TantivyInvertedIndexReader::empty(record_option)));
return Ok(Arc::new(InvertedIndexReader::empty(record_option)));
}
let record_option = record_option_opt.unwrap();
@@ -314,7 +268,7 @@ impl SegmentReader for TantivySegmentReader {
DataCorruption::comment_only(error_msg)
})?;
let inv_idx_reader: ArcInvertedIndexReader = Arc::new(TantivyInvertedIndexReader::new(
let inv_idx_reader = Arc::new(InvertedIndexReader::new(
TermDictionary::open(termdict_file)?,
postings_file,
positions_file,
@@ -344,7 +298,7 @@ impl SegmentReader for TantivySegmentReader {
/// Disclaimer: Some fields may not be listed here. For instance, if the schema contains a json
/// field that is not indexed nor a fast field but is stored, it is possible for the field
/// to not be listed.
fn fields_metadata(&self) -> crate::Result<Vec<FieldMetadata>> {
pub fn fields_metadata(&self) -> crate::Result<Vec<FieldMetadata>> {
let mut indexed_fields: Vec<FieldMetadata> = Vec::new();
let mut map_to_canonical = FnvHashMap::default();
for (field, field_entry) in self.schema().fields() {
@@ -466,22 +420,39 @@ impl SegmentReader for TantivySegmentReader {
}
/// Returns the segment id
fn segment_id(&self) -> SegmentId {
pub fn segment_id(&self) -> SegmentId {
self.segment_id
}
/// Returns the delete opstamp
fn delete_opstamp(&self) -> Option<Opstamp> {
pub fn delete_opstamp(&self) -> Option<Opstamp> {
self.delete_opstamp
}
/// Returns the bitset representing the alive `DocId`s.
fn alive_bitset(&self) -> Option<&AliveBitSet> {
pub fn alive_bitset(&self) -> Option<&AliveBitSet> {
self.alive_bitset_opt.as_ref()
}
/// Tells whether `doc` is marked as deleted.
///
/// When the segment has no alive bitset, no document is considered deleted
/// and this returns `false`.
pub fn is_deleted(&self, doc: DocId) -> bool {
    match self.alive_bitset() {
        Some(alive_bitset) => alive_bitset.is_deleted(doc),
        None => false,
    }
}
/// Returns a boxed iterator over the alive document ids of the segment.
///
/// With an alive bitset, the ids come from `iter_alive()`; without one,
/// every id in `0..max_doc` is yielded.
pub fn doc_ids_alive(&self) -> Box<dyn Iterator<Item = DocId> + Send + '_> {
    match &self.alive_bitset_opt {
        Some(alive_bitset) => Box::new(alive_bitset.iter_alive()),
        None => Box::new(0u32..self.max_doc),
    }
}
/// Summarize total space usage of this segment.
fn space_usage(&self) -> io::Result<SegmentSpaceUsage> {
pub fn space_usage(&self) -> io::Result<SegmentSpaceUsage> {
Ok(SegmentSpaceUsage::new(
self.num_docs(),
self.termdict_composite.space_usage(self.schema()),
@@ -605,7 +576,7 @@ fn intersect_alive_bitset(
}
}
impl fmt::Debug for TantivySegmentReader {
impl fmt::Debug for SegmentReader {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "SegmentReader({:?})", self.segment_id)
}

View File

@@ -250,15 +250,11 @@ mod tests {
struct DummyWeight;
impl Weight for DummyWeight {
fn scorer(
&self,
_reader: &dyn SegmentReader,
_boost: Score,
) -> crate::Result<Box<dyn Scorer>> {
fn scorer(&self, _reader: &SegmentReader, _boost: Score) -> crate::Result<Box<dyn Scorer>> {
Err(crate::TantivyError::InternalError("dummy impl".to_owned()))
}
fn explain(&self, _reader: &dyn SegmentReader, _doc: DocId) -> crate::Result<Explanation> {
fn explain(&self, _reader: &SegmentReader, _doc: DocId) -> crate::Result<Explanation> {
Err(crate::TantivyError::InternalError("dummy impl".to_owned()))
}
}

View File

@@ -12,9 +12,7 @@ use super::{AddBatch, AddBatchReceiver, AddBatchSender, PreparedCommit};
use crate::directory::{DirectoryLock, GarbageCollectionResult, TerminatingWrite};
use crate::error::TantivyError;
use crate::fastfield::write_alive_bitset;
use crate::index::{
Index, Segment, SegmentComponent, SegmentId, SegmentMeta, SegmentReader, TantivySegmentReader,
};
use crate::index::{Index, Segment, SegmentComponent, SegmentId, SegmentMeta, SegmentReader};
use crate::indexer::delete_queue::{DeleteCursor, DeleteQueue};
use crate::indexer::doc_opstamp_mapping::DocToOpstampMapping;
use crate::indexer::index_writer_status::IndexWriterStatus;
@@ -96,7 +94,7 @@ pub struct IndexWriter<D: Document = TantivyDocument> {
fn compute_deleted_bitset(
alive_bitset: &mut BitSet,
segment_reader: &dyn SegmentReader,
segment_reader: &SegmentReader,
delete_cursor: &mut DeleteCursor,
doc_opstamps: &DocToOpstampMapping,
target_opstamp: Opstamp,
@@ -145,7 +143,7 @@ pub fn advance_deletes(
return Ok(());
}
let segment_reader = TantivySegmentReader::open(&segment)?;
let segment_reader = SegmentReader::open(&segment)?;
let max_doc = segment_reader.max_doc();
let mut alive_bitset: BitSet = match segment_entry.alive_bitset() {
@@ -220,7 +218,7 @@ fn index_documents<D: Document>(
let alive_bitset_opt = apply_deletes(&segment_with_max_doc, &mut delete_cursor, &doc_opstamps)?;
let meta = segment_with_max_doc.meta().clone();
meta.untrack_temp_docstore();
// update segment_updater inventory to remove tempstore
let segment_entry = SegmentEntry::new(meta, delete_cursor, alive_bitset_opt);
segment_updater.schedule_add_segment(segment_entry).wait()?;
@@ -245,7 +243,7 @@ fn apply_deletes(
.max()
.expect("Empty DocOpstamp is forbidden");
let segment_reader = TantivySegmentReader::open(segment)?;
let segment_reader = SegmentReader::open(segment)?;
let doc_to_opstamps = DocToOpstampMapping::WithMap(doc_opstamps);
let max_doc = segment.meta().max_doc();

View File

@@ -1,3 +1,5 @@
use std::sync::Arc;
use columnar::{
ColumnType, ColumnarReader, MergeRowOrder, RowAddr, ShuffleMergeOrder, StackMergeOrder,
};
@@ -10,14 +12,14 @@ use crate::docset::{DocSet, TERMINATED};
use crate::error::DataCorruption;
use crate::fastfield::AliveBitSet;
use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter};
use crate::index::{Segment, SegmentComponent, SegmentReader, TantivySegmentReader};
use crate::index::{Segment, SegmentComponent, SegmentReader};
use crate::indexer::doc_id_mapping::{MappingType, SegmentDocIdMapping};
use crate::indexer::SegmentSerializer;
use crate::postings::{InvertedIndexSerializer, Postings, SegmentPostings};
use crate::schema::{value_type_to_column_type, Field, FieldType, Schema};
use crate::store::StoreWriter;
use crate::termdict::{TermMerger, TermOrdinal};
use crate::{ArcInvertedIndexReader, DocAddress, DocId};
use crate::{DocAddress, DocId, InvertedIndexReader};
/// Segment's max doc must be `< MAX_DOC_LIMIT`.
///
@@ -25,7 +27,7 @@ use crate::{ArcInvertedIndexReader, DocAddress, DocId};
pub const MAX_DOC_LIMIT: u32 = 1 << 31;
fn estimate_total_num_tokens_in_single_segment(
reader: &dyn SegmentReader,
reader: &SegmentReader,
field: Field,
) -> crate::Result<u64> {
// There are no deletes. We can simply use the exact value saved into the posting list.
@@ -66,7 +68,7 @@ fn estimate_total_num_tokens_in_single_segment(
Ok((segment_num_tokens as f64 * ratio) as u64)
}
fn estimate_total_num_tokens(readers: &[TantivySegmentReader], field: Field) -> crate::Result<u64> {
fn estimate_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::Result<u64> {
let mut total_num_tokens: u64 = 0;
for reader in readers {
total_num_tokens += estimate_total_num_tokens_in_single_segment(reader, field)?;
@@ -76,7 +78,7 @@ fn estimate_total_num_tokens(readers: &[TantivySegmentReader], field: Field) ->
pub struct IndexMerger {
schema: Schema,
pub(crate) readers: Vec<TantivySegmentReader>,
pub(crate) readers: Vec<SegmentReader>,
max_doc: u32,
}
@@ -168,10 +170,8 @@ impl IndexMerger {
let mut readers = vec![];
for (segment, new_alive_bitset_opt) in segments.iter().zip(alive_bitset_opt) {
if segment.meta().num_docs() > 0 {
let reader = TantivySegmentReader::open_with_custom_alive_set(
segment,
new_alive_bitset_opt,
)?;
let reader =
SegmentReader::open_with_custom_alive_set(segment, new_alive_bitset_opt)?;
readers.push(reader);
}
}
@@ -204,20 +204,8 @@ impl IndexMerger {
let fieldnorms_readers: Vec<FieldNormReader> = self
.readers
.iter()
.map(|reader| {
reader
.fieldnorms_readers()
.get_field(field)?
.ok_or_else(|| {
let field_name = self.schema.get_field_name(field);
let err_msg = format!(
"Field norm not found for field {field_name:?}. Was the field set \
to record norm during indexing?"
);
crate::TantivyError::SchemaError(err_msg)
})
})
.collect::<crate::Result<_>>()?;
.map(|reader| reader.get_fieldnorms_reader(field))
.collect::<Result<_, _>>()?;
for old_doc_addr in doc_id_mapping.iter_old_doc_addrs() {
let fieldnorms_reader = &fieldnorms_readers[old_doc_addr.segment_ord as usize];
let fieldnorm_id = fieldnorms_reader.fieldnorm_id(old_doc_addr.doc_id);
@@ -274,7 +262,7 @@ impl IndexMerger {
}),
);
let has_deletes: bool = self.readers.iter().any(|reader| reader.has_deletes());
let has_deletes: bool = self.readers.iter().any(SegmentReader::has_deletes);
let mapping_type = if has_deletes {
MappingType::StackedWithDeletes
} else {
@@ -309,7 +297,7 @@ impl IndexMerger {
let mut max_term_ords: Vec<TermOrdinal> = Vec::new();
let field_readers: Vec<ArcInvertedIndexReader> = self
let field_readers: Vec<Arc<InvertedIndexReader>> = self
.readers
.iter()
.map(|reader| reader.inverted_index(indexed_field))
@@ -378,7 +366,7 @@ impl IndexMerger {
// Let's compute the list of non-empty posting lists
for (segment_ord, term_info) in merged_terms.current_segment_ords_and_term_infos() {
let segment_reader = &self.readers[segment_ord];
let inverted_index = field_readers[segment_ord].as_ref();
let inverted_index: &InvertedIndexReader = &field_readers[segment_ord];
let segment_postings = inverted_index
.read_postings_from_terminfo(&term_info, segment_postings_option)?;
let alive_bitset_opt = segment_reader.alive_bitset();
@@ -1546,7 +1534,7 @@ mod tests {
for segment_reader in searcher.segment_readers() {
let mut term_scorer = term_query
.specialized_weight(EnableScoring::enabled_from_searcher(&searcher))?
.term_scorer_for_test(segment_reader.as_ref(), 1.0)?
.term_scorer_for_test(segment_reader, 1.0)?
.unwrap();
// the difference compared to before is intrinsic to the bm25 formula. no worries
// there.

View File

@@ -710,7 +710,7 @@ mod tests {
use crate::indexer::segment_updater::merge_filtered_segments;
use crate::query::QueryParser;
use crate::schema::*;
use crate::{Directory, DocAddress, Index, Segment, SegmentReader};
use crate::{Directory, DocAddress, Index, Segment};
#[test]
fn test_delete_during_merge() -> crate::Result<()> {

View File

@@ -871,7 +871,7 @@ mod tests {
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0u32);
fn assert_type(reader: &dyn SegmentReader, field: &str, typ: ColumnType) {
fn assert_type(reader: &SegmentReader, field: &str, typ: ColumnType) {
let cols = reader.fast_fields().dynamic_column_handles(field).unwrap();
assert_eq!(cols.len(), 1, "{field}");
assert_eq!(cols[0].column_type(), typ, "{field}");
@@ -890,7 +890,7 @@ mod tests {
assert_type(segment_reader, "json.my_arr", ColumnType::I64);
assert_type(segment_reader, "json.my_arr.my_key", ColumnType::Str);
fn assert_empty(reader: &dyn SegmentReader, field: &str) {
fn assert_empty(reader: &SegmentReader, field: &str) {
let cols = reader.fast_fields().dynamic_column_handles(field).unwrap();
assert_eq!(cols.len(), 0);
}

View File

@@ -224,9 +224,8 @@ pub use self::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN, TERMINATED};
pub use crate::core::{json_utils, Executor, Searcher, SearcherGeneration};
pub use crate::directory::Directory;
pub use crate::index::{
ArcInvertedIndexReader, ArcSegmentReader, Index, IndexBuilder, IndexMeta, IndexSettings,
InvertedIndexReader, Order, Segment, SegmentMeta, SegmentReader, TantivyInvertedIndexReader,
TantivySegmentReader,
Index, IndexBuilder, IndexMeta, IndexSettings, InvertedIndexReader, Order, Segment,
SegmentMeta, SegmentReader,
};
pub use crate::indexer::{IndexWriter, SingleSegmentIndexWriter};
pub use crate::schema::{Document, TantivyDocument, Term};
@@ -524,11 +523,11 @@ pub mod tests {
let searcher = index_reader.searcher();
let reader = searcher.segment_reader(0);
{
let fieldnorm_reader = reader.fieldnorms_readers().get_field(text_field)?.unwrap();
let fieldnorm_reader = reader.get_fieldnorms_reader(text_field)?;
assert_eq!(fieldnorm_reader.fieldnorm(0), 3);
}
{
let fieldnorm_reader = reader.fieldnorms_readers().get_field(title_field)?.unwrap();
let fieldnorm_reader = reader.get_fieldnorms_reader(title_field)?;
assert_eq!(fieldnorm_reader.fieldnorm_id(0), 0);
}
Ok(())
@@ -546,18 +545,15 @@ pub mod tests {
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let segment_reader: &dyn SegmentReader = searcher.segment_reader(0);
let fieldnorms_reader = segment_reader
.fieldnorms_readers()
.get_field(text_field)?
.unwrap();
let segment_reader: &SegmentReader = searcher.segment_reader(0);
let fieldnorms_reader = segment_reader.get_fieldnorms_reader(text_field)?;
assert_eq!(fieldnorms_reader.fieldnorm(0), 3);
assert_eq!(fieldnorms_reader.fieldnorm(1), 0);
assert_eq!(fieldnorms_reader.fieldnorm(2), 2);
Ok(())
}
fn advance_undeleted(docset: &mut dyn DocSet, reader: &dyn SegmentReader) -> bool {
fn advance_undeleted(docset: &mut dyn DocSet, reader: &SegmentReader) -> bool {
let mut doc = docset.advance();
while doc != TERMINATED {
if !reader.is_deleted(doc) {
@@ -1074,7 +1070,7 @@ pub mod tests {
}
let reader = index.reader()?;
let searcher = reader.searcher();
let segment_reader: &dyn SegmentReader = searcher.segment_reader(0);
let segment_reader: &SegmentReader = searcher.segment_reader(0);
{
let fast_field_reader_res = segment_reader.fast_fields().u64("text");
assert!(fast_field_reader_res.is_err());

View File

@@ -182,6 +182,32 @@ impl BlockSegmentPostings {
self.freq_reading_option
}
// Repositions this `BlockSegmentPostings` onto another posting list of the
// postings file.
//
// This is handy when walking through a list of terms and consuming the
// posting list of each of them, as it avoids reallocating a
// `BlockSegmentPostings` for every term.
//
// # Warning
//
// This does not reset the positions list.
pub(crate) fn reset(&mut self, doc_freq: u32, postings_data: OwnedBytes) -> io::Result<()> {
    let (skip_data_opt, postings_data) =
        split_into_skips_and_postings(doc_freq, postings_data)?;
    self.data = postings_data;
    self.block_max_score_cache = None;
    // Mark the block as not loaded so that the `load_block()` call below
    // is not short-circuited.
    self.block_loaded = false;
    // When there is no skip data, the skip reader is reset on an empty buffer.
    let skip_data = skip_data_opt.unwrap_or_else(OwnedBytes::empty);
    self.skip_reader.reset(skip_data, doc_freq);
    self.doc_freq = doc_freq;
    self.load_block();
    Ok(())
}
/// Returns the overall number of documents in the block postings.
/// It does not take into account whether documents are deleted or not.
///
@@ -277,10 +303,10 @@ impl BlockSegmentPostings {
}
pub(crate) fn load_block(&mut self) {
let offset = self.skip_reader.byte_offset();
if self.block_is_loaded() {
return;
}
let offset = self.skip_reader.byte_offset();
match self.skip_reader.block_info() {
BlockInfo::BitPacked {
doc_num_bits,
@@ -495,4 +521,40 @@ mod tests {
assert_eq!(block_postings.doc(COMPRESSION_BLOCK_SIZE - 1), TERMINATED);
Ok(())
}
#[test]
fn test_reset_block_segment_postings() -> crate::Result<()> {
    let mut schema_builder = Schema::builder();
    let int_field = schema_builder.add_u64_field("id", INDEXED);
    let index = Index::create_in_ram(schema_builder.build());
    let mut index_writer = index.writer_for_tests()?;
    // Documents alternate between the values 0 and 1, which creates two
    // posting lists: one for the term 0, one for the term 1.
    for doc_ord in 0..6 {
        index_writer.add_document(doc!(int_field=> (doc_ord % 2) as u64))?;
    }
    index_writer.commit()?;
    let searcher = index.reader()?.searcher();
    let segment_reader = searcher.segment_reader(0);

    // Start by opening the posting list of the term 0.
    let mut block_segments = {
        let zero_term = Term::from_field_u64(int_field, 0u64);
        let inverted_index = segment_reader.inverted_index(int_field)?;
        let term_info = inverted_index.get_term_info(&zero_term)?.unwrap();
        inverted_index
            .read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)?
    };
    assert_eq!(block_segments.docs(), &[0, 2, 4]);

    // Then reset the very same object onto the posting list of the term 1.
    {
        let one_term = Term::from_field_u64(int_field, 1u64);
        let inverted_index = segment_reader.inverted_index(int_field)?;
        let term_info = inverted_index.get_term_info(&one_term)?.unwrap();
        inverted_index.reset_block_postings_from_terminfo(&term_info, &mut block_segments)?;
    }
    assert_eq!(block_segments.docs(), &[1, 3, 5]);
    Ok(())
}
}

View File

@@ -46,7 +46,7 @@ pub(crate) mod tests {
use super::{InvertedIndexSerializer, Postings};
use crate::docset::{DocSet, TERMINATED};
use crate::fieldnorm::FieldNormReader;
use crate::index::{Index, SegmentComponent, SegmentReader, TantivySegmentReader};
use crate::index::{Index, SegmentComponent, SegmentReader};
use crate::indexer::operation::AddOperation;
use crate::indexer::SegmentWriter;
use crate::query::Scorer;
@@ -258,12 +258,9 @@ pub(crate) mod tests {
segment_writer.finalize()?;
}
{
let segment_reader = TantivySegmentReader::open(&segment)?;
let segment_reader = SegmentReader::open(&segment)?;
{
let fieldnorm_reader = segment_reader
.fieldnorms_readers()
.get_field(text_field)?
.unwrap();
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field)?;
assert_eq!(fieldnorm_reader.fieldnorm(0), 8 + 5);
assert_eq!(fieldnorm_reader.fieldnorm(1), 2);
for i in 2..1000 {

View File

@@ -168,12 +168,20 @@ impl DocSet for SegmentPostings {
self.doc()
}
#[inline]
fn seek(&mut self, target: DocId) -> DocId {
debug_assert!(self.doc() <= target);
if self.doc() >= target {
return self.doc();
}
// As an optimization, if the block is already loaded, we can
// cheaply check the next doc.
self.cur = (self.cur + 1).min(COMPRESSION_BLOCK_SIZE - 1);
if self.doc() >= target {
return self.doc();
}
// Delegate block-local search to BlockSegmentPostings::seek, which returns
// the in-block index of the first doc >= target.
self.cur = self.block_cursor.seek(target);

View File

@@ -142,6 +142,23 @@ impl SkipReader {
skip_reader
}
/// Re-initializes this skip reader on `data`, for a posting list
/// containing `doc_freq` documents.
///
/// Block info is only read from `data` when `doc_freq` reaches
/// `COMPRESSION_BLOCK_SIZE`. Below that threshold, the reader is left on a
/// single `VInt` block holding all `doc_freq` documents.
pub fn reset(&mut self, data: OwnedBytes, doc_freq: u32) {
    let reaches_block_size = doc_freq >= COMPRESSION_BLOCK_SIZE as u32;
    self.last_doc_in_block = if reaches_block_size { 0 } else { TERMINATED };
    self.last_doc_in_previous_block = 0u32;
    self.owned_read = data;
    // Default block info; replaced by `read_block_info()` below when the
    // threshold is reached.
    self.block_info = BlockInfo::VInt { num_docs: doc_freq };
    self.byte_offset = 0;
    self.remaining_docs = doc_freq;
    self.position_offset = 0u64;
    if reaches_block_size {
        self.read_block_info();
    }
}
// Returns the block max score for this block if available.
//
// The block max score is available for all full bitpacked block,

View File

@@ -21,7 +21,7 @@ impl Query for AllQuery {
pub struct AllWeight;
impl Weight for AllWeight {
fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let all_scorer = AllScorer::new(reader.max_doc());
if boost != 1.0 {
Ok(Box::new(BoostScorer::new(all_scorer, boost)))
@@ -30,7 +30,7 @@ impl Weight for AllWeight {
}
}
fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result<Explanation> {
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
if doc >= reader.max_doc() {
return Err(does_not_match(doc));
}

View File

@@ -67,7 +67,7 @@ where
}
/// Returns the term infos that match the automaton
pub fn get_match_term_infos(&self, reader: &dyn SegmentReader) -> crate::Result<Vec<TermInfo>> {
pub fn get_match_term_infos(&self, reader: &SegmentReader) -> crate::Result<Vec<TermInfo>> {
let inverted_index = reader.inverted_index(self.field)?;
let term_dict = inverted_index.terms();
let mut term_stream = self.automaton_stream(term_dict)?;
@@ -84,7 +84,7 @@ where
A: Automaton + Send + Sync + 'static,
A::State: Clone,
{
fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let max_doc = reader.max_doc();
let mut doc_bitset = BitSet::with_max_value(max_doc);
let inverted_index = reader.inverted_index(self.field)?;
@@ -110,7 +110,7 @@ where
Ok(Box::new(const_scorer))
}
fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result<Explanation> {
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) == doc {
Ok(Explanation::new("AutomatonScorer", 1.0))

View File

@@ -205,7 +205,7 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
fn per_occur_scorers(
&self,
reader: &dyn SegmentReader,
reader: &SegmentReader,
boost: Score,
) -> crate::Result<HashMap<Occur, Vec<Box<dyn Scorer>>>> {
let mut per_occur_scorers: HashMap<Occur, Vec<Box<dyn Scorer>>> = HashMap::new();
@@ -221,7 +221,7 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
fn complex_scorer<TComplexScoreCombiner: ScoreCombiner>(
&self,
reader: &dyn SegmentReader,
reader: &SegmentReader,
boost: Score,
score_combiner_fn: impl Fn() -> TComplexScoreCombiner,
) -> crate::Result<SpecializedScorer> {
@@ -418,7 +418,7 @@ fn remove_and_count_all_and_empty_scorers(
}
impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombiner> {
fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let num_docs = reader.num_docs();
if self.weights.is_empty() {
Ok(Box::new(EmptyScorer))
@@ -442,7 +442,7 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
}
}
fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result<Explanation> {
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) != doc {
return Err(does_not_match(doc));
@@ -464,7 +464,7 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
fn for_each(
&self,
reader: &dyn SegmentReader,
reader: &SegmentReader,
callback: &mut dyn FnMut(DocId, Score),
) -> crate::Result<()> {
let scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn)?;
@@ -486,7 +486,7 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
fn for_each_no_score(
&self,
reader: &dyn SegmentReader,
reader: &SegmentReader,
callback: &mut dyn FnMut(&[DocId]),
) -> crate::Result<()> {
let scorer = self.complex_scorer(reader, 1.0, || DoNothingCombiner)?;
@@ -521,7 +521,7 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
fn for_each_pruning(
&self,
threshold: Score,
reader: &dyn SegmentReader,
reader: &SegmentReader,
callback: &mut dyn FnMut(DocId, Score) -> Score,
) -> crate::Result<()> {
let scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn)?;

View File

@@ -67,11 +67,11 @@ impl BoostWeight {
}
impl Weight for BoostWeight {
fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
self.weight.scorer(reader, boost * self.boost)
}
fn explain(&self, reader: &dyn SegmentReader, doc: u32) -> crate::Result<Explanation> {
fn explain(&self, reader: &SegmentReader, doc: u32) -> crate::Result<Explanation> {
let underlying_explanation = self.weight.explain(reader, doc)?;
let score = underlying_explanation.value() * self.boost;
let mut explanation =
@@ -80,7 +80,7 @@ impl Weight for BoostWeight {
Ok(explanation)
}
fn count(&self, reader: &dyn SegmentReader) -> crate::Result<u32> {
fn count(&self, reader: &SegmentReader) -> crate::Result<u32> {
self.weight.count(reader)
}
}

View File

@@ -63,12 +63,12 @@ impl ConstWeight {
}
impl Weight for ConstWeight {
fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let inner_scorer = self.weight.scorer(reader, boost)?;
Ok(Box::new(ConstScorer::new(inner_scorer, boost * self.score)))
}
fn explain(&self, reader: &dyn SegmentReader, doc: u32) -> crate::Result<Explanation> {
fn explain(&self, reader: &SegmentReader, doc: u32) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) != doc {
return Err(TantivyError::InvalidArgument(format!(
@@ -81,7 +81,7 @@ impl Weight for ConstWeight {
Ok(explanation)
}
fn count(&self, reader: &dyn SegmentReader) -> crate::Result<u32> {
fn count(&self, reader: &SegmentReader) -> crate::Result<u32> {
self.weight.count(reader)
}
}

View File

@@ -26,11 +26,11 @@ impl Query for EmptyQuery {
/// It is useful for tests and handling edge cases.
pub struct EmptyWeight;
impl Weight for EmptyWeight {
fn scorer(&self, _reader: &dyn SegmentReader, _boost: Score) -> crate::Result<Box<dyn Scorer>> {
fn scorer(&self, _reader: &SegmentReader, _boost: Score) -> crate::Result<Box<dyn Scorer>> {
Ok(Box::new(EmptyScorer))
}
fn explain(&self, _reader: &dyn SegmentReader, doc: DocId) -> crate::Result<Explanation> {
fn explain(&self, _reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
Err(does_not_match(doc))
}
}

View File

@@ -98,7 +98,7 @@ pub struct ExistsWeight {
}
impl Weight for ExistsWeight {
fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let fast_field_reader = reader.fast_fields();
let mut column_handles = fast_field_reader.dynamic_column_handles(&self.field_name)?;
if self.field_type == Type::Json && self.json_subpaths {
@@ -165,7 +165,7 @@ impl Weight for ExistsWeight {
Ok(Box::new(ConstScorer::new(docset, boost)))
}
fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result<Explanation> {
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) != doc {
return Err(does_not_match(doc));

View File

@@ -84,6 +84,14 @@ impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
docsets.sort_by_key(|docset| docset.cost());
go_to_first_doc(&mut docsets);
let left = docsets.remove(0);
debug_assert!({
let doc = left.doc();
if doc == TERMINATED {
true
} else {
docsets.iter().all(|docset| docset.doc() == doc)
}
});
let right = docsets.remove(0);
Intersection {
left,
@@ -112,30 +120,24 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
// Invariant:
// - candidate is always <= to the next document in the intersection.
// - candidate strictly increases at every occurrence of the loop.
let mut candidate = 0;
let mut candidate = left.doc() + 1;
// Termination: candidate strictly increases.
'outer: while candidate < TERMINATED {
// As we enter the loop, we should always have candidate < next_doc.
// This step always increases candidate.
//
// TODO: Think about which value would make sense here
// It depends on the DocSet implementation, when a seek would outweigh an advance.
candidate = if candidate > left.doc().wrapping_add(100) {
left.seek(candidate)
} else {
left.advance()
};
candidate = left.seek(candidate);
// Left is positioned on `candidate`.
debug_assert_eq!(left.doc(), candidate);
if let SeekDangerResult::SeekLowerBound(seek_lower_bound) = right.seek_danger(candidate)
{
// The max is technically useless but it makes the invariant
// easier to proofread.
debug_assert!(seek_lower_bound >= candidate);
debug_assert!(
seek_lower_bound == TERMINATED || seek_lower_bound > candidate,
"seek_lower_bound {seek_lower_bound} must be greater than candidate \
{candidate}"
);
candidate = seek_lower_bound;
continue;
}
@@ -148,7 +150,11 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
other.seek_danger(candidate)
{
// One of the scorers does not match, let's restart at the top of the loop.
debug_assert!(seek_lower_bound >= candidate);
debug_assert!(
seek_lower_bound == TERMINATED || seek_lower_bound > candidate,
"seek_lower_bound {seek_lower_bound} must be greater than candidate \
{candidate}"
);
candidate = seek_lower_bound;
continue 'outer;
}
@@ -238,9 +244,12 @@ mod tests {
use proptest::prelude::*;
use super::Intersection;
use crate::collector::Count;
use crate::docset::{DocSet, TERMINATED};
use crate::postings::tests::test_skip_against_unoptimized;
use crate::query::VecDocSet;
use crate::query::{QueryParser, VecDocSet};
use crate::schema::{Schema, TEXT};
use crate::Index;
#[test]
fn test_intersection() {
@@ -411,4 +420,29 @@ mod tests {
assert_eq!(intersection.doc(), TERMINATED);
}
}
#[test]
fn test_bug_2811_intersection_candidate_should_increase() {
    let mut schema_builder = Schema::builder();
    let text_field = schema_builder.add_text_field("text", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let mut writer = index.writer_for_tests().unwrap();
    // Only the first document contains both the term `hello` and the
    // phrase "happy tax".
    for text in ["hello happy tax", "hello", "hello", "happy tax"] {
        writer.add_document(doc!(text_field=>text)).unwrap();
    }
    writer.commit().unwrap();

    // Regression query for issue #2811: a required term intersected with a
    // required phrase.
    let query = QueryParser::for_index(&index, Vec::new())
        .parse_query(r#"+text:hello +text:"happy tax""#)
        .unwrap();
    let searcher = index.reader().unwrap().searcher();
    let num_hits = searcher.search(&*query, &Count).unwrap();
    assert_eq!(num_hits, 1);
}
}

View File

@@ -32,7 +32,7 @@ impl PhrasePrefixWeight {
}
}
fn fieldnorm_reader(&self, reader: &dyn SegmentReader) -> crate::Result<FieldNormReader> {
fn fieldnorm_reader(&self, reader: &SegmentReader) -> crate::Result<FieldNormReader> {
let field = self.phrase_terms[0].1.field();
if self.similarity_weight_opt.is_some() {
if let Some(fieldnorm_reader) = reader.fieldnorms_readers().get_field(field)? {
@@ -44,7 +44,7 @@ impl PhrasePrefixWeight {
pub(crate) fn phrase_scorer(
&self,
reader: &dyn SegmentReader,
reader: &SegmentReader,
boost: Score,
) -> crate::Result<Option<PhrasePrefixScorer<SegmentPostings>>> {
let similarity_weight_opt = self
@@ -114,7 +114,7 @@ impl PhrasePrefixWeight {
}
impl Weight for PhrasePrefixWeight {
fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
if let Some(scorer) = self.phrase_scorer(reader, boost)? {
Ok(Box::new(scorer))
} else {
@@ -122,7 +122,7 @@ impl Weight for PhrasePrefixWeight {
}
}
fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result<Explanation> {
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let scorer_opt = self.phrase_scorer(reader, 1.0)?;
if scorer_opt.is_none() {
return Err(does_not_match(doc));

View File

@@ -531,7 +531,12 @@ impl<TPostings: Postings> DocSet for PhraseScorer<TPostings> {
}
fn seek_danger(&mut self, target: DocId) -> SeekDangerResult {
debug_assert!(target >= self.doc());
debug_assert!(
target >= self.doc(),
"target ({}) should be greater than or equal to doc ({})",
target,
self.doc()
);
let seek_res = self.intersection_docset.seek_danger(target);
if seek_res != SeekDangerResult::Found {
return seek_res;

View File

@@ -29,7 +29,7 @@ impl PhraseWeight {
}
}
fn fieldnorm_reader(&self, reader: &dyn SegmentReader) -> crate::Result<FieldNormReader> {
fn fieldnorm_reader(&self, reader: &SegmentReader) -> crate::Result<FieldNormReader> {
let field = self.phrase_terms[0].1.field();
if self.similarity_weight_opt.is_some() {
if let Some(fieldnorm_reader) = reader.fieldnorms_readers().get_field(field)? {
@@ -41,7 +41,7 @@ impl PhraseWeight {
pub(crate) fn phrase_scorer(
&self,
reader: &dyn SegmentReader,
reader: &SegmentReader,
boost: Score,
) -> crate::Result<Option<PhraseScorer<SegmentPostings>>> {
let similarity_weight_opt = self
@@ -74,7 +74,7 @@ impl PhraseWeight {
}
impl Weight for PhraseWeight {
fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
if let Some(scorer) = self.phrase_scorer(reader, boost)? {
Ok(Box::new(scorer))
} else {
@@ -82,7 +82,7 @@ impl Weight for PhraseWeight {
}
}
fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result<Explanation> {
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let scorer_opt = self.phrase_scorer(reader, 1.0)?;
if scorer_opt.is_none() {
return Err(does_not_match(doc));

View File

@@ -45,7 +45,7 @@ impl RegexPhraseWeight {
}
}
fn fieldnorm_reader(&self, reader: &dyn SegmentReader) -> crate::Result<FieldNormReader> {
fn fieldnorm_reader(&self, reader: &SegmentReader) -> crate::Result<FieldNormReader> {
if self.similarity_weight_opt.is_some() {
if let Some(fieldnorm_reader) = reader.fieldnorms_readers().get_field(self.field)? {
return Ok(fieldnorm_reader);
@@ -56,7 +56,7 @@ impl RegexPhraseWeight {
pub(crate) fn phrase_scorer(
&self,
reader: &dyn SegmentReader,
reader: &SegmentReader,
boost: Score,
) -> crate::Result<Option<PhraseScorer<UnionType>>> {
let similarity_weight_opt = self
@@ -84,8 +84,7 @@ impl RegexPhraseWeight {
"Phrase query exceeded max expansions {num_terms}"
)));
}
let union =
Self::get_union_from_term_infos(&term_infos, reader, inverted_index.as_ref())?;
let union = Self::get_union_from_term_infos(&term_infos, reader, &inverted_index)?;
posting_lists.push((offset, union));
}
@@ -100,7 +99,7 @@ impl RegexPhraseWeight {
/// Add all docs of the term to the docset
fn add_to_bitset(
inverted_index: &dyn InvertedIndexReader,
inverted_index: &InvertedIndexReader,
term_info: &TermInfo,
doc_bitset: &mut BitSet,
) -> crate::Result<()> {
@@ -175,8 +174,8 @@ impl RegexPhraseWeight {
/// Use Roaring Bitmaps for sparse terms. The full bitvec is main memory consumer currently.
pub(crate) fn get_union_from_term_infos(
term_infos: &[TermInfo],
reader: &dyn SegmentReader,
inverted_index: &dyn InvertedIndexReader,
reader: &SegmentReader,
inverted_index: &InvertedIndexReader,
) -> crate::Result<UnionType> {
let max_doc = reader.max_doc();
@@ -270,7 +269,7 @@ impl RegexPhraseWeight {
}
impl Weight for RegexPhraseWeight {
fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
if let Some(scorer) = self.phrase_scorer(reader, boost)? {
Ok(Box::new(scorer))
} else {
@@ -278,7 +277,7 @@ impl Weight for RegexPhraseWeight {
}
}
fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result<Explanation> {
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let scorer_opt = self.phrase_scorer(reader, 1.0)?;
if scorer_opt.is_none() {
return Err(does_not_match(doc));

View File

@@ -146,7 +146,7 @@ pub trait Query: QueryClone + Send + Sync + downcast_rs::Downcast + fmt::Debug {
let weight = self.weight(EnableScoring::disabled_from_searcher(searcher))?;
let mut result = 0;
for reader in searcher.segment_readers() {
result += weight.count(reader.as_ref())? as usize;
result += weight.count(reader)? as usize;
}
Ok(result)
}

View File

@@ -212,7 +212,7 @@ impl InvertedIndexRangeWeight {
}
impl Weight for InvertedIndexRangeWeight {
fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let max_doc = reader.max_doc();
let mut doc_bitset = BitSet::with_max_value(max_doc);
@@ -245,7 +245,7 @@ impl Weight for InvertedIndexRangeWeight {
Ok(Box::new(ConstScorer::new(doc_bitset, boost)))
}
fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result<Explanation> {
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) != doc {
return Err(does_not_match(doc));
@@ -686,7 +686,7 @@ mod tests {
.weight(EnableScoring::disabled_from_schema(&schema))
.unwrap();
let range_scorer = range_weight
.scorer(searcher.segment_readers()[0].as_ref(), 1.0f32)
.scorer(&searcher.segment_readers()[0], 1.0f32)
.unwrap();
range_scorer
};

View File

@@ -52,7 +52,7 @@ impl FastFieldRangeWeight {
}
impl Weight for FastFieldRangeWeight {
fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
// Check if both bounds are Bound::Unbounded
if self.bounds.is_unbounded() {
return Ok(Box::new(AllScorer::new(reader.max_doc())));
@@ -219,7 +219,7 @@ impl Weight for FastFieldRangeWeight {
}
}
fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result<Explanation> {
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) != doc {
return Err(TantivyError::InvalidArgument(format!(
@@ -236,7 +236,7 @@ impl Weight for FastFieldRangeWeight {
///
/// Convert into fast field value space and search.
fn search_on_json_numerical_field(
reader: &dyn SegmentReader,
reader: &SegmentReader,
field_name: &str,
typ: Type,
bounds: BoundsRange<ValueBytes<Vec<u8>>>,

View File

@@ -105,6 +105,7 @@ impl DocSet for TermScorer {
#[inline]
fn seek(&mut self, target: DocId) -> DocId {
debug_assert!(target >= self.doc());
self.postings.seek(target)
}
@@ -263,9 +264,7 @@ mod tests {
let mut block_max_scores_b = vec![];
let mut docs = vec![];
{
let mut term_scorer = term_weight
.term_scorer_for_test(reader.as_ref(), 1.0)?
.unwrap();
let mut term_scorer = term_weight.term_scorer_for_test(reader, 1.0)?.unwrap();
while term_scorer.doc() != TERMINATED {
let mut score = term_scorer.score();
docs.push(term_scorer.doc());
@@ -279,9 +278,7 @@ mod tests {
}
}
{
let mut term_scorer = term_weight
.term_scorer_for_test(reader.as_ref(), 1.0)?
.unwrap();
let mut term_scorer = term_weight.term_scorer_for_test(reader, 1.0)?.unwrap();
for d in docs {
term_scorer.seek_block(d);
block_max_scores_b.push(term_scorer.block_max_score());

View File

@@ -34,11 +34,11 @@ impl TermOrEmptyOrAllScorer {
}
impl Weight for TermWeight {
fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
Ok(self.specialized_scorer(reader, boost)?.into_boxed_scorer())
}
fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result<Explanation> {
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
match self.specialized_scorer(reader, 1.0)? {
TermOrEmptyOrAllScorer::TermScorer(mut term_scorer) => {
if term_scorer.doc() > doc || term_scorer.seek(doc) != doc {
@@ -53,7 +53,7 @@ impl Weight for TermWeight {
}
}
fn count(&self, reader: &dyn SegmentReader) -> crate::Result<u32> {
fn count(&self, reader: &SegmentReader) -> crate::Result<u32> {
if let Some(alive_bitset) = reader.alive_bitset() {
Ok(self.scorer(reader, 1.0)?.count(alive_bitset))
} else {
@@ -68,7 +68,7 @@ impl Weight for TermWeight {
/// `DocSet` and push the scored documents to the collector.
fn for_each(
&self,
reader: &dyn SegmentReader,
reader: &SegmentReader,
callback: &mut dyn FnMut(DocId, Score),
) -> crate::Result<()> {
match self.specialized_scorer(reader, 1.0)? {
@@ -87,7 +87,7 @@ impl Weight for TermWeight {
/// `DocSet` and push the scored documents to the collector.
fn for_each_no_score(
&self,
reader: &dyn SegmentReader,
reader: &SegmentReader,
callback: &mut dyn FnMut(&[DocId]),
) -> crate::Result<()> {
match self.specialized_scorer(reader, 1.0)? {
@@ -118,7 +118,7 @@ impl Weight for TermWeight {
fn for_each_pruning(
&self,
threshold: Score,
reader: &dyn SegmentReader,
reader: &SegmentReader,
callback: &mut dyn FnMut(DocId, Score) -> Score,
) -> crate::Result<()> {
let specialized_scorer = self.specialized_scorer(reader, 1.0)?;
@@ -166,7 +166,7 @@ impl TermWeight {
#[cfg(test)]
pub(crate) fn term_scorer_for_test(
&self,
reader: &dyn SegmentReader,
reader: &SegmentReader,
boost: Score,
) -> crate::Result<Option<TermScorer>> {
let scorer = self.specialized_scorer(reader, boost)?;
@@ -178,7 +178,7 @@ impl TermWeight {
fn specialized_scorer(
&self,
reader: &dyn SegmentReader,
reader: &SegmentReader,
boost: Score,
) -> crate::Result<TermOrEmptyOrAllScorer> {
let field = self.term.field();
@@ -206,10 +206,7 @@ impl TermWeight {
)))
}
fn fieldnorm_reader(
&self,
segment_reader: &dyn SegmentReader,
) -> crate::Result<FieldNormReader> {
fn fieldnorm_reader(&self, segment_reader: &SegmentReader) -> crate::Result<FieldNormReader> {
if self.scoring_enabled {
if let Some(field_norm_reader) = segment_reader
.fieldnorms_readers()

View File

@@ -69,13 +69,13 @@ pub trait Weight: Send + Sync + 'static {
/// `boost` is a multiplier to apply to the score.
///
/// See [`Query`](crate::query::Query).
fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>>;
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>>;
/// Returns an [`Explanation`] for the given document.
fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result<Explanation>;
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation>;
/// Returns the number of documents within the given [`SegmentReader`].
fn count(&self, reader: &dyn SegmentReader) -> crate::Result<u32> {
fn count(&self, reader: &SegmentReader) -> crate::Result<u32> {
let mut scorer = self.scorer(reader, 1.0)?;
if let Some(alive_bitset) = reader.alive_bitset() {
Ok(scorer.count(alive_bitset))
@@ -88,7 +88,7 @@ pub trait Weight: Send + Sync + 'static {
/// `DocSet` and push the scored documents to the collector.
fn for_each(
&self,
reader: &dyn SegmentReader,
reader: &SegmentReader,
callback: &mut dyn FnMut(DocId, Score),
) -> crate::Result<()> {
let mut scorer = self.scorer(reader, 1.0)?;
@@ -100,7 +100,7 @@ pub trait Weight: Send + Sync + 'static {
/// `DocSet` and push the scored documents to the collector.
fn for_each_no_score(
&self,
reader: &dyn SegmentReader,
reader: &SegmentReader,
callback: &mut dyn FnMut(&[DocId]),
) -> crate::Result<()> {
let mut docset = self.scorer(reader, 1.0)?;
@@ -123,7 +123,7 @@ pub trait Weight: Send + Sync + 'static {
fn for_each_pruning(
&self,
threshold: Score,
reader: &dyn SegmentReader,
reader: &SegmentReader,
callback: &mut dyn FnMut(DocId, Score) -> Score,
) -> crate::Result<()> {
let mut scorer = self.scorer(reader, 1.0)?;

View File

@@ -10,7 +10,7 @@ use self::warming::WarmingState;
use crate::core::searcher::{SearcherGeneration, SearcherInner};
use crate::directory::{Directory, WatchCallback, WatchHandle, META_LOCK};
use crate::store::DOCSTORE_CACHE_CAPACITY;
use crate::{ArcSegmentReader, Index, Inventory, Searcher, TantivySegmentReader, TrackedObject};
use crate::{Index, Inventory, Searcher, SegmentReader, TrackedObject};
/// Defines when a new version of the index should be reloaded.
///
@@ -189,22 +189,19 @@ impl InnerIndexReader {
///
/// This function acquires a lock to prevent GC from removing files
/// as we are opening our index.
fn open_segment_readers(index: &Index) -> crate::Result<Vec<ArcSegmentReader>> {
fn open_segment_readers(index: &Index) -> crate::Result<Vec<SegmentReader>> {
// Prevents segment files from getting deleted while we are in the process of opening them
let _meta_lock = index.directory().acquire_lock(&META_LOCK)?;
let searchable_segments = index.searchable_segments()?;
let segment_readers = searchable_segments
.iter()
.map(|segment| {
TantivySegmentReader::open(segment)
.map(|reader| Arc::new(reader) as ArcSegmentReader)
})
.map(SegmentReader::open)
.collect::<crate::Result<_>>()?;
Ok(segment_readers)
}
fn track_segment_readers_in_inventory(
segment_readers: &[ArcSegmentReader],
segment_readers: &[SegmentReader],
searcher_generation_counter: &Arc<AtomicU64>,
searcher_generation_inventory: &Inventory<SearcherGeneration>,
) -> TrackedObject<SearcherGeneration> {

View File

@@ -210,11 +210,8 @@ mod tests {
index_writer.add_document(doc!(text=>"abc"))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let fieldnorm_opt = searcher
.segment_reader(0u32)
.fieldnorms_readers()
.get_field(text)?;
assert!(fieldnorm_opt.is_none());
let err = searcher.segment_reader(0u32).get_fieldnorms_reader(text);
assert!(matches!(err, Err(crate::TantivyError::SchemaError(_))));
Ok(())
}
}

View File

@@ -124,7 +124,6 @@ impl SegmentSpaceUsage {
FieldNorms => PerField(self.fieldnorms().clone()),
Terms => PerField(self.termdict().clone()),
SegmentComponent::Store => ComponentSpaceUsage::Store(self.store().clone()),
SegmentComponent::TempStore => ComponentSpaceUsage::Store(self.store().clone()),
Delete => Basic(self.deletes()),
}
}

View File

@@ -26,7 +26,7 @@
//! and should rely on either
//!
//! - at the segment level, the [`SegmentReader`'s `doc`
//! method](../trait.SegmentReader.html#method.doc)
//! method](../struct.SegmentReader.html#method.doc)
//! - at the index level, the [`Searcher::doc()`](crate::Searcher::doc) method
mod compressors;