From a5dc888cb9ecc4c239ccab51407dbb1dae324b94 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 6 Jan 2026 14:33:38 +0100 Subject: [PATCH] Removed BlockNotLoaded --- src/index/inverted_index_reader.rs | 25 ++++++------ src/postings/block_segment_postings.rs | 56 +++++++++----------------- src/postings/mod.rs | 1 - src/postings/segment_postings.rs | 21 +++++----- src/query/automaton_weight.rs | 10 +++-- src/query/range_query/range_query.rs | 7 +++- 6 files changed, 53 insertions(+), 67 deletions(-) diff --git a/src/index/inverted_index_reader.rs b/src/index/inverted_index_reader.rs index 3ce4b555f..332f3cf01 100644 --- a/src/index/inverted_index_reader.rs +++ b/src/index/inverted_index_reader.rs @@ -11,9 +11,7 @@ use tantivy_fst::automaton::{AlwaysMatch, Automaton}; use crate::directory::FileSlice; use crate::positions::PositionReader; -use crate::postings::{ - BlockSegmentPostings, BlockSegmentPostingsNotLoaded, SegmentPostings, TermInfo, -}; +use crate::postings::{BlockSegmentPostings, SegmentPostings, TermInfo}; use crate::schema::{IndexRecordOption, Term, Type}; use crate::termdict::TermDictionary; use crate::DocId; @@ -207,11 +205,12 @@ impl InvertedIndexReader { /// This method is for an advanced usage only. /// /// Most users should prefer using [`Self::read_postings()`] instead. - pub(crate) fn read_block_postings_from_terminfo_not_loaded( + pub(crate) fn read_block_postings_from_terminfo_with_seek( &self, term_info: &TermInfo, requested_option: IndexRecordOption, - ) -> io::Result { + seek_doc: DocId, + ) -> io::Result<(BlockSegmentPostings, usize)> { let postings_data = self .postings_file_slice .slice(term_info.postings_range.clone()); @@ -220,6 +219,7 @@ impl InvertedIndexReader { postings_data, self.record_option, requested_option, + seek_doc, ) } @@ -232,10 +232,9 @@ impl InvertedIndexReader { term_info: &TermInfo, requested_option: IndexRecordOption, ) -> io::Result { - let block_segment_postings_not_loaded = self - .read_block_postings_from_terminfo_not_loaded(term_info, requested_option)? - .load_at_start(); - Ok(block_segment_postings_not_loaded) + let (block_segment_postings, _) = + self.read_block_postings_from_terminfo_with_seek(term_info, requested_option, 0)?; + Ok(block_segment_postings) } /// Returns a posting object given a `term_info`. @@ -248,8 +247,8 @@ impl InvertedIndexReader { record_option: IndexRecordOption, seek_doc: DocId, ) -> io::Result { - let block_segment_postings_not_loaded = - self.read_block_postings_from_terminfo_not_loaded(term_info, record_option)?; + let (block_segment_postings, position_within_block) = + self.read_block_postings_from_terminfo_with_seek(term_info, record_option, seek_doc)?; let position_reader = { if record_option.has_positions() { let positions_data = self @@ -262,9 +261,9 @@ impl InvertedIndexReader { } }; Ok(SegmentPostings::from_block_postings( - block_segment_postings_not_loaded, + block_segment_postings, position_reader, - seek_doc, + position_within_block, )) } diff --git a/src/postings/block_segment_postings.rs b/src/postings/block_segment_postings.rs index 6a91d4c36..88d4d7663 100644 --- a/src/postings/block_segment_postings.rs +++ b/src/postings/block_segment_postings.rs @@ -87,31 +87,6 @@ fn split_into_skips_and_postings( Ok((Some(skip_data), postings_data)) } -/// A block segment postings for which the first block has not been loaded yet. -/// -/// You can either call `load_at_start` to load it its first block, -/// or skip a few blocks by calling `seek_and_load`. -pub(crate) struct BlockSegmentPostingsNotLoaded(BlockSegmentPostings); - -impl BlockSegmentPostingsNotLoaded { - /// Seek into the block segment postings directly, possibly avoiding loading its first block. - pub fn seek_and_load(self, seek_doc: DocId) -> (BlockSegmentPostings, usize) { - let BlockSegmentPostingsNotLoaded(mut block_segment_postings) = self; - let inner_pos = if seek_doc == 0 { - block_segment_postings.load_block(); - 0 - } else { - block_segment_postings.seek(seek_doc) - }; - (block_segment_postings, inner_pos) - } - - /// Load the first block of segment postings. - pub fn load_at_start(self) -> BlockSegmentPostings { - self.seek_and_load(0u32).0 - } -} - impl BlockSegmentPostings { /// Opens a `BlockSegmentPostings`. /// `doc_freq` is the number of documents in the posting list. @@ -124,7 +99,8 @@ impl BlockSegmentPostings { data: FileSlice, mut record_option: IndexRecordOption, requested_option: IndexRecordOption, - ) -> io::Result { + seek_doc: DocId, + ) -> io::Result<(BlockSegmentPostings, usize)> { let bytes = data.read_bytes()?; let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, bytes)?; let skip_reader = match skip_data_opt { @@ -150,7 +126,7 @@ impl BlockSegmentPostings { (_, _) => FreqReadingOption::ReadFreq, }; - Ok(BlockSegmentPostingsNotLoaded(BlockSegmentPostings { + let mut block_segment_postings: BlockSegmentPostings = BlockSegmentPostings { doc_decoder: BlockDecoder::with_val(TERMINATED), block_loaded: false, freq_decoder: BlockDecoder::with_val(1), @@ -159,7 +135,14 @@ impl BlockSegmentPostings { doc_freq, data: postings_data, skip_reader, - })) + }; + let inner_pos = if seek_doc == 0 { + block_segment_postings.load_block(); + 0 + } else { + block_segment_postings.seek(seek_doc) + }; + Ok((block_segment_postings, inner_pos)) } /// Returns the block_max_score for the current block. @@ -413,7 +396,7 @@ mod tests { use crate::index::Index; use crate::postings::compression::COMPRESSION_BLOCK_SIZE; use crate::postings::postings::Postings; - use crate::postings::{BlockSegmentPostingsNotLoaded, SegmentPostings}; + use crate::postings::SegmentPostings; use crate::schema::{IndexRecordOption, Schema, Term, INDEXED}; use crate::DocId; @@ -452,8 +435,7 @@ mod tests { #[test] fn test_block_segment_postings() -> crate::Result<()> { - let mut block_segments = - build_block_postings(&(0..100_000).collect::>())?.load_at_start(); + let mut block_segments = build_block_postings(&(0..100_000).collect::>())?; let mut offset: u32 = 0u32; // checking that the `doc_freq` is correct assert_eq!(block_segments.doc_freq(), 100_000); @@ -487,7 +469,7 @@ mod tests { assert_eq!(docset.advance(), TERMINATED); } { - let block_segments = build_block_postings(&doc_ids).unwrap(); + let block_segments = build_block_postings(&doc_ids)?; let mut docset = SegmentPostings::from_block_postings(block_segments, None, 0); assert_eq!(docset.seek(129), 129); assert_eq!(docset.doc(), 129); @@ -505,7 +487,7 @@ mod tests { Ok(()) } - fn build_block_postings(docs: &[DocId]) -> crate::Result { + fn build_block_postings(docs: &[DocId]) -> crate::Result { let mut schema_builder = Schema::builder(); let int_field = schema_builder.add_u64_field("id", INDEXED); let schema = schema_builder.build(); @@ -525,9 +507,9 @@ mod tests { let inverted_index = segment_reader.inverted_index(int_field).unwrap(); let term = Term::from_field_u64(int_field, 0u64); let term_info = inverted_index.get_term_info(&term)?.unwrap(); - let block_postings_not_loaded = inverted_index - .read_block_postings_from_terminfo_not_loaded(&term_info, IndexRecordOption::Basic)?; - Ok(block_postings_not_loaded) + let block_postings = inverted_index + .read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)?; + Ok(block_postings) } #[test] @@ -536,7 +518,7 @@ mod tests { for i in 0..1300 { docs.push((i * i / 100) + i); } - let mut block_postings = build_block_postings(&docs[..])?.load_at_start(); + let mut block_postings = build_block_postings(&docs[..])?; for i in &[0, 424, 10000] { block_postings.seek(*i); let docs = block_postings.docs(); diff --git a/src/postings/mod.rs b/src/postings/mod.rs index e5dd49d1f..efc0e069d 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -22,7 +22,6 @@ pub(crate) use loaded_postings::LoadedPostings; pub(crate) use stacker::compute_table_memory_size; pub use self::block_segment_postings::BlockSegmentPostings; -pub(crate) use self::block_segment_postings::BlockSegmentPostingsNotLoaded; pub(crate) use self::indexing_context::IndexingContext; pub(crate) use self::per_field_postings_writer::PerFieldPostingsWriter; pub use self::postings::Postings; diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 40eaeab4f..7941f33a8 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -4,7 +4,7 @@ use crate::docset::DocSet; use crate::fastfield::AliveBitSet; use crate::positions::PositionReader; use crate::postings::compression::COMPRESSION_BLOCK_SIZE; -use crate::postings::{BlockSegmentPostings, BlockSegmentPostingsNotLoaded, Postings}; +use crate::postings::{BlockSegmentPostings, Postings}; use crate::{DocId, TERMINATED}; /// `SegmentPostings` represents the inverted list or postings associated with @@ -79,14 +79,15 @@ impl SegmentPostings { .close_term(docs.len() as u32) .expect("In memory Serialization should never fail."); } - let block_segment_postings = BlockSegmentPostings::open( + let (block_segment_postings, position_within_block) = BlockSegmentPostings::open( docs.len() as u32, FileSlice::from(buffer), IndexRecordOption::Basic, IndexRecordOption::Basic, + 0u32, ) .unwrap(); - SegmentPostings::from_block_postings(block_segment_postings, None, 0) + SegmentPostings::from_block_postings(block_segment_postings, None, position_within_block) } /// Helper functions to create `SegmentPostings` for tests. @@ -127,14 +128,15 @@ impl SegmentPostings { postings_serializer .close_term(doc_and_tfs.len() as u32) .unwrap(); - let block_segment_postings = BlockSegmentPostings::open( + let (block_segment_postings, position_within_block) = BlockSegmentPostings::open( doc_and_tfs.len() as u32, FileSlice::from(buffer), IndexRecordOption::WithFreqs, IndexRecordOption::WithFreqs, + 0u32, ) .unwrap(); - SegmentPostings::from_block_postings(block_segment_postings, None, 0) + SegmentPostings::from_block_postings(block_segment_postings, None, position_within_block) } /// Creates a Segment Postings from a @@ -142,14 +144,13 @@ impl SegmentPostings { /// - a position reader /// - a target document to seek to pub(crate) fn from_block_postings( - segment_block_postings: BlockSegmentPostingsNotLoaded, + segment_block_postings: BlockSegmentPostings, position_reader: Option, - seek_doc: DocId, + position_within_block: usize, ) -> SegmentPostings { - let (block_cursor, cur) = segment_block_postings.seek_and_load(seek_doc); SegmentPostings { - block_cursor, - cur, + block_cursor: segment_block_postings, + cur: position_within_block, position_reader, } } diff --git a/src/query/automaton_weight.rs b/src/query/automaton_weight.rs index 1763abe78..04d5c219c 100644 --- a/src/query/automaton_weight.rs +++ b/src/query/automaton_weight.rs @@ -97,10 +97,12 @@ where let mut term_stream = self.automaton_stream(term_dict)?; while term_stream.advance() { let term_info = term_stream.value(); - let mut block_segment_postings = inverted_index - .read_block_postings_from_terminfo_not_loaded(term_info, IndexRecordOption::Basic)? - .seek_and_load(seek_doc) - .0; + let (mut block_segment_postings, _) = inverted_index + .read_block_postings_from_terminfo_with_seek( + term_info, + IndexRecordOption::Basic, + seek_doc, + )?; loop { let docs = block_segment_postings.docs(); if docs.is_empty() { diff --git a/src/query/range_query/range_query.rs b/src/query/range_query/range_query.rs index ba80c8398..10a44618f 100644 --- a/src/query/range_query/range_query.rs +++ b/src/query/range_query/range_query.rs @@ -234,8 +234,11 @@ impl Weight for InvertedIndexRangeWeight { processed_count += 1; let term_info = term_range.value(); let mut block_segment_postings = inverted_index - .read_block_postings_from_terminfo_not_loaded(term_info, IndexRecordOption::Basic)? - .seek_and_load(seek_doc) + .read_block_postings_from_terminfo_with_seek( + term_info, + IndexRecordOption::Basic, + seek_doc, + )? .0; loop { let docs = block_segment_postings.docs();