From d8ea083177e3ad8a5a7a1150655c8669f0285b2b Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 22 Feb 2017 18:38:58 +0900 Subject: [PATCH] Added block iterator for segment postings. --- src/core/segment_reader.rs | 30 ++++++++++-------- src/postings/mod.rs | 2 +- src/postings/segment_postings.rs | 52 ++++++++++++++++++-------------- 3 files changed, 47 insertions(+), 37 deletions(-) diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 3860da881..22a6998c6 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -15,8 +15,7 @@ use std::fmt; use rustc_serialize::json; use core::SegmentInfo; use schema::Field; -use postings::SegmentPostingsOption; -use postings::SegmentPostings; +use postings::{SegmentPostings, BlockSegmentPostings, SegmentPostingsOption}; use fastfield::{U32FastFieldsReader, U32FastFieldReader}; use schema::Schema; use schema::FieldType; @@ -165,16 +164,7 @@ impl SegmentReader { } - /// Returns the segment postings associated with the term, and with the given option, - /// or `None` if the term has never been encounterred and indexed. - /// - /// If the field was not indexed with the indexing options that cover - /// the requested options, the returned `SegmentPostings` the method does not fail - /// and returns a `SegmentPostings` with as much information as possible. - /// - /// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a `TextIndexingOptions` - /// that does not index position will return a `SegmentPostings` with `DocId`s and frequencies. 
- pub fn read_postings(&self, term: &Term, option: SegmentPostingsOption) -> Option { + pub fn read_block_postings(&self, term: &Term, option: SegmentPostingsOption) -> Option { let field = term.field(); let field_entry = self.schema.get_field_entry(field); let term_info = get!(self.get_term_info(&term)); @@ -214,7 +204,21 @@ impl SegmentReader { FreqHandler::new_without_freq() } }; - Some(SegmentPostings::from_data(term_info.doc_freq, postings_data, freq_handler)) + Some(BlockSegmentPostings::from_data(term_info.doc_freq as usize, postings_data, freq_handler)) + } + + /// Returns the segment postings associated with the term, and with the given option, + /// or `None` if the term has never been encountered and indexed. + /// + /// If the field was not indexed with the indexing options that cover + /// the requested options, the method does not fail + /// and returns a `SegmentPostings` with as much information as possible. + /// + /// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a `TextIndexingOptions` + /// that does not index positions will return a `SegmentPostings` with `DocId`s and frequencies. + pub fn read_postings(&self, term: &Term, option: SegmentPostingsOption) -> Option { + self.read_block_postings(term, option) + .map(SegmentPostings::from_block_postings) } /// Returns the posting list associated with a term. 
diff --git a/src/postings/mod.rs b/src/postings/mod.rs index acf1db532..bf160bb52 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -30,7 +30,7 @@ pub use self::postings::Postings; #[cfg(test)] pub use self::vec_postings::VecPostings; pub use self::chained_postings::ChainedPostings; -pub use self::segment_postings::SegmentPostings; +pub use self::segment_postings::{SegmentPostings, BlockSegmentPostings}; pub use self::intersection::IntersectionDocSet; pub use self::freq_handler::FreqHandler; diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index fccb31170..1be57ed2e 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -7,17 +7,32 @@ use std::num::Wrapping; const EMPTY_DATA: [u8; 0] = [0u8; 0]; -struct SegmentPostingsBlockCursor<'a> { +pub struct BlockSegmentPostings<'a> { num_binpacked_blocks: usize, num_vint_docs: usize, block_decoder: BlockDecoder, freq_handler: FreqHandler, remaining_data: &'a [u8], doc_offset: DocId, + len: usize, } -impl<'a> SegmentPostingsBlockCursor<'a> { - +impl<'a> BlockSegmentPostings<'a> { + + pub fn from_data(len: usize, data: &'a [u8], freq_handler: FreqHandler) -> BlockSegmentPostings<'a> { + let num_binpacked_blocks: usize = (len as usize) / NUM_DOCS_PER_BLOCK; + let num_vint_docs = (len as usize) - NUM_DOCS_PER_BLOCK * num_binpacked_blocks; + BlockSegmentPostings { + num_binpacked_blocks: num_binpacked_blocks, + num_vint_docs: num_vint_docs, + block_decoder: BlockDecoder::new(), + freq_handler: freq_handler, + remaining_data: data, + doc_offset: 0, + len: len, + } + } + fn docs(&self) -> &[DocId] { self.block_decoder.output_array() } @@ -32,31 +47,32 @@ impl<'a> SegmentPostingsBlockCursor<'a> { self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data); self.doc_offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1); self.num_binpacked_blocks -= 1; - return true; + true } else { if self.num_vint_docs > 0 { self.remaining_data = 
self.block_decoder.uncompress_vint_sorted(self.remaining_data, self.doc_offset, self.num_vint_docs); self.freq_handler.read_freq_vint(self.remaining_data, self.num_vint_docs); self.num_vint_docs = 0; - return true; + true } else { - return false; + false } } } /// Returns an empty segment postings object - pub fn empty() -> SegmentPostingsBlockCursor<'static> { - SegmentPostingsBlockCursor { + pub fn empty() -> BlockSegmentPostings<'static> { + BlockSegmentPostings { num_binpacked_blocks: 0, num_vint_docs: 0, block_decoder: BlockDecoder::new(), freq_handler: FreqHandler::new_without_freq(), remaining_data: &EMPTY_DATA, doc_offset: 0, + len: 0, } } @@ -72,7 +88,7 @@ pub struct SegmentPostings<'a> { len: usize, // doc_offset: usize, cur: Wrapping, - block_cursor: SegmentPostingsBlockCursor<'a>, + block_cursor: BlockSegmentPostings<'a>, cur_block_len: usize } @@ -84,20 +100,10 @@ impl<'a> SegmentPostings<'a> { /// * `data` - data array. The complete data is not necessarily used. /// * `freq_handler` - the freq handler is in charge of decoding /// frequencies and/or positions - pub fn from_data(len: u32, data: &'a [u8], freq_handler: FreqHandler) -> SegmentPostings<'a> { - let num_binpacked_blocks: usize = (len as usize) / NUM_DOCS_PER_BLOCK; - let num_vint_docs = (len as usize) - NUM_DOCS_PER_BLOCK * num_binpacked_blocks; - let block_cursor = SegmentPostingsBlockCursor { - num_binpacked_blocks: num_binpacked_blocks, - num_vint_docs: num_vint_docs, - block_decoder: BlockDecoder::new(), - freq_handler: freq_handler, - remaining_data: data, - doc_offset: 0, - }; + pub fn from_block_postings(segment_block_postings: BlockSegmentPostings<'a>) -> SegmentPostings<'a> { SegmentPostings { - len: len as usize, - block_cursor: block_cursor, + len: segment_block_postings.len, + block_cursor: segment_block_postings, cur: Wrapping(usize::max_value()), cur_block_len: 0, } @@ -109,7 +115,7 @@ impl<'a> SegmentPostings<'a> { /// Returns an empty segment postings object pub fn empty() -> 
SegmentPostings<'static> { - let empty_block_cursor = SegmentPostingsBlockCursor::empty(); + let empty_block_cursor = BlockSegmentPostings::empty(); SegmentPostings { len: 0, block_cursor: empty_block_cursor,