Bug fix for reading postings when term frequencies are not requested (or were not indexed).

This commit is contained in:
Paul Masurel
2018-02-17 22:41:12 +09:00
parent 4640ab4e65
commit 20bede9462
7 changed files with 64 additions and 29 deletions

View File

@@ -5,6 +5,13 @@ mod stream;
pub use self::stream::CompressedIntStream;
pub const COMPRESSION_BLOCK_SIZE: usize = 128;

/// Returns the on-disk size, in bytes, of one bit-packed block:
/// a single leading byte holding `num_bits`, followed by
/// `COMPRESSION_BLOCK_SIZE` integers packed at `num_bits` bits each,
/// rounded up to a whole number of bytes.
pub(crate) fn compute_block_size(num_bits: u8) -> usize {
    let payload_bits = num_bits as usize * COMPRESSION_BLOCK_SIZE;
    // Ceiling division to bytes, plus the one-byte bit-width header.
    let payload_bytes = (payload_bits + 7) / 8;
    payload_bytes + 1
}
#[cfg(not(feature = "simdcompression"))]
mod pack {
mod compression_pack_nosimd;
@@ -112,7 +119,6 @@ impl VIntDecoder for BlockDecoder {
}
}
pub const COMPRESSION_BLOCK_SIZE: usize = 128;
#[cfg(test)]
pub mod tests {

View File

@@ -3,7 +3,7 @@ use common::bitpacker::{BitPacker, BitUnpacker};
use common::CountingWriter;
use std::cmp;
use std::io::Write;
use super::super::COMPRESSION_BLOCK_SIZE;
use super::super::{compute_block_size, COMPRESSION_BLOCK_SIZE};
const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1;
@@ -29,7 +29,9 @@ pub fn compress_sorted(vals: &mut [u32], output: &mut [u8], offset: u32) -> usiz
.write(*val as u64, num_bits, &mut counting_writer)
.unwrap();
}
counting_writer.written_bytes()
let compressed_size = counting_writer.written_bytes();
assert_eq!(compressed_size, compute_block_size(num_bits));
compressed_size
}
pub struct BlockEncoder {
@@ -117,7 +119,7 @@ impl BlockDecoder {
self.output[i] = val;
offset = val;
}
1 + (num_bits as usize * COMPRESSION_BLOCK_SIZE + 7) / 8
compute_block_size(num_bits)
};
self.output_len = COMPRESSION_BLOCK_SIZE;
consumed_size

View File

@@ -1,4 +1,4 @@
use super::super::COMPRESSION_BLOCK_SIZE;
use super::super::{compute_block_size, COMPRESSION_BLOCK_SIZE};
const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1;
@@ -51,11 +51,13 @@ impl BlockEncoder {
/// Compresses a full block of sorted `u32` values and returns the
/// compressed bytes, borrowed from this encoder's internal buffer.
///
/// `offset` is the value preceding the block — presumably the base for
/// delta-encoding relative to the previous block (confirm in
/// `compress_sorted`).
pub fn compress_block_sorted(&mut self, vals: &[u32], offset: u32) -> &[u8] {
    let compressed_size = compress_sorted(vals, &mut self.output, offset);
    // Sanity check: byte 0 of the output stores the bit width (num_bits),
    // so the written length must equal the size derived from it.
    assert_eq!(compressed_size, compute_block_size(self.output[0]));
    &self.output[..compressed_size]
}
/// Compresses a full block of `u32` values without any sortedness
/// assumption (no delta base), returning the compressed bytes borrowed
/// from this encoder's internal buffer.
pub fn compress_block_unsorted(&mut self, vals: &[u32]) -> &[u8] {
    let compressed_size = compress_unsorted(vals, &mut self.output);
    // Sanity check: byte 0 of the output stores the bit width (num_bits),
    // so the written length must equal the size derived from it.
    assert_eq!(compressed_size, compute_block_size(self.output[0]));
    &self.output[..compressed_size]
}
}

View File

@@ -8,6 +8,7 @@ use std::cmp;
use fastfield::DeleteBitSet;
use schema::Schema;
use compression::CompressedIntStream;
use postings::FreqReadingOption;
/// The inverted index reader is in charge of accessing
/// the inverted index associated to a specific field.
@@ -27,7 +28,7 @@ pub struct InvertedIndexReader {
postings_source: ReadOnlySource,
positions_source: ReadOnlySource,
delete_bitset: DeleteBitSet,
schema: Schema,
record_option: IndexRecordOption
}
impl InvertedIndexReader {
@@ -36,14 +37,14 @@ impl InvertedIndexReader {
postings_source: ReadOnlySource,
positions_source: ReadOnlySource,
delete_bitset: DeleteBitSet,
schema: Schema,
record_option: IndexRecordOption,
) -> InvertedIndexReader {
InvertedIndexReader {
termdict: TermDictionaryImpl::from_source(termdict_source),
postings_source,
positions_source,
delete_bitset,
schema,
record_option
}
}
@@ -86,15 +87,19 @@ impl InvertedIndexReader {
pub fn read_block_postings_from_terminfo(
&self,
term_info: &TermInfo,
option: IndexRecordOption,
requested_option: IndexRecordOption
) -> BlockSegmentPostings {
let offset = term_info.postings_offset as usize;
let postings_data = self.postings_source.slice_from(offset);
let has_freq = option.has_freq();
let freq_reading_option = match (self.record_option, requested_option) {
(IndexRecordOption::Basic, _) => FreqReadingOption::NoFreq,
(_, IndexRecordOption::Basic) => FreqReadingOption::SkipFreq,
(_, _) => FreqReadingOption::ReadFreq
};
BlockSegmentPostings::from_data(
term_info.doc_freq as usize,
SourceRead::from(postings_data),
has_freq,
freq_reading_option
)
}
@@ -135,11 +140,8 @@ impl InvertedIndexReader {
/// with `DocId`s and frequencies.
pub fn read_postings(&self, term: &Term, option: IndexRecordOption) -> Option<SegmentPostings> {
let field = term.field();
let field_entry = self.schema.get_field_entry(field);
let term_info = get!(self.get_term_info(term));
let maximum_option = get!(field_entry.field_type().get_index_record_option());
let best_effort_option = cmp::min(maximum_option, option);
Some(self.read_postings_from_terminfo(&term_info, best_effort_option))
Some(self.read_postings_from_terminfo(&term_info, option))
}
/// Returns the number of documents containing the term.

View File

@@ -37,6 +37,8 @@ use fastfield::MultiValueIntFastFieldReader;
/// The segment reader has a very low memory footprint,
/// as close to all of the memory data is mmapped.
///
///
/// TODO fix not decoding docfreq
#[derive(Clone)]
pub struct SegmentReader {
inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>,
@@ -223,9 +225,14 @@ impl SegmentReader {
return Arc::clone(inv_idx_reader);
}
let record_option = self.schema.get_field_entry(field).field_type()
.get_index_record_option()
.expect("Field does not seem indexed.");
let termdict_source: ReadOnlySource = self.termdict_composite
.open_read(field)
.expect("Index corrupted. Failed to open field term dictionary in composite file.");
.expect("Failed to open field term dictionary in composite file. Is the field indexed");
let postings_source = self.postings_composite
.open_read(field)
@@ -240,7 +247,7 @@ impl SegmentReader {
postings_source,
positions_source,
self.delete_bitset.clone(),
self.schema.clone(),
record_option
));
// by releasing the lock in between, we may end up opening the inverting index

View File

@@ -37,6 +37,12 @@ pub use common::HasLen;
pub(crate) type UnorderedTermId = usize;
/// Controls how the term-frequency block is handled when decoding postings.
pub(crate) enum FreqReadingOption {
    /// The segment was indexed without term frequencies:
    /// there is no freq data on disk to read or skip.
    NoFreq,
    /// Frequencies exist on disk but the caller did not request them:
    /// skip over the freq block without decompressing it.
    SkipFreq,
    /// Decompress the freq block so frequencies are available.
    ReadFreq
}
#[cfg(test)]
pub mod tests {

View File

@@ -4,9 +4,11 @@ use common::BitSet;
use postings::{DocSet, HasLen, Postings, SkipResult};
use std::cmp;
use fst::Streamer;
use compression::compute_block_size;
use fastfield::DeleteBitSet;
use std::cell::UnsafeCell;
use directory::{ReadOnlySource, SourceRead};
use postings::FreqReadingOption;
const EMPTY_POSITIONS: [u32; 0] = [0u32; 0];
@@ -299,7 +301,7 @@ impl Postings for SegmentPostings {
pub struct BlockSegmentPostings {
doc_decoder: BlockDecoder,
freq_decoder: BlockDecoder,
has_freq: bool,
freq_reading_option: FreqReadingOption,
doc_freq: usize,
doc_offset: DocId,
@@ -312,7 +314,7 @@ impl BlockSegmentPostings {
pub(crate) fn from_data(
doc_freq: usize,
data: SourceRead,
has_freq: bool,
freq_reading_option: FreqReadingOption,
) -> BlockSegmentPostings {
let num_bitpacked_blocks: usize = (doc_freq as usize) / COMPRESSION_BLOCK_SIZE;
let num_vint_docs = (doc_freq as usize) - COMPRESSION_BLOCK_SIZE * num_bitpacked_blocks;
@@ -321,7 +323,7 @@ impl BlockSegmentPostings {
num_vint_docs,
doc_decoder: BlockDecoder::new(),
freq_decoder: BlockDecoder::with_val(1),
has_freq,
freq_reading_option,
remaining_data: data,
doc_offset: 0,
doc_freq,
@@ -401,11 +403,16 @@ impl BlockSegmentPostings {
let num_consumed_bytes = self.doc_decoder
.uncompress_block_sorted(self.remaining_data.as_ref(), self.doc_offset);
self.remaining_data.advance(num_consumed_bytes);
if self.has_freq {
let num_consumed_bytes = self.freq_decoder
.uncompress_block_unsorted(self.remaining_data.as_ref());
self.remaining_data.advance(num_consumed_bytes);
match self.freq_reading_option {
FreqReadingOption::NoFreq => {}
FreqReadingOption::SkipFreq => {
let num_bytes_to_skip = compute_block_size(self.remaining_data.as_ref()[0]);
self.remaining_data.advance(num_bytes_to_skip);
}
FreqReadingOption::ReadFreq => {
let num_consumed_bytes = self.freq_decoder.uncompress_block_unsorted(self.remaining_data.as_ref());
self.remaining_data.advance(num_consumed_bytes);
}
}
// it will be used as the next offset.
self.doc_offset = self.doc_decoder.output(COMPRESSION_BLOCK_SIZE - 1);
@@ -418,9 +425,12 @@ impl BlockSegmentPostings {
self.num_vint_docs,
);
self.remaining_data.advance(num_compressed_bytes);
if self.has_freq {
self.freq_decoder
.uncompress_vint_unsorted(self.remaining_data.as_ref(), self.num_vint_docs);
match self.freq_reading_option {
FreqReadingOption::NoFreq | FreqReadingOption::SkipFreq => {}
FreqReadingOption::ReadFreq => {
self.freq_decoder
.uncompress_vint_unsorted(self.remaining_data.as_ref(), self.num_vint_docs);
}
}
self.num_vint_docs = 0;
true
@@ -437,7 +447,7 @@ impl BlockSegmentPostings {
doc_decoder: BlockDecoder::new(),
freq_decoder: BlockDecoder::with_val(1),
has_freq: false,
freq_reading_option: FreqReadingOption::NoFreq,
remaining_data: From::from(ReadOnlySource::empty()),
doc_offset: 0,