Bug fix for reading postings when term frequencies are not requested (or were not indexed).

This commit is contained in:
Paul Masurel
2018-02-17 22:41:12 +09:00
parent 4640ab4e65
commit 20bede9462
7 changed files with 64 additions and 29 deletions

View File

@@ -5,6 +5,13 @@ mod stream;
pub use self::stream::CompressedIntStream;
pub const COMPRESSION_BLOCK_SIZE: usize = 128;

/// Returns the on-disk size, in bytes, of one bit-packed block:
/// a single leading byte holding `num_bits`, followed by
/// `COMPRESSION_BLOCK_SIZE` integers packed at `num_bits` bits each,
/// rounded up to a whole number of bytes.
pub(crate) fn compute_block_size(num_bits: u8) -> usize {
    let payload_bits = num_bits as usize * COMPRESSION_BLOCK_SIZE;
    // Ceiling division to bytes, plus the one-byte bit-width header.
    let payload_bytes = (payload_bits + 7) / 8;
    payload_bytes + 1
}
#[cfg(not(feature = "simdcompression"))]
mod pack {
mod compression_pack_nosimd;
@@ -112,7 +119,6 @@ impl VIntDecoder for BlockDecoder {
}
}
pub const COMPRESSION_BLOCK_SIZE: usize = 128;
#[cfg(test)]
pub mod tests {

View File

@@ -3,7 +3,7 @@ use common::bitpacker::{BitPacker, BitUnpacker};
use common::CountingWriter;
use std::cmp;
use std::io::Write;
use super::super::COMPRESSION_BLOCK_SIZE;
use super::super::{compute_block_size, COMPRESSION_BLOCK_SIZE};
const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1;
@@ -29,7 +29,9 @@ pub fn compress_sorted(vals: &mut [u32], output: &mut [u8], offset: u32) -> usiz
.write(*val as u64, num_bits, &mut counting_writer)
.unwrap();
}
counting_writer.written_bytes()
let compressed_size = counting_writer.written_bytes();
assert_eq!(compressed_size, compute_block_size(num_bits));
compressed_size
}
pub struct BlockEncoder {
@@ -117,7 +119,7 @@ impl BlockDecoder {
self.output[i] = val;
offset = val;
}
1 + (num_bits as usize * COMPRESSION_BLOCK_SIZE + 7) / 8
compute_block_size(num_bits)
};
self.output_len = COMPRESSION_BLOCK_SIZE;
consumed_size

View File

@@ -1,4 +1,4 @@
use super::super::COMPRESSION_BLOCK_SIZE;
use super::super::{compute_block_size, COMPRESSION_BLOCK_SIZE};
const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1;
@@ -51,11 +51,13 @@ impl BlockEncoder {
/// Compresses a full block of sorted `u32` values and returns the
/// compressed bytes, borrowed from this encoder's internal buffer.
///
/// `offset` is the value preceding the block — presumably the base for
/// delta-encoding relative to the previous block (confirm in
/// `compress_sorted`).
pub fn compress_block_sorted(&mut self, vals: &[u32], offset: u32) -> &[u8] {
    let compressed_size = compress_sorted(vals, &mut self.output, offset);
    // Sanity check: byte 0 of the output stores the bit width (num_bits),
    // so the written length must equal the size derived from it.
    assert_eq!(compressed_size, compute_block_size(self.output[0]));
    &self.output[..compressed_size]
}
/// Compresses a full block of `u32` values without any sortedness
/// assumption (no delta base), returning the compressed bytes borrowed
/// from this encoder's internal buffer.
pub fn compress_block_unsorted(&mut self, vals: &[u32]) -> &[u8] {
    let compressed_size = compress_unsorted(vals, &mut self.output);
    // Sanity check: byte 0 of the output stores the bit width (num_bits),
    // so the written length must equal the size derived from it.
    assert_eq!(compressed_size, compute_block_size(self.output[0]));
    &self.output[..compressed_size]
}
}

View File

@@ -8,6 +8,7 @@ use std::cmp;
use fastfield::DeleteBitSet;
use schema::Schema;
use compression::CompressedIntStream;
use postings::FreqReadingOption;
/// The inverted index reader is in charge of accessing
/// the inverted index associated to a specific field.
@@ -27,7 +28,7 @@ pub struct InvertedIndexReader {
postings_source: ReadOnlySource,
positions_source: ReadOnlySource,
delete_bitset: DeleteBitSet,
schema: Schema,
record_option: IndexRecordOption
}
impl InvertedIndexReader {
@@ -36,14 +37,14 @@ impl InvertedIndexReader {
postings_source: ReadOnlySource,
positions_source: ReadOnlySource,
delete_bitset: DeleteBitSet,
schema: Schema,
record_option: IndexRecordOption,
) -> InvertedIndexReader {
InvertedIndexReader {
termdict: TermDictionaryImpl::from_source(termdict_source),
postings_source,
positions_source,
delete_bitset,
schema,
record_option
}
}
@@ -86,15 +87,19 @@ impl InvertedIndexReader {
pub fn read_block_postings_from_terminfo(
&self,
term_info: &TermInfo,
option: IndexRecordOption,
requested_option: IndexRecordOption
) -> BlockSegmentPostings {
let offset = term_info.postings_offset as usize;
let postings_data = self.postings_source.slice_from(offset);
let has_freq = option.has_freq();
let freq_reading_option = match (self.record_option, requested_option) {
(IndexRecordOption::Basic, _) => FreqReadingOption::NoFreq,
(_, IndexRecordOption::Basic) => FreqReadingOption::SkipFreq,
(_, _) => FreqReadingOption::ReadFreq
};
BlockSegmentPostings::from_data(
term_info.doc_freq as usize,
SourceRead::from(postings_data),
has_freq,
freq_reading_option
)
}
@@ -135,11 +140,8 @@ impl InvertedIndexReader {
/// with `DocId`s and frequencies.
pub fn read_postings(&self, term: &Term, option: IndexRecordOption) -> Option<SegmentPostings> {
let field = term.field();
let field_entry = self.schema.get_field_entry(field);
let term_info = get!(self.get_term_info(term));
let maximum_option = get!(field_entry.field_type().get_index_record_option());
let best_effort_option = cmp::min(maximum_option, option);
Some(self.read_postings_from_terminfo(&term_info, best_effort_option))
Some(self.read_postings_from_terminfo(&term_info, option))
}
/// Returns the number of documents containing the term.

View File

@@ -37,6 +37,8 @@ use fastfield::MultiValueIntFastFieldReader;
/// The segment reader has a very low memory footprint,
/// as close to all of the memory data is mmapped.
///
///
/// TODO fix not decoding docfreq
#[derive(Clone)]
pub struct SegmentReader {
inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>,
@@ -223,9 +225,14 @@ impl SegmentReader {
return Arc::clone(inv_idx_reader);
}
let record_option = self.schema.get_field_entry(field).field_type()
.get_index_record_option()
.expect("Field does not seem indexed.");
let termdict_source: ReadOnlySource = self.termdict_composite
.open_read(field)
.expect("Index corrupted. Failed to open field term dictionary in composite file.");
.expect("Failed to open field term dictionary in composite file. Is the field indexed");
let postings_source = self.postings_composite
.open_read(field)
@@ -240,7 +247,7 @@ impl SegmentReader {
postings_source,
positions_source,
self.delete_bitset.clone(),
self.schema.clone(),
record_option
));
// by releasing the lock in between, we may end up opening the inverting index

View File

@@ -37,6 +37,12 @@ pub use common::HasLen;
pub(crate) type UnorderedTermId = usize;
/// Controls how the term-frequency block is handled when decoding postings.
pub(crate) enum FreqReadingOption {
    /// The segment was indexed without term frequencies:
    /// there is no freq data on disk to read or skip.
    NoFreq,
    /// Frequencies exist on disk but the caller did not request them:
    /// skip over the freq block without decompressing it.
    SkipFreq,
    /// Decompress the freq block so frequencies are available.
    ReadFreq
}
#[cfg(test)]
pub mod tests {

View File

@@ -4,9 +4,11 @@ use common::BitSet;
use postings::{DocSet, HasLen, Postings, SkipResult};
use std::cmp;
use fst::Streamer;
use compression::compute_block_size;
use fastfield::DeleteBitSet;
use std::cell::UnsafeCell;
use directory::{ReadOnlySource, SourceRead};
use postings::FreqReadingOption;
const EMPTY_POSITIONS: [u32; 0] = [0u32; 0];
@@ -299,7 +301,7 @@ impl Postings for SegmentPostings {
pub struct BlockSegmentPostings {
doc_decoder: BlockDecoder,
freq_decoder: BlockDecoder,
has_freq: bool,
freq_reading_option: FreqReadingOption,
doc_freq: usize,
doc_offset: DocId,
@@ -312,7 +314,7 @@ impl BlockSegmentPostings {
pub(crate) fn from_data(
doc_freq: usize,
data: SourceRead,
has_freq: bool,
freq_reading_option: FreqReadingOption,
) -> BlockSegmentPostings {
let num_bitpacked_blocks: usize = (doc_freq as usize) / COMPRESSION_BLOCK_SIZE;
let num_vint_docs = (doc_freq as usize) - COMPRESSION_BLOCK_SIZE * num_bitpacked_blocks;
@@ -321,7 +323,7 @@ impl BlockSegmentPostings {
num_vint_docs,
doc_decoder: BlockDecoder::new(),
freq_decoder: BlockDecoder::with_val(1),
has_freq,
freq_reading_option,
remaining_data: data,
doc_offset: 0,
doc_freq,
@@ -401,11 +403,16 @@ impl BlockSegmentPostings {
let num_consumed_bytes = self.doc_decoder
.uncompress_block_sorted(self.remaining_data.as_ref(), self.doc_offset);
self.remaining_data.advance(num_consumed_bytes);
if self.has_freq {
let num_consumed_bytes = self.freq_decoder
.uncompress_block_unsorted(self.remaining_data.as_ref());
self.remaining_data.advance(num_consumed_bytes);
match self.freq_reading_option {
FreqReadingOption::NoFreq => {}
FreqReadingOption::SkipFreq => {
let num_bytes_to_skip = compute_block_size(self.remaining_data.as_ref()[0]);
self.remaining_data.advance(num_bytes_to_skip);
}
FreqReadingOption::ReadFreq => {
let num_consumed_bytes = self.freq_decoder.uncompress_block_unsorted(self.remaining_data.as_ref());
self.remaining_data.advance(num_consumed_bytes);
}
}
// it will be used as the next offset.
self.doc_offset = self.doc_decoder.output(COMPRESSION_BLOCK_SIZE - 1);
@@ -418,9 +425,12 @@ impl BlockSegmentPostings {
self.num_vint_docs,
);
self.remaining_data.advance(num_compressed_bytes);
if self.has_freq {
self.freq_decoder
.uncompress_vint_unsorted(self.remaining_data.as_ref(), self.num_vint_docs);
match self.freq_reading_option {
FreqReadingOption::NoFreq | FreqReadingOption::SkipFreq => {}
FreqReadingOption::ReadFreq => {
self.freq_decoder
.uncompress_vint_unsorted(self.remaining_data.as_ref(), self.num_vint_docs);
}
}
self.num_vint_docs = 0;
true
@@ -437,7 +447,7 @@ impl BlockSegmentPostings {
doc_decoder: BlockDecoder::new(),
freq_decoder: BlockDecoder::with_val(1),
has_freq: false,
freq_reading_option: FreqReadingOption::NoFreq,
remaining_data: From::from(ReadOnlySource::empty()),
doc_offset: 0,