mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-06 09:12:55 +00:00
Bugfix when requesting no termfreq.
This commit is contained in:
@@ -5,6 +5,13 @@ mod stream;
|
||||
|
||||
pub use self::stream::CompressedIntStream;
|
||||
|
||||
|
||||
pub const COMPRESSION_BLOCK_SIZE: usize = 128;
|
||||
|
||||
pub(crate) fn compute_block_size(num_bits: u8) -> usize {
|
||||
1 + (num_bits as usize * COMPRESSION_BLOCK_SIZE + 7) / 8
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "simdcompression"))]
|
||||
mod pack {
|
||||
mod compression_pack_nosimd;
|
||||
@@ -112,7 +119,6 @@ impl VIntDecoder for BlockDecoder {
|
||||
}
|
||||
}
|
||||
|
||||
pub const COMPRESSION_BLOCK_SIZE: usize = 128;
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
|
||||
@@ -3,7 +3,7 @@ use common::bitpacker::{BitPacker, BitUnpacker};
|
||||
use common::CountingWriter;
|
||||
use std::cmp;
|
||||
use std::io::Write;
|
||||
use super::super::COMPRESSION_BLOCK_SIZE;
|
||||
use super::super::{compute_block_size, COMPRESSION_BLOCK_SIZE};
|
||||
|
||||
const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1;
|
||||
|
||||
@@ -29,7 +29,9 @@ pub fn compress_sorted(vals: &mut [u32], output: &mut [u8], offset: u32) -> usiz
|
||||
.write(*val as u64, num_bits, &mut counting_writer)
|
||||
.unwrap();
|
||||
}
|
||||
counting_writer.written_bytes()
|
||||
let compressed_size = counting_writer.written_bytes();
|
||||
assert_eq!(compressed_size, compute_block_size(num_bits));
|
||||
compressed_size
|
||||
}
|
||||
|
||||
pub struct BlockEncoder {
|
||||
@@ -117,7 +119,7 @@ impl BlockDecoder {
|
||||
self.output[i] = val;
|
||||
offset = val;
|
||||
}
|
||||
1 + (num_bits as usize * COMPRESSION_BLOCK_SIZE + 7) / 8
|
||||
compute_block_size(num_bits)
|
||||
};
|
||||
self.output_len = COMPRESSION_BLOCK_SIZE;
|
||||
consumed_size
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use super::super::COMPRESSION_BLOCK_SIZE;
|
||||
use super::super::{compute_block_size, COMPRESSION_BLOCK_SIZE};
|
||||
|
||||
const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1;
|
||||
|
||||
@@ -51,11 +51,13 @@ impl BlockEncoder {
|
||||
|
||||
pub fn compress_block_sorted(&mut self, vals: &[u32], offset: u32) -> &[u8] {
|
||||
let compressed_size = compress_sorted(vals, &mut self.output, offset);
|
||||
assert_eq!(compressed_size, compute_block_size(self.output[0]));
|
||||
&self.output[..compressed_size]
|
||||
}
|
||||
|
||||
pub fn compress_block_unsorted(&mut self, vals: &[u32]) -> &[u8] {
|
||||
let compressed_size = compress_unsorted(vals, &mut self.output);
|
||||
assert_eq!(compressed_size, compute_block_size(self.output[0]));
|
||||
&self.output[..compressed_size]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,6 +8,7 @@ use std::cmp;
|
||||
use fastfield::DeleteBitSet;
|
||||
use schema::Schema;
|
||||
use compression::CompressedIntStream;
|
||||
use postings::FreqReadingOption;
|
||||
|
||||
/// The inverted index reader is in charge of accessing
|
||||
/// the inverted index associated to a specific field.
|
||||
@@ -27,7 +28,7 @@ pub struct InvertedIndexReader {
|
||||
postings_source: ReadOnlySource,
|
||||
positions_source: ReadOnlySource,
|
||||
delete_bitset: DeleteBitSet,
|
||||
schema: Schema,
|
||||
record_option: IndexRecordOption
|
||||
}
|
||||
|
||||
impl InvertedIndexReader {
|
||||
@@ -36,14 +37,14 @@ impl InvertedIndexReader {
|
||||
postings_source: ReadOnlySource,
|
||||
positions_source: ReadOnlySource,
|
||||
delete_bitset: DeleteBitSet,
|
||||
schema: Schema,
|
||||
record_option: IndexRecordOption,
|
||||
) -> InvertedIndexReader {
|
||||
InvertedIndexReader {
|
||||
termdict: TermDictionaryImpl::from_source(termdict_source),
|
||||
postings_source,
|
||||
positions_source,
|
||||
delete_bitset,
|
||||
schema,
|
||||
record_option
|
||||
}
|
||||
}
|
||||
|
||||
@@ -86,15 +87,19 @@ impl InvertedIndexReader {
|
||||
pub fn read_block_postings_from_terminfo(
|
||||
&self,
|
||||
term_info: &TermInfo,
|
||||
option: IndexRecordOption,
|
||||
requested_option: IndexRecordOption
|
||||
) -> BlockSegmentPostings {
|
||||
let offset = term_info.postings_offset as usize;
|
||||
let postings_data = self.postings_source.slice_from(offset);
|
||||
let has_freq = option.has_freq();
|
||||
let freq_reading_option = match (self.record_option, requested_option) {
|
||||
(IndexRecordOption::Basic, _) => FreqReadingOption::NoFreq,
|
||||
(_, IndexRecordOption::Basic) => FreqReadingOption::SkipFreq,
|
||||
(_, _) => FreqReadingOption::ReadFreq
|
||||
};
|
||||
BlockSegmentPostings::from_data(
|
||||
term_info.doc_freq as usize,
|
||||
SourceRead::from(postings_data),
|
||||
has_freq,
|
||||
freq_reading_option
|
||||
)
|
||||
}
|
||||
|
||||
@@ -135,11 +140,8 @@ impl InvertedIndexReader {
|
||||
/// with `DocId`s and frequencies.
|
||||
pub fn read_postings(&self, term: &Term, option: IndexRecordOption) -> Option<SegmentPostings> {
|
||||
let field = term.field();
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let term_info = get!(self.get_term_info(term));
|
||||
let maximum_option = get!(field_entry.field_type().get_index_record_option());
|
||||
let best_effort_option = cmp::min(maximum_option, option);
|
||||
Some(self.read_postings_from_terminfo(&term_info, best_effort_option))
|
||||
Some(self.read_postings_from_terminfo(&term_info, option))
|
||||
}
|
||||
|
||||
/// Returns the number of documents containing the term.
|
||||
|
||||
@@ -37,6 +37,8 @@ use fastfield::MultiValueIntFastFieldReader;
|
||||
/// The segment reader has a very low memory footprint,
|
||||
/// as close to all of the memory data is mmapped.
|
||||
///
|
||||
///
|
||||
/// TODO fix not decoding docfreq
|
||||
#[derive(Clone)]
|
||||
pub struct SegmentReader {
|
||||
inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>,
|
||||
@@ -223,9 +225,14 @@ impl SegmentReader {
|
||||
return Arc::clone(inv_idx_reader);
|
||||
}
|
||||
|
||||
|
||||
let record_option = self.schema.get_field_entry(field).field_type()
|
||||
.get_index_record_option()
|
||||
.expect("Field does not seem indexed.");
|
||||
|
||||
let termdict_source: ReadOnlySource = self.termdict_composite
|
||||
.open_read(field)
|
||||
.expect("Index corrupted. Failed to open field term dictionary in composite file.");
|
||||
.expect("Failed to open field term dictionary in composite file. Is the field indexed");
|
||||
|
||||
let postings_source = self.postings_composite
|
||||
.open_read(field)
|
||||
@@ -240,7 +247,7 @@ impl SegmentReader {
|
||||
postings_source,
|
||||
positions_source,
|
||||
self.delete_bitset.clone(),
|
||||
self.schema.clone(),
|
||||
record_option
|
||||
));
|
||||
|
||||
// by releasing the lock in between, we may end up opening the inverted index
|
||||
|
||||
@@ -37,6 +37,12 @@ pub use common::HasLen;
|
||||
|
||||
pub(crate) type UnorderedTermId = usize;
|
||||
|
||||
/// Describes what a block postings reader does with term frequency data
/// when it loads a block.
///
/// The value is chosen in
/// `InvertedIndexReader::read_block_postings_from_terminfo` from the record
/// option the reader was built with and the record option requested by the
/// caller.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum FreqReadingOption {
    /// Chosen when the reader's own record option is
    /// `IndexRecordOption::Basic`. Block loading neither decodes nor skips
    /// any frequency bytes.
    NoFreq,
    /// Chosen when the reader's record option is not `Basic` but the caller
    /// requested `IndexRecordOption::Basic`. For bitpacked blocks the
    /// frequency block is stepped over without being decoded, using
    /// `compute_block_size` on its first byte.
    SkipFreq,
    /// Chosen in every other case. Frequencies are decoded alongside the
    /// doc ids.
    ReadFreq,
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
|
||||
|
||||
@@ -4,9 +4,11 @@ use common::BitSet;
|
||||
use postings::{DocSet, HasLen, Postings, SkipResult};
|
||||
use std::cmp;
|
||||
use fst::Streamer;
|
||||
use compression::compute_block_size;
|
||||
use fastfield::DeleteBitSet;
|
||||
use std::cell::UnsafeCell;
|
||||
use directory::{ReadOnlySource, SourceRead};
|
||||
use postings::FreqReadingOption;
|
||||
|
||||
const EMPTY_POSITIONS: [u32; 0] = [0u32; 0];
|
||||
|
||||
@@ -299,7 +301,7 @@ impl Postings for SegmentPostings {
|
||||
pub struct BlockSegmentPostings {
|
||||
doc_decoder: BlockDecoder,
|
||||
freq_decoder: BlockDecoder,
|
||||
has_freq: bool,
|
||||
freq_reading_option: FreqReadingOption,
|
||||
|
||||
doc_freq: usize,
|
||||
doc_offset: DocId,
|
||||
@@ -312,7 +314,7 @@ impl BlockSegmentPostings {
|
||||
pub(crate) fn from_data(
|
||||
doc_freq: usize,
|
||||
data: SourceRead,
|
||||
has_freq: bool,
|
||||
freq_reading_option: FreqReadingOption,
|
||||
) -> BlockSegmentPostings {
|
||||
let num_bitpacked_blocks: usize = (doc_freq as usize) / COMPRESSION_BLOCK_SIZE;
|
||||
let num_vint_docs = (doc_freq as usize) - COMPRESSION_BLOCK_SIZE * num_bitpacked_blocks;
|
||||
@@ -321,7 +323,7 @@ impl BlockSegmentPostings {
|
||||
num_vint_docs,
|
||||
doc_decoder: BlockDecoder::new(),
|
||||
freq_decoder: BlockDecoder::with_val(1),
|
||||
has_freq,
|
||||
freq_reading_option,
|
||||
remaining_data: data,
|
||||
doc_offset: 0,
|
||||
doc_freq,
|
||||
@@ -401,11 +403,16 @@ impl BlockSegmentPostings {
|
||||
let num_consumed_bytes = self.doc_decoder
|
||||
.uncompress_block_sorted(self.remaining_data.as_ref(), self.doc_offset);
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
|
||||
if self.has_freq {
|
||||
let num_consumed_bytes = self.freq_decoder
|
||||
.uncompress_block_unsorted(self.remaining_data.as_ref());
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
match self.freq_reading_option {
|
||||
FreqReadingOption::NoFreq => {}
|
||||
FreqReadingOption::SkipFreq => {
|
||||
let num_bytes_to_skip = compute_block_size(self.remaining_data.as_ref()[0]);
|
||||
self.remaining_data.advance(num_bytes_to_skip);
|
||||
}
|
||||
FreqReadingOption::ReadFreq => {
|
||||
let num_consumed_bytes = self.freq_decoder.uncompress_block_unsorted(self.remaining_data.as_ref());
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
}
|
||||
}
|
||||
// it will be used as the next offset.
|
||||
self.doc_offset = self.doc_decoder.output(COMPRESSION_BLOCK_SIZE - 1);
|
||||
@@ -418,9 +425,12 @@ impl BlockSegmentPostings {
|
||||
self.num_vint_docs,
|
||||
);
|
||||
self.remaining_data.advance(num_compressed_bytes);
|
||||
if self.has_freq {
|
||||
self.freq_decoder
|
||||
.uncompress_vint_unsorted(self.remaining_data.as_ref(), self.num_vint_docs);
|
||||
match self.freq_reading_option {
|
||||
FreqReadingOption::NoFreq | FreqReadingOption::SkipFreq => {}
|
||||
FreqReadingOption::ReadFreq => {
|
||||
self.freq_decoder
|
||||
.uncompress_vint_unsorted(self.remaining_data.as_ref(), self.num_vint_docs);
|
||||
}
|
||||
}
|
||||
self.num_vint_docs = 0;
|
||||
true
|
||||
@@ -437,7 +447,7 @@ impl BlockSegmentPostings {
|
||||
|
||||
doc_decoder: BlockDecoder::new(),
|
||||
freq_decoder: BlockDecoder::with_val(1),
|
||||
has_freq: false,
|
||||
freq_reading_option: FreqReadingOption::NoFreq,
|
||||
|
||||
remaining_data: From::from(ReadOnlySource::empty()),
|
||||
doc_offset: 0,
|
||||
|
||||
Reference in New Issue
Block a user