mirror of https://github.com/quickwit-oss/tantivy.git
encode some part of posting list as -1 instead of direct values (#2185)

* add support for delta-1 encoding posting list
* encode term frequency minus one
* don't emit tf for json integer terms
* make skipreader not pub(crate) mutable
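
The idea behind the diff below: within a 128-value block, doc-id gaps and term frequencies are always >= 1, so storing value - 1 can lower the block's maximum value and therefore the bit width used to bitpack it. A minimal standalone sketch of the effect (the `bits_needed` helper is hypothetical, not tantivy's `BlockEncoder` API):

/// Number of bits needed to bitpack values up to `max` (0 needs 0 bits).
fn bits_needed(max: u32) -> u8 {
    (32 - max.leading_zeros()) as u8
}

fn main() {
    // A very common block: every document contains the term exactly once.
    let tfs = [1u32; 128];
    let direct = bits_needed(tfs.iter().copied().max().unwrap());
    let minus_one = bits_needed(tfs.iter().map(|tf| tf - 1).max().unwrap());
    // 1 bit/value direct vs 0 bits/value with tf-1: the block compresses to nothing.
    println!("direct: {direct} bit(s)/value, minus-one: {minus_one} bit(s)/value");
}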
@@ -37,7 +37,7 @@ uuid = { version = "1.0.0", features = ["v4", "serde"] }
 crossbeam-channel = "0.5.4"
 rust-stemmers = "1.2.0"
 downcast-rs = "1.2.0"
-bitpacking = { version = "0.8.4", default-features = false, features = ["bitpacker4x"] }
+bitpacking = { git = "https://github.com/quickwit-oss/bitpacking", rev = "f730b75", default-features = false, features = ["bitpacker4x"] }
 census = "0.4.0"
 rustc-hash = "1.1.0"
 thiserror = "1.0.30"
@@ -109,6 +109,35 @@ mod tests_mmap {
         }
     }
 
+    #[test]
+    fn test_json_field_number() {
+        // this test was added specifically to reach some cases related to using json fields, with
+        // frequency enabled, to store integers, with enough documents containing a single integer
+        // that the posting list can be bitpacked.
+        let mut schema_builder = Schema::builder();
+
+        let json_field = schema_builder.add_json_field("json", TEXT);
+        let index = Index::create_in_ram(schema_builder.build());
+        let mut index_writer = index.writer_for_tests().unwrap();
+        for _ in 0..256 {
+            let json = serde_json::json!({"somekey": 1u64, "otherkey": -2i64});
+            index_writer.add_document(doc!(json_field=>json)).unwrap();
+
+            let json = serde_json::json!({"somekey": "1str", "otherkey": "2str"});
+            index_writer.add_document(doc!(json_field=>json)).unwrap();
+        }
+        index_writer.commit().unwrap();
+        let reader = index.reader().unwrap();
+        let searcher = reader.searcher();
+        assert_eq!(searcher.num_docs(), 512);
+        let parse_query = QueryParser::for_index(&index, Vec::new());
+        {
+            let query = parse_query.parse_query(r"json.somekey:1").unwrap();
+            let num_docs = searcher.search(&query, &Count).unwrap();
+            assert_eq!(num_docs, 256);
+        }
+    }
+
     #[test]
     fn test_json_field_expand_dots_enabled_dot_escape_not_required() {
         let mut schema_builder = Schema::builder();
@@ -92,7 +92,7 @@ impl PositionReader {
             // that block is bitpacked.
             let bit_width = bit_widths[block_rel_id];
             self.block_decoder
-                .uncompress_block_unsorted(compressed_data, bit_width);
+                .uncompress_block_unsorted(compressed_data, bit_width, false);
         } else {
             // that block is vint encoded.
             self.block_decoder
@@ -62,8 +62,9 @@ impl<W: io::Write> PositionSerializer<W> {
             return;
         }
         if self.block.len() == COMPRESSION_BLOCK_SIZE {
-            let (bit_width, block_encoded): (u8, &[u8]) =
-                self.block_encoder.compress_block_unsorted(&self.block[..]);
+            let (bit_width, block_encoded): (u8, &[u8]) = self
+                .block_encoder
+                .compress_block_unsorted(&self.block[..], false);
             self.bit_widths.push(bit_width);
             self.positions_buffer.extend(block_encoded);
         } else {
@@ -24,13 +24,13 @@ fn max_score<I: Iterator<Item = Score>>(mut it: I) -> Option<Score> {
 #[derive(Clone)]
 pub struct BlockSegmentPostings {
     pub(crate) doc_decoder: BlockDecoder,
-    loaded_offset: usize,
+    block_loaded: bool,
     freq_decoder: BlockDecoder,
     freq_reading_option: FreqReadingOption,
     block_max_score_cache: Option<Score>,
     doc_freq: u32,
     data: OwnedBytes,
-    pub(crate) skip_reader: SkipReader,
+    skip_reader: SkipReader,
 }
 
 fn decode_bitpacked_block(
@@ -40,10 +40,16 @@ fn decode_bitpacked_block(
     doc_offset: DocId,
     doc_num_bits: u8,
     tf_num_bits: u8,
+    strict_delta: bool,
 ) {
-    let num_consumed_bytes = doc_decoder.uncompress_block_sorted(data, doc_offset, doc_num_bits);
+    let num_consumed_bytes =
+        doc_decoder.uncompress_block_sorted(data, doc_offset, doc_num_bits, strict_delta);
     if let Some(freq_decoder) = freq_decoder_opt {
-        freq_decoder.uncompress_block_unsorted(&data[num_consumed_bytes..], tf_num_bits);
+        freq_decoder.uncompress_block_unsorted(
+            &data[num_consumed_bytes..],
+            tf_num_bits,
+            strict_delta,
+        );
     }
 }
@@ -57,11 +63,15 @@ fn decode_vint_block(
     let num_consumed_bytes =
         doc_decoder.uncompress_vint_sorted(data, doc_offset, num_vint_docs, TERMINATED);
     if let Some(freq_decoder) = freq_decoder_opt {
-        freq_decoder.uncompress_vint_unsorted(
-            &data[num_consumed_bytes..],
-            num_vint_docs,
-            TERMINATED,
-        );
+        // if it's a json term with freq, containing less than 256 docs, we can reach here thinking
+        // we have a freq, despite not really having one.
+        if data.len() > num_consumed_bytes {
+            freq_decoder.uncompress_vint_unsorted(
+                &data[num_consumed_bytes..],
+                num_vint_docs,
+                TERMINATED,
+            );
+        }
     }
 }
@@ -78,28 +88,46 @@ fn split_into_skips_and_postings(
 }
 
 impl BlockSegmentPostings {
     /// Opens a `BlockSegmentPostings`.
     /// `doc_freq` is the number of documents in the posting list.
     /// `record_option` represents the amount of data available according to the schema.
     /// `requested_option` is the amount of data requested by the user.
     /// If for instance, we do not request for term frequencies, this function will not decompress
     /// term frequency blocks.
     pub(crate) fn open(
         doc_freq: u32,
         data: FileSlice,
-        record_option: IndexRecordOption,
+        mut record_option: IndexRecordOption,
         requested_option: IndexRecordOption,
     ) -> io::Result<BlockSegmentPostings> {
+        let bytes = data.read_bytes()?;
+        let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, bytes)?;
+        let skip_reader = match skip_data_opt {
+            Some(skip_data) => {
+                let block_count = doc_freq as usize / COMPRESSION_BLOCK_SIZE;
+                // 8 is the minimum size of a block with frequency (can be more if pos are stored
+                // too)
+                if skip_data.len() < 8 * block_count {
+                    // the field might be encoded with frequency, but this term in particular isn't.
+                    // This can happen for JSON field with term frequencies:
+                    // - text terms are encoded with term freqs.
+                    // - numerical terms are encoded without term freqs.
+                    record_option = IndexRecordOption::Basic;
+                }
+                SkipReader::new(skip_data, doc_freq, record_option)
+            }
+            None => SkipReader::new(OwnedBytes::empty(), doc_freq, record_option),
+        };
+
         let freq_reading_option = match (record_option, requested_option) {
             (IndexRecordOption::Basic, _) => FreqReadingOption::NoFreq,
             (_, IndexRecordOption::Basic) => FreqReadingOption::SkipFreq,
             (_, _) => FreqReadingOption::ReadFreq,
         };
 
-        let bytes = data.read_bytes()?;
-        let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, bytes)?;
-        let skip_reader = match skip_data_opt {
-            Some(skip_data) => SkipReader::new(skip_data, doc_freq, record_option),
-            None => SkipReader::new(OwnedBytes::empty(), doc_freq, record_option),
-        };
-
         let mut block_segment_postings = BlockSegmentPostings {
             doc_decoder: BlockDecoder::with_val(TERMINATED),
-            loaded_offset: usize::MAX,
+            block_loaded: false,
             freq_decoder: BlockDecoder::with_val(1),
             freq_reading_option,
             block_max_score_cache: None,
@@ -169,7 +197,7 @@ impl BlockSegmentPostings {
             split_into_skips_and_postings(doc_freq, postings_data)?;
         self.data = postings_data;
         self.block_max_score_cache = None;
-        self.loaded_offset = usize::MAX;
+        self.block_loaded = false;
         if let Some(skip_data) = skip_data_opt {
            self.skip_reader.reset(skip_data, doc_freq);
         } else {
@@ -265,22 +293,23 @@ impl BlockSegmentPostings {
     pub(crate) fn shallow_seek(&mut self, target_doc: DocId) {
         if self.skip_reader.seek(target_doc) {
             self.block_max_score_cache = None;
+            self.block_loaded = false;
         }
     }
 
     pub(crate) fn block_is_loaded(&self) -> bool {
-        self.loaded_offset == self.skip_reader.byte_offset()
+        self.block_loaded
     }
 
     pub(crate) fn load_block(&mut self) {
         let offset = self.skip_reader.byte_offset();
-        if self.loaded_offset == offset {
+        if self.block_is_loaded() {
             return;
         }
-        self.loaded_offset = offset;
         match self.skip_reader.block_info() {
             BlockInfo::BitPacked {
                 doc_num_bits,
+                strict_delta_encoded,
                 tf_num_bits,
                 ..
             } => {
@@ -295,6 +324,7 @@ impl BlockSegmentPostings {
                     self.skip_reader.last_doc_in_previous_block,
                     doc_num_bits,
                     tf_num_bits,
+                    strict_delta_encoded,
                 );
             }
             BlockInfo::VInt { num_docs } => {
@@ -318,13 +348,13 @@ impl BlockSegmentPostings {
                 );
             }
         }
+        self.block_loaded = true;
     }
 
     /// Advance to the next block.
     ///
     /// Returns false if and only if there is no remaining block.
     pub fn advance(&mut self) {
         self.skip_reader.advance();
+        self.block_loaded = false;
         self.block_max_score_cache = None;
         self.load_block();
     }
@@ -333,7 +363,7 @@ impl BlockSegmentPostings {
     pub fn empty() -> BlockSegmentPostings {
         BlockSegmentPostings {
             doc_decoder: BlockDecoder::with_val(TERMINATED),
-            loaded_offset: 0,
+            block_loaded: true,
             freq_decoder: BlockDecoder::with_val(1),
             freq_reading_option: FreqReadingOption::NoFreq,
             block_max_score_cache: None,
@@ -342,6 +372,10 @@ impl BlockSegmentPostings {
             skip_reader: SkipReader::new(OwnedBytes::empty(), 0, IndexRecordOption::Basic),
         }
     }
+
+    pub(crate) fn skip_reader(&self) -> &SkipReader {
+        &self.skip_reader
+    }
 }
 
 #[cfg(test)]
@@ -33,14 +33,40 @@ impl BlockEncoder {
     }
 
     pub fn compress_block_sorted(&mut self, block: &[u32], offset: u32) -> (u8, &[u8]) {
-        let num_bits = self.bitpacker.num_bits_sorted(offset, block);
+        // if offset is zero, convert it to None. This is correct as long as we do the same when
+        // decompressing. It's required in case the block starts with an actual zero.
+        let offset = if offset == 0u32 { None } else { Some(offset) };
+
+        let num_bits = self.bitpacker.num_bits_strictly_sorted(offset, block);
         let written_size =
             self.bitpacker
-                .compress_sorted(offset, block, &mut self.output[..], num_bits);
+                .compress_strictly_sorted(offset, block, &mut self.output[..], num_bits);
         (num_bits, &self.output[..written_size])
     }
 
-    pub fn compress_block_unsorted(&mut self, block: &[u32]) -> (u8, &[u8]) {
+    /// Compress a single block of unsorted numbers.
+    ///
+    /// If `minus_one_encoded` is set, each value must be >= 1, and will be encoded in a slightly
+    /// more compact format. This is useful for values where 0 isn't valid, such as term
+    /// frequencies, but it is incorrect for usages like position lists, where 0 can appear.
+    pub fn compress_block_unsorted(
+        &mut self,
+        block: &[u32],
+        minus_one_encoded: bool,
+    ) -> (u8, &[u8]) {
+        debug_assert!(!minus_one_encoded || !block.contains(&0));
+
+        let mut block_minus_one = [0; COMPRESSION_BLOCK_SIZE];
+        let block = if minus_one_encoded {
+            for (elem_min_one, elem) in block_minus_one.iter_mut().zip(block) {
+                *elem_min_one = elem - 1;
+            }
+            &block_minus_one
+        } else {
+            block
+        };
+
         let num_bits = self.bitpacker.num_bits(block);
         let written_size = self
             .bitpacker
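On the sorted (doc-id) side, the switch above from `compress_sorted` to `compress_strictly_sorted` relies on doc ids being strictly increasing: every delta is >= 1, so `delta - 1` can be stored instead. Below is a sketch of that delta-1 convention in plain Rust, assuming (as the `offset == 0` to `None` comment above suggests) that a `None` offset means "no previous doc", so the first value is stored as-is and may legitimately be 0; the real bitpacking fork additionally bitpacks these deltas:

// Strict-delta (delta-1) encoding of strictly increasing doc ids.
fn encode_strict(docs: &[u32], offset: Option<u32>) -> Vec<u32> {
    let mut out = Vec::with_capacity(docs.len());
    let mut prev = offset;
    for &doc in docs {
        out.push(match prev {
            // every doc is strictly greater than its predecessor, so the gap is
            // >= 1 and we can store `gap - 1`, shaving the max value (and with it
            // the bit width of the packed block)
            Some(p) => doc - p - 1,
            None => doc,
        });
        prev = Some(doc);
    }
    out
}

fn decode_strict(deltas: &[u32], offset: Option<u32>) -> Vec<u32> {
    let mut out = Vec::with_capacity(deltas.len());
    let mut prev = offset;
    for &d in deltas {
        let doc = match prev {
            Some(p) => p + d + 1,
            None => d,
        };
        out.push(doc);
        prev = Some(doc);
    }
    out
}

fn main() {
    let docs = [0u32, 1, 5, 9];
    let deltas = encode_strict(&docs, None);
    assert_eq!(deltas, [0, 0, 3, 3]); // note the doc starting at 0 round-trips
    assert_eq!(decode_strict(&deltas, None), docs);
}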
@@ -71,21 +97,55 @@ impl BlockDecoder {
         }
     }
 
+    /// Decompress block of sorted integers.
+    ///
+    /// `strict_delta` depends on what encoding was used. Older versions of tantivy never use
+    /// strict deltas; newer versions always use them.
     pub fn uncompress_block_sorted(
         &mut self,
         compressed_data: &[u8],
         offset: u32,
         num_bits: u8,
+        strict_delta: bool,
     ) -> usize {
-        self.output_len = COMPRESSION_BLOCK_SIZE;
-        self.bitpacker
-            .decompress_sorted(offset, compressed_data, &mut self.output, num_bits)
+        if strict_delta {
+            let offset = std::num::NonZeroU32::new(offset).map(std::num::NonZeroU32::get);
+
+            self.output_len = COMPRESSION_BLOCK_SIZE;
+            self.bitpacker.decompress_strictly_sorted(
+                offset,
+                compressed_data,
+                &mut self.output,
+                num_bits,
+            )
+        } else {
+            self.output_len = COMPRESSION_BLOCK_SIZE;
+            self.bitpacker
+                .decompress_sorted(offset, compressed_data, &mut self.output, num_bits)
+        }
     }
 
-    pub fn uncompress_block_unsorted(&mut self, compressed_data: &[u8], num_bits: u8) -> usize {
+    /// Decompress block of unsorted integers.
+    ///
+    /// `minus_one_encoded` depends on what encoding was used. Older versions of tantivy never
+    /// use that encoding. Newer versions use it for some structures, but not all. See the
+    /// corresponding call to `BlockEncoder::compress_block_unsorted`.
+    pub fn uncompress_block_unsorted(
+        &mut self,
+        compressed_data: &[u8],
+        num_bits: u8,
+        minus_one_encoded: bool,
+    ) -> usize {
         self.output_len = COMPRESSION_BLOCK_SIZE;
-        self.bitpacker
-            .decompress(compressed_data, &mut self.output, num_bits)
+        let res = self
+            .bitpacker
+            .decompress(compressed_data, &mut self.output, num_bits);
+        if minus_one_encoded {
+            for val in &mut self.output {
+                *val += 1;
+            }
+        }
+        res
     }
 
     #[inline]
@@ -218,7 +278,8 @@ pub mod tests {
         let (num_bits, compressed_data) = encoder.compress_block_sorted(&vals, 0);
         let mut decoder = BlockDecoder::default();
         {
-            let consumed_num_bytes = decoder.uncompress_block_sorted(compressed_data, 0, num_bits);
+            let consumed_num_bytes =
+                decoder.uncompress_block_sorted(compressed_data, 0, num_bits, true);
             assert_eq!(consumed_num_bytes, compressed_data.len());
         }
         for i in 0..128 {
@@ -233,7 +294,8 @@ pub mod tests {
         let (num_bits, compressed_data) = encoder.compress_block_sorted(&vals, 10);
         let mut decoder = BlockDecoder::default();
         {
-            let consumed_num_bytes = decoder.uncompress_block_sorted(compressed_data, 10, num_bits);
+            let consumed_num_bytes =
+                decoder.uncompress_block_sorted(compressed_data, 10, num_bits, true);
             assert_eq!(consumed_num_bytes, compressed_data.len());
         }
         for i in 0..128 {
@@ -252,7 +314,8 @@ pub mod tests {
         compressed.push(173u8);
         let mut decoder = BlockDecoder::default();
         {
-            let consumed_num_bytes = decoder.uncompress_block_sorted(&compressed, 10, num_bits);
+            let consumed_num_bytes =
+                decoder.uncompress_block_sorted(&compressed, 10, num_bits, true);
             assert_eq!(consumed_num_bytes, compressed.len() - 1);
             assert_eq!(compressed[consumed_num_bytes], 173u8);
         }
@@ -263,21 +326,25 @@ pub mod tests {
 
     #[test]
     fn test_encode_unsorted_block_with_junk() {
-        let mut compressed: Vec<u8> = Vec::new();
-        let n = 128;
-        let vals: Vec<u32> = (0..n).map(|i| 11u32 + (i as u32) * 7u32 % 12).collect();
-        let mut encoder = BlockEncoder::default();
-        let (num_bits, compressed_data) = encoder.compress_block_unsorted(&vals);
-        compressed.extend_from_slice(compressed_data);
-        compressed.push(173u8);
-        let mut decoder = BlockDecoder::default();
-        {
-            let consumed_num_bytes = decoder.uncompress_block_unsorted(&compressed, num_bits);
-            assert_eq!(consumed_num_bytes + 1, compressed.len());
-            assert_eq!(compressed[consumed_num_bytes], 173u8);
-        }
-        for i in 0..n {
-            assert_eq!(vals[i], decoder.output(i));
-        }
+        for minus_one_encode in [false, true] {
+            let mut compressed: Vec<u8> = Vec::new();
+            let n = 128;
+            let vals: Vec<u32> = (0..n).map(|i| 11u32 + (i as u32) * 7u32 % 12).collect();
+            let mut encoder = BlockEncoder::default();
+            let (num_bits, compressed_data) =
+                encoder.compress_block_unsorted(&vals, minus_one_encode);
+            compressed.extend_from_slice(compressed_data);
+            compressed.push(173u8);
+            let mut decoder = BlockDecoder::default();
+            {
+                let consumed_num_bytes =
+                    decoder.uncompress_block_unsorted(&compressed, num_bits, minus_one_encode);
+                assert_eq!(consumed_num_bytes + 1, compressed.len());
+                assert_eq!(compressed[consumed_num_bytes], 173u8);
+            }
+            for i in 0..n {
+                assert_eq!(vals[i], decoder.output(i));
+            }
+        }
     }
@@ -344,7 +411,7 @@ mod bench {
         let (num_bits, compressed) = encoder.compress_block_sorted(&data, 0u32);
         let mut decoder = BlockDecoder::default();
         b.iter(|| {
-            decoder.uncompress_block_sorted(compressed, 0u32, num_bits);
+            decoder.uncompress_block_sorted(compressed, 0u32, num_bits, true);
         });
     }
@@ -301,6 +301,7 @@ pub struct PostingsSerializer<W: Write> {
     bm25_weight: Option<Bm25Weight>,
     avg_fieldnorm: Score, /* Average number of term in the field for that segment.
                            * this value is used to compute the block wand information. */
+    term_has_freq: bool,
 }
 
 impl<W: Write> PostingsSerializer<W> {
@@ -325,13 +326,15 @@ impl<W: Write> PostingsSerializer<W> {
             fieldnorm_reader,
             bm25_weight: None,
             avg_fieldnorm,
+            term_has_freq: false,
         }
     }
 
     pub fn new_term(&mut self, term_doc_freq: u32) {
         self.bm25_weight = None;
 
-        if !self.mode.has_freq() {
+        self.term_has_freq = self.mode.has_freq() && term_doc_freq != 0;
+        if !self.term_has_freq {
             return;
         }
@@ -365,10 +368,10 @@ impl<W: Write> PostingsSerializer<W> {
             // last el block 0, offset block 1,
             self.postings_write.extend(block_encoded);
         }
-        if self.mode.has_freq() {
+        if self.term_has_freq {
             let (num_bits, block_encoded): (u8, &[u8]) = self
                 .block_encoder
-                .compress_block_unsorted(self.block.term_freqs());
+                .compress_block_unsorted(self.block.term_freqs(), true);
             self.postings_write.extend(block_encoded);
             self.skip_write.write_term_freq(num_bits);
             if self.mode.has_positions() {
@@ -432,7 +435,7 @@ impl<W: Write> PostingsSerializer<W> {
             self.postings_write.write_all(block_encoded)?;
         }
         // ... Idem for term frequencies
-        if self.mode.has_freq() {
+        if self.term_has_freq {
             let block_encoded = self
                 .block_encoder
                 .compress_vint_unsorted(self.block.term_freqs());
@@ -6,6 +6,22 @@ use crate::query::Bm25Weight;
 use crate::schema::IndexRecordOption;
 use crate::{DocId, Score, TERMINATED};
 
+// doc num bits uses the following encoding:
+// given 0b a b cdefgh
+//         |1|2|  3   |
+// - 1: unused
+// - 2: is delta-1 encoded. 0 if not, 1 if yes
+// - 3: a 6 bit number in 0..=32, the actual bitwidth
+fn encode_bitwidth(bitwidth: u8, delta_1: bool) -> u8 {
+    bitwidth | ((delta_1 as u8) << 6)
+}
+
+fn decode_bitwidth(raw_bitwidth: u8) -> (u8, bool) {
+    let delta_1 = (raw_bitwidth >> 6 & 1) != 0;
+    let bitwidth = raw_bitwidth & 0x3f;
+    (bitwidth, delta_1)
+}
+
 #[inline]
 fn encode_block_wand_max_tf(max_tf: u32) -> u8 {
     max_tf.min(u8::MAX as u32) as u8
@@ -41,7 +57,7 @@ impl SkipSerializer {
 
     pub fn write_doc(&mut self, last_doc: DocId, doc_num_bits: u8) {
         write_u32(last_doc, &mut self.buffer);
-        self.buffer.push(doc_num_bits);
+        self.buffer.push(encode_bitwidth(doc_num_bits, true));
     }
 
     pub fn write_term_freq(&mut self, tf_num_bits: u8) {
@@ -85,6 +101,7 @@ pub(crate) struct SkipReader {
 pub(crate) enum BlockInfo {
     BitPacked {
         doc_num_bits: u8,
+        strict_delta_encoded: bool,
         tf_num_bits: u8,
         tf_sum: u32,
         block_wand_fieldnorm_id: u8,
@@ -172,12 +189,13 @@ impl SkipReader {
         let bytes = self.owned_read.as_slice();
         let advance_len: usize;
         self.last_doc_in_block = read_u32(bytes);
-        let doc_num_bits = bytes[4];
+        let (doc_num_bits, strict_delta_encoded) = decode_bitwidth(bytes[4]);
         match self.skip_info {
             IndexRecordOption::Basic => {
                 advance_len = 5;
                 self.block_info = BlockInfo::BitPacked {
                     doc_num_bits,
+                    strict_delta_encoded,
                     tf_num_bits: 0,
                     tf_sum: 0,
                     block_wand_fieldnorm_id: 0,
@@ -191,6 +209,7 @@ impl SkipReader {
                 advance_len = 8;
                 self.block_info = BlockInfo::BitPacked {
                     doc_num_bits,
+                    strict_delta_encoded,
                     tf_num_bits,
                     tf_sum: 0,
                     block_wand_fieldnorm_id,
@@ -205,6 +224,7 @@ impl SkipReader {
                 advance_len = 12;
                 self.block_info = BlockInfo::BitPacked {
                     doc_num_bits,
+                    strict_delta_encoded,
                     tf_num_bits,
                     tf_sum,
                     block_wand_fieldnorm_id,
@@ -268,7 +288,9 @@ impl SkipReader {
 #[cfg(test)]
 mod tests {
 
-    use super::{BlockInfo, IndexRecordOption, SkipReader, SkipSerializer};
+    use super::{
+        decode_bitwidth, encode_bitwidth, BlockInfo, IndexRecordOption, SkipReader, SkipSerializer,
+    };
     use crate::directory::OwnedBytes;
     use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
@@ -310,6 +332,7 @@ mod tests {
             skip_reader.block_info,
             BlockInfo::BitPacked {
                 doc_num_bits: 2u8,
+                strict_delta_encoded: true,
                 tf_num_bits: 3u8,
                 tf_sum: 0,
                 block_wand_fieldnorm_id: 13,
@@ -322,6 +345,7 @@ mod tests {
             skip_reader.block_info(),
             BlockInfo::BitPacked {
                 doc_num_bits: 5u8,
+                strict_delta_encoded: true,
                 tf_num_bits: 2u8,
                 tf_sum: 0,
                 block_wand_fieldnorm_id: 8,
@@ -352,6 +376,7 @@ mod tests {
             skip_reader.block_info(),
             BlockInfo::BitPacked {
                 doc_num_bits: 2u8,
+                strict_delta_encoded: true,
                 tf_num_bits: 0,
                 tf_sum: 0u32,
                 block_wand_fieldnorm_id: 0,
@@ -364,6 +389,7 @@ mod tests {
             skip_reader.block_info(),
             BlockInfo::BitPacked {
                 doc_num_bits: 5u8,
+                strict_delta_encoded: true,
                 tf_num_bits: 0,
                 tf_sum: 0u32,
                 block_wand_fieldnorm_id: 0,
@@ -393,6 +419,7 @@ mod tests {
             skip_reader.block_info(),
             BlockInfo::BitPacked {
                 doc_num_bits: 2u8,
+                strict_delta_encoded: true,
                 tf_num_bits: 0,
                 tf_sum: 0u32,
                 block_wand_fieldnorm_id: 0,
@@ -402,4 +429,18 @@ mod tests {
         skip_reader.advance();
         assert_eq!(skip_reader.block_info(), BlockInfo::VInt { num_docs: 0u32 });
     }
+
+    #[test]
+    fn test_encode_decode_bitwidth() {
+        for bitwidth in 0..=32 {
+            for delta_1 in [false, true] {
+                assert_eq!(
+                    (bitwidth, delta_1),
+                    decode_bitwidth(encode_bitwidth(bitwidth, delta_1))
+                );
+            }
+        }
+        assert_eq!(0b01000010, encode_bitwidth(0b10, true));
+        assert_eq!(0b00000010, encode_bitwidth(0b10, false));
+    }
 }
@@ -93,7 +93,7 @@ impl TermScorer {
     }
 
     pub fn last_doc_in_block(&self) -> DocId {
-        self.postings.block_cursor.skip_reader.last_doc_in_block()
+        self.postings.block_cursor.skip_reader().last_doc_in_block()
     }
 }