encode some part of posting list as -1 instead of direct values (#2185)

* add support for delta-1 encoding of posting lists

* encode term frequency minus one

* don't emit tf for json integer terms

* make skipreader not pub(crate) mutable
Author: trinity-1686a
Date: 2023-10-20 16:58:26 +02:00
Committed by: GitHub
Parent: c2b0469180
Commit: 0d4589219b
9 changed files with 240 additions and 65 deletions


@@ -37,7 +37,7 @@ uuid = { version = "1.0.0", features = ["v4", "serde"] }
crossbeam-channel = "0.5.4"
rust-stemmers = "1.2.0"
downcast-rs = "1.2.0"
-bitpacking = { version = "0.8.4", default-features = false, features = ["bitpacker4x"] }
+bitpacking = { git = "https://github.com/quickwit-oss/bitpacking", rev = "f730b75", default-features = false, features = ["bitpacker4x"] }
census = "0.4.0"
rustc-hash = "1.1.0"
thiserror = "1.0.30"


@@ -109,6 +109,35 @@ mod tests_mmap {
}
}
+#[test]
+fn test_json_field_number() {
+// this test was added specifically to reach some cases related to using json fields, with
+// frequencies enabled, to store integers, with enough documents containing a single integer
+// that the posting list can be bitpacked.
+let mut schema_builder = Schema::builder();
+let json_field = schema_builder.add_json_field("json", TEXT);
+let index = Index::create_in_ram(schema_builder.build());
+let mut index_writer = index.writer_for_tests().unwrap();
+for _ in 0..256 {
+let json = serde_json::json!({"somekey": 1u64, "otherkey": -2i64});
+index_writer.add_document(doc!(json_field=>json)).unwrap();
+let json = serde_json::json!({"somekey": "1str", "otherkey": "2str"});
+index_writer.add_document(doc!(json_field=>json)).unwrap();
+}
+index_writer.commit().unwrap();
+let reader = index.reader().unwrap();
+let searcher = reader.searcher();
+assert_eq!(searcher.num_docs(), 512);
+let parse_query = QueryParser::for_index(&index, Vec::new());
+{
+let query = parse_query.parse_query(r"json.somekey:1").unwrap();
+let num_docs = searcher.search(&query, &Count).unwrap();
+assert_eq!(num_docs, 256);
+}
+}
#[test]
fn test_json_field_expand_dots_enabled_dot_escape_not_required() {
let mut schema_builder = Schema::builder();


@@ -92,7 +92,7 @@ impl PositionReader {
// that block is bitpacked.
let bit_width = bit_widths[block_rel_id];
self.block_decoder
-.uncompress_block_unsorted(compressed_data, bit_width);
+.uncompress_block_unsorted(compressed_data, bit_width, false);
} else {
// that block is vint encoded.
self.block_decoder


@@ -62,8 +62,9 @@ impl<W: io::Write> PositionSerializer<W> {
return;
}
if self.block.len() == COMPRESSION_BLOCK_SIZE {
-let (bit_width, block_encoded): (u8, &[u8]) =
-self.block_encoder.compress_block_unsorted(&self.block[..]);
+let (bit_width, block_encoded): (u8, &[u8]) = self
+.block_encoder
+.compress_block_unsorted(&self.block[..], false);
self.bit_widths.push(bit_width);
self.positions_buffer.extend(block_encoded);
} else {


@@ -24,13 +24,13 @@ fn max_score<I: Iterator<Item = Score>>(mut it: I) -> Option<Score> {
#[derive(Clone)]
pub struct BlockSegmentPostings {
pub(crate) doc_decoder: BlockDecoder,
-loaded_offset: usize,
+block_loaded: bool,
freq_decoder: BlockDecoder,
freq_reading_option: FreqReadingOption,
block_max_score_cache: Option<Score>,
doc_freq: u32,
data: OwnedBytes,
-pub(crate) skip_reader: SkipReader,
+skip_reader: SkipReader,
}
fn decode_bitpacked_block(
@@ -40,10 +40,16 @@ fn decode_bitpacked_block(
doc_offset: DocId,
doc_num_bits: u8,
tf_num_bits: u8,
+strict_delta: bool,
) {
-let num_consumed_bytes = doc_decoder.uncompress_block_sorted(data, doc_offset, doc_num_bits);
+let num_consumed_bytes =
+doc_decoder.uncompress_block_sorted(data, doc_offset, doc_num_bits, strict_delta);
if let Some(freq_decoder) = freq_decoder_opt {
-freq_decoder.uncompress_block_unsorted(&data[num_consumed_bytes..], tf_num_bits);
+freq_decoder.uncompress_block_unsorted(
+&data[num_consumed_bytes..],
+tf_num_bits,
+strict_delta,
+);
}
}
@@ -57,11 +63,15 @@ fn decode_vint_block(
let num_consumed_bytes =
doc_decoder.uncompress_vint_sorted(data, doc_offset, num_vint_docs, TERMINATED);
if let Some(freq_decoder) = freq_decoder_opt {
-freq_decoder.uncompress_vint_unsorted(
-&data[num_consumed_bytes..],
-num_vint_docs,
-TERMINATED,
-);
+// if it's a json term with freqs, containing fewer than 256 docs, we can reach this point
+// thinking we have freqs to decode, despite not really having any.
+if data.len() > num_consumed_bytes {
+freq_decoder.uncompress_vint_unsorted(
+&data[num_consumed_bytes..],
+num_vint_docs,
+TERMINATED,
+);
+}
}
}
@@ -78,28 +88,46 @@ fn split_into_skips_and_postings(
}
impl BlockSegmentPostings {
/// Opens a `BlockSegmentPostings`.
/// `doc_freq` is the number of documents in the posting list.
/// `record_option` represents the amount of data available according to the schema.
/// `requested_option` is the amount of data requested by the user.
/// If, for instance, we do not request term frequencies, this function will not decompress
/// term frequency blocks.
pub(crate) fn open(
doc_freq: u32,
data: FileSlice,
-record_option: IndexRecordOption,
+mut record_option: IndexRecordOption,
requested_option: IndexRecordOption,
) -> io::Result<BlockSegmentPostings> {
+let bytes = data.read_bytes()?;
+let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, bytes)?;
+let skip_reader = match skip_data_opt {
+Some(skip_data) => {
+let block_count = doc_freq as usize / COMPRESSION_BLOCK_SIZE;
+// 8 bytes is the minimum size of a skip entry for a block with frequencies (it can
+// be more if positions are stored too)
+if skip_data.len() < 8 * block_count {
+// the field might be encoded with frequencies, but this term in particular isn't.
+// This can happen for JSON fields with term frequencies:
+// - text terms are encoded with term freqs.
+// - numerical terms are encoded without term freqs.
+record_option = IndexRecordOption::Basic;
+}
+SkipReader::new(skip_data, doc_freq, record_option)
+}
+None => SkipReader::new(OwnedBytes::empty(), doc_freq, record_option),
+};
let freq_reading_option = match (record_option, requested_option) {
(IndexRecordOption::Basic, _) => FreqReadingOption::NoFreq,
(_, IndexRecordOption::Basic) => FreqReadingOption::SkipFreq,
(_, _) => FreqReadingOption::ReadFreq,
};
-let bytes = data.read_bytes()?;
-let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, bytes)?;
-let skip_reader = match skip_data_opt {
-Some(skip_data) => SkipReader::new(skip_data, doc_freq, record_option),
-None => SkipReader::new(OwnedBytes::empty(), doc_freq, record_option),
-};
let mut block_segment_postings = BlockSegmentPostings {
doc_decoder: BlockDecoder::with_val(TERMINATED),
-loaded_offset: usize::MAX,
+block_loaded: false,
freq_decoder: BlockDecoder::with_val(1),
freq_reading_option,
block_max_score_cache: None,
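For context on the length check in the hunk above: each fully bitpacked block contributes a fixed-size entry to the skip data, 5 bytes without term freqs and at least 8 bytes with them (these are the advance_len values read back by SkipReader further down). A minimal sketch of the same test, where term_written_without_freqs is a hypothetical helper, not a function from this commit:

fn term_written_without_freqs(skip_data_len: usize, doc_freq: u32) -> bool {
    const COMPRESSION_BLOCK_SIZE: usize = 128;
    // one skip entry per fully bitpacked block of 128 docs
    let block_count = doc_freq as usize / COMPRESSION_BLOCK_SIZE;
    // with freqs, an entry takes at least 8 bytes: last_doc (4), doc bitwidth
    // byte (1), tf bitwidth (1), block-wand fieldnorm id (1), block-wand tf (1).
    // A shorter skip section means freqs were never written for this term.
    skip_data_len < 8 * block_count
}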
@@ -169,7 +197,7 @@ impl BlockSegmentPostings {
split_into_skips_and_postings(doc_freq, postings_data)?;
self.data = postings_data;
self.block_max_score_cache = None;
-self.loaded_offset = usize::MAX;
+self.block_loaded = false;
if let Some(skip_data) = skip_data_opt {
self.skip_reader.reset(skip_data, doc_freq);
} else {
@@ -265,22 +293,23 @@ impl BlockSegmentPostings {
pub(crate) fn shallow_seek(&mut self, target_doc: DocId) {
if self.skip_reader.seek(target_doc) {
self.block_max_score_cache = None;
+self.block_loaded = false;
}
}
pub(crate) fn block_is_loaded(&self) -> bool {
-self.loaded_offset == self.skip_reader.byte_offset()
+self.block_loaded
}
pub(crate) fn load_block(&mut self) {
-let offset = self.skip_reader.byte_offset();
-if self.loaded_offset == offset {
+if self.block_is_loaded() {
return;
}
-self.loaded_offset = offset;
match self.skip_reader.block_info() {
BlockInfo::BitPacked {
doc_num_bits,
+strict_delta_encoded,
tf_num_bits,
..
} => {
@@ -295,6 +324,7 @@ impl BlockSegmentPostings {
self.skip_reader.last_doc_in_previous_block,
doc_num_bits,
tf_num_bits,
+strict_delta_encoded,
);
}
BlockInfo::VInt { num_docs } => {
@@ -318,13 +348,13 @@ impl BlockSegmentPostings {
);
}
}
+self.block_loaded = true;
}
/// Advance to the next block.
///
/// Returns false if and only if there is no remaining block.
pub fn advance(&mut self) {
self.skip_reader.advance();
+self.block_loaded = false;
self.block_max_score_cache = None;
self.load_block();
}
@@ -333,7 +363,7 @@ impl BlockSegmentPostings {
pub fn empty() -> BlockSegmentPostings {
BlockSegmentPostings {
doc_decoder: BlockDecoder::with_val(TERMINATED),
-loaded_offset: 0,
+block_loaded: true,
freq_decoder: BlockDecoder::with_val(1),
freq_reading_option: FreqReadingOption::NoFreq,
block_max_score_cache: None,
@@ -342,6 +372,10 @@ impl BlockSegmentPostings {
skip_reader: SkipReader::new(OwnedBytes::empty(), 0, IndexRecordOption::Basic),
}
}
+pub(crate) fn skip_reader(&self) -> &SkipReader {
+&self.skip_reader
+}
}
#[cfg(test)]


@@ -33,14 +33,40 @@ impl BlockEncoder {
}
pub fn compress_block_sorted(&mut self, block: &[u32], offset: u32) -> (u8, &[u8]) {
-let num_bits = self.bitpacker.num_bits_sorted(offset, block);
+// if offset is zero, convert it to None. This is correct as long as we do the same when
+// decompressing. It's required in case the block starts with an actual zero.
+let offset = if offset == 0u32 { None } else { Some(offset) };
+let num_bits = self.bitpacker.num_bits_strictly_sorted(offset, block);
let written_size =
self.bitpacker
-.compress_sorted(offset, block, &mut self.output[..], num_bits);
+.compress_strictly_sorted(offset, block, &mut self.output[..], num_bits);
(num_bits, &self.output[..written_size])
}
-pub fn compress_block_unsorted(&mut self, block: &[u32]) -> (u8, &[u8]) {
+/// Compress a single block of unsorted numbers.
+///
+/// If `minus_one_encoded` is set, each value must be >= 1, and will be encoded in a slightly
+/// more compact format. This is useful for values where 0 cannot occur, such as term
+/// frequencies, but is incorrect for usages like position lists, where 0 can appear.
+pub fn compress_block_unsorted(
+&mut self,
+block: &[u32],
+minus_one_encoded: bool,
+) -> (u8, &[u8]) {
+debug_assert!(!minus_one_encoded || !block.contains(&0));
+let mut block_minus_one = [0; COMPRESSION_BLOCK_SIZE];
+let block = if minus_one_encoded {
+for (elem_min_one, elem) in block_minus_one.iter_mut().zip(block) {
+*elem_min_one = elem - 1;
+}
+&block_minus_one
+} else {
+block
+};
let num_bits = self.bitpacker.num_bits(block);
let written_size = self
.bitpacker
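For intuition on the minus-one encoding: term frequencies are always >= 1, and a very common case is a block where every frequency is exactly 1, which packs to 0 bits per value once shifted down. A minimal sketch, where bits_needed is a hypothetical stand-in for the bitpacker's num_bits, not part of this commit:

fn bits_needed(max: u32) -> u8 {
    (32 - max.leading_zeros()) as u8
}

fn main() {
    let freqs = [1u32; 128];
    // packed directly, each value needs 1 bit; minus-one encoded, 0 bits
    assert_eq!(bits_needed(*freqs.iter().max().unwrap()), 1);
    assert_eq!(bits_needed(freqs.iter().map(|v| v - 1).max().unwrap()), 0);
}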
@@ -71,21 +97,55 @@ impl BlockDecoder {
}
}
+/// Decompress a block of sorted integers.
+///
+/// `strict_delta` depends on what encoding was used. Older versions of tantivy never use
+/// strict deltas; newer versions always use them.
pub fn uncompress_block_sorted(
&mut self,
compressed_data: &[u8],
offset: u32,
num_bits: u8,
+strict_delta: bool,
) -> usize {
-self.output_len = COMPRESSION_BLOCK_SIZE;
-self.bitpacker
-.decompress_sorted(offset, compressed_data, &mut self.output, num_bits)
+if strict_delta {
+let offset = std::num::NonZeroU32::new(offset).map(std::num::NonZeroU32::get);
+self.output_len = COMPRESSION_BLOCK_SIZE;
+self.bitpacker.decompress_strictly_sorted(
+offset,
+compressed_data,
+&mut self.output,
+num_bits,
+)
+} else {
+self.output_len = COMPRESSION_BLOCK_SIZE;
+self.bitpacker
+.decompress_sorted(offset, compressed_data, &mut self.output, num_bits)
+}
}
-pub fn uncompress_block_unsorted(&mut self, compressed_data: &[u8], num_bits: u8) -> usize {
+/// Decompress a block of unsorted integers.
+///
+/// `minus_one_encoded` depends on what encoding was used. Older versions of tantivy never
+/// use that encoding; newer versions use it for some structures, but not all. See the
+/// corresponding call to `BlockEncoder::compress_block_unsorted`.
+pub fn uncompress_block_unsorted(
+&mut self,
+compressed_data: &[u8],
+num_bits: u8,
+minus_one_encoded: bool,
+) -> usize {
self.output_len = COMPRESSION_BLOCK_SIZE;
-self.bitpacker
-.decompress(compressed_data, &mut self.output, num_bits)
+let res = self
+.bitpacker
+.decompress(compressed_data, &mut self.output, num_bits);
+if minus_one_encoded {
+for val in &mut self.output {
+*val += 1;
+}
+}
+res
}
#[inline]
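The strictly sorted codec used above relies on doc ids within a posting list being strictly increasing: every delta between consecutive ids is >= 1, so it can be stored as delta - 1, and a run of consecutive doc ids then packs to 0 bits per value. A rough sketch of the decomposition, assuming a None offset means the first value is stored as-is (mirroring the zero-offset convention in compress_block_sorted above; this is not the bitpacking crate's exact layout):

fn strict_deltas(docs: &[u32], offset: Option<u32>) -> Vec<u32> {
    let mut prev = offset;
    docs.iter()
        .map(|&doc| {
            // the first value of the posting list is kept as-is; afterwards
            // store doc - prev - 1, which is always >= 0
            let delta = match prev {
                Some(p) => doc - p - 1,
                None => doc,
            };
            prev = Some(doc);
            delta
        })
        .collect()
}

fn main() {
    // consecutive doc ids compress to all zeros
    assert_eq!(strict_deltas(&[5, 6, 7, 8], Some(4)), vec![0, 0, 0, 0]);
    assert_eq!(strict_deltas(&[0, 3, 4], None), vec![0, 2, 0]);
}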
@@ -218,7 +278,8 @@ pub mod tests {
let (num_bits, compressed_data) = encoder.compress_block_sorted(&vals, 0);
let mut decoder = BlockDecoder::default();
{
-let consumed_num_bytes = decoder.uncompress_block_sorted(compressed_data, 0, num_bits);
+let consumed_num_bytes =
+decoder.uncompress_block_sorted(compressed_data, 0, num_bits, true);
assert_eq!(consumed_num_bytes, compressed_data.len());
}
for i in 0..128 {
@@ -233,7 +294,8 @@ pub mod tests {
let (num_bits, compressed_data) = encoder.compress_block_sorted(&vals, 10);
let mut decoder = BlockDecoder::default();
{
-let consumed_num_bytes = decoder.uncompress_block_sorted(compressed_data, 10, num_bits);
+let consumed_num_bytes =
+decoder.uncompress_block_sorted(compressed_data, 10, num_bits, true);
assert_eq!(consumed_num_bytes, compressed_data.len());
}
for i in 0..128 {
@@ -252,7 +314,8 @@ pub mod tests {
compressed.push(173u8);
let mut decoder = BlockDecoder::default();
{
-let consumed_num_bytes = decoder.uncompress_block_sorted(&compressed, 10, num_bits);
+let consumed_num_bytes =
+decoder.uncompress_block_sorted(&compressed, 10, num_bits, true);
assert_eq!(consumed_num_bytes, compressed.len() - 1);
assert_eq!(compressed[consumed_num_bytes], 173u8);
}
@@ -263,21 +326,25 @@ pub mod tests {
#[test]
fn test_encode_unsorted_block_with_junk() {
-let mut compressed: Vec<u8> = Vec::new();
-let n = 128;
-let vals: Vec<u32> = (0..n).map(|i| 11u32 + (i as u32) * 7u32 % 12).collect();
-let mut encoder = BlockEncoder::default();
-let (num_bits, compressed_data) = encoder.compress_block_unsorted(&vals);
-compressed.extend_from_slice(compressed_data);
-compressed.push(173u8);
-let mut decoder = BlockDecoder::default();
-{
-let consumed_num_bytes = decoder.uncompress_block_unsorted(&compressed, num_bits);
-assert_eq!(consumed_num_bytes + 1, compressed.len());
-assert_eq!(compressed[consumed_num_bytes], 173u8);
-}
-for i in 0..n {
-assert_eq!(vals[i], decoder.output(i));
+for minus_one_encode in [false, true] {
+let mut compressed: Vec<u8> = Vec::new();
+let n = 128;
+let vals: Vec<u32> = (0..n).map(|i| 11u32 + (i as u32) * 7u32 % 12).collect();
+let mut encoder = BlockEncoder::default();
+let (num_bits, compressed_data) =
+encoder.compress_block_unsorted(&vals, minus_one_encode);
+compressed.extend_from_slice(compressed_data);
+compressed.push(173u8);
+let mut decoder = BlockDecoder::default();
+{
+let consumed_num_bytes =
+decoder.uncompress_block_unsorted(&compressed, num_bits, minus_one_encode);
+assert_eq!(consumed_num_bytes + 1, compressed.len());
+assert_eq!(compressed[consumed_num_bytes], 173u8);
+}
+for i in 0..n {
+assert_eq!(vals[i], decoder.output(i));
+}
+}
}
@@ -344,7 +411,7 @@ mod bench {
let (num_bits, compressed) = encoder.compress_block_sorted(&data, 0u32);
let mut decoder = BlockDecoder::default();
b.iter(|| {
-decoder.uncompress_block_sorted(compressed, 0u32, num_bits);
+decoder.uncompress_block_sorted(compressed, 0u32, num_bits, true);
});
}


@@ -301,6 +301,7 @@ pub struct PostingsSerializer<W: Write> {
bm25_weight: Option<Bm25Weight>,
avg_fieldnorm: Score, /* Average number of terms in the field for that segment.
* This value is used to compute the block wand information. */
+term_has_freq: bool,
}
impl<W: Write> PostingsSerializer<W> {
@@ -325,13 +326,15 @@ impl<W: Write> PostingsSerializer<W> {
fieldnorm_reader,
bm25_weight: None,
avg_fieldnorm,
+term_has_freq: false,
}
}
pub fn new_term(&mut self, term_doc_freq: u32) {
self.bm25_weight = None;
-if !self.mode.has_freq() {
+self.term_has_freq = self.mode.has_freq() && term_doc_freq != 0;
+if !self.term_has_freq {
return;
}
@@ -365,10 +368,10 @@ impl<W: Write> PostingsSerializer<W> {
// last el block 0, offset block 1,
self.postings_write.extend(block_encoded);
}
-if self.mode.has_freq() {
+if self.term_has_freq {
let (num_bits, block_encoded): (u8, &[u8]) = self
.block_encoder
-.compress_block_unsorted(self.block.term_freqs());
+.compress_block_unsorted(self.block.term_freqs(), true);
self.postings_write.extend(block_encoded);
self.skip_write.write_term_freq(num_bits);
if self.mode.has_positions() {
@@ -432,7 +435,7 @@ impl<W: Write> PostingsSerializer<W> {
self.postings_write.write_all(block_encoded)?;
}
// ... Idem for term frequencies
-if self.mode.has_freq() {
+if self.term_has_freq {
let block_encoded = self
.block_encoder
.compress_vint_unsorted(self.block.term_freqs());


@@ -6,6 +6,22 @@ use crate::query::Bm25Weight;
use crate::schema::IndexRecordOption;
use crate::{DocId, Score, TERMINATED};
+// doc_num_bits uses the following encoding:
+// given 0b a b cdefgh
+//         |1|2|  3   |
+// - 1: unused
+// - 2: is delta-1 encoded; 0 if not, 1 if yes
+// - 3: a 6-bit number in 0..=32, the actual bitwidth
+fn encode_bitwidth(bitwidth: u8, delta_1: bool) -> u8 {
+bitwidth | ((delta_1 as u8) << 6)
+}
+fn decode_bitwidth(raw_bitwidth: u8) -> (u8, bool) {
+let delta_1 = (raw_bitwidth >> 6 & 1) != 0;
+let bitwidth = raw_bitwidth & 0x3f;
+(bitwidth, delta_1)
+}
#[inline]
fn encode_block_wand_max_tf(max_tf: u32) -> u8 {
max_tf.min(u8::MAX as u32) as u8
@@ -41,7 +57,7 @@ impl SkipSerializer {
pub fn write_doc(&mut self, last_doc: DocId, doc_num_bits: u8) {
write_u32(last_doc, &mut self.buffer);
-self.buffer.push(doc_num_bits);
+self.buffer.push(encode_bitwidth(doc_num_bits, true));
}
pub fn write_term_freq(&mut self, tf_num_bits: u8) {
@@ -85,6 +101,7 @@ pub(crate) struct SkipReader {
pub(crate) enum BlockInfo {
BitPacked {
doc_num_bits: u8,
+strict_delta_encoded: bool,
tf_num_bits: u8,
tf_sum: u32,
block_wand_fieldnorm_id: u8,
@@ -172,12 +189,13 @@ impl SkipReader {
let bytes = self.owned_read.as_slice();
let advance_len: usize;
self.last_doc_in_block = read_u32(bytes);
-let doc_num_bits = bytes[4];
+let (doc_num_bits, strict_delta_encoded) = decode_bitwidth(bytes[4]);
match self.skip_info {
IndexRecordOption::Basic => {
advance_len = 5;
self.block_info = BlockInfo::BitPacked {
doc_num_bits,
+strict_delta_encoded,
tf_num_bits: 0,
tf_sum: 0,
block_wand_fieldnorm_id: 0,
@@ -191,6 +209,7 @@ impl SkipReader {
advance_len = 8;
self.block_info = BlockInfo::BitPacked {
doc_num_bits,
+strict_delta_encoded,
tf_num_bits,
tf_sum: 0,
block_wand_fieldnorm_id,
@@ -205,6 +224,7 @@ impl SkipReader {
advance_len = 12;
self.block_info = BlockInfo::BitPacked {
doc_num_bits,
+strict_delta_encoded,
tf_num_bits,
tf_sum,
block_wand_fieldnorm_id,
@@ -268,7 +288,9 @@ impl SkipReader {
#[cfg(test)]
mod tests {
-use super::{BlockInfo, IndexRecordOption, SkipReader, SkipSerializer};
+use super::{
+decode_bitwidth, encode_bitwidth, BlockInfo, IndexRecordOption, SkipReader, SkipSerializer,
+};
use crate::directory::OwnedBytes;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
@@ -310,6 +332,7 @@ mod tests {
skip_reader.block_info,
BlockInfo::BitPacked {
doc_num_bits: 2u8,
+strict_delta_encoded: true,
tf_num_bits: 3u8,
tf_sum: 0,
block_wand_fieldnorm_id: 13,
@@ -322,6 +345,7 @@ mod tests {
skip_reader.block_info(),
BlockInfo::BitPacked {
doc_num_bits: 5u8,
+strict_delta_encoded: true,
tf_num_bits: 2u8,
tf_sum: 0,
block_wand_fieldnorm_id: 8,
@@ -352,6 +376,7 @@ mod tests {
skip_reader.block_info(),
BlockInfo::BitPacked {
doc_num_bits: 2u8,
+strict_delta_encoded: true,
tf_num_bits: 0,
tf_sum: 0u32,
block_wand_fieldnorm_id: 0,
@@ -364,6 +389,7 @@ mod tests {
skip_reader.block_info(),
BlockInfo::BitPacked {
doc_num_bits: 5u8,
+strict_delta_encoded: true,
tf_num_bits: 0,
tf_sum: 0u32,
block_wand_fieldnorm_id: 0,
@@ -393,6 +419,7 @@ mod tests {
skip_reader.block_info(),
BlockInfo::BitPacked {
doc_num_bits: 2u8,
+strict_delta_encoded: true,
tf_num_bits: 0,
tf_sum: 0u32,
block_wand_fieldnorm_id: 0,
@@ -402,4 +429,18 @@ mod tests {
skip_reader.advance();
assert_eq!(skip_reader.block_info(), BlockInfo::VInt { num_docs: 0u32 });
}
+#[test]
+fn test_encode_decode_bitwidth() {
+for bitwidth in 0..=32 {
+for delta_1 in [false, true] {
+assert_eq!(
+(bitwidth, delta_1),
+decode_bitwidth(encode_bitwidth(bitwidth, delta_1))
+);
+}
+}
+assert_eq!(0b01000010, encode_bitwidth(0b10, true));
+assert_eq!(0b00000010, encode_bitwidth(0b10, false));
+}
}


@@ -93,7 +93,7 @@ impl TermScorer {
}
pub fn last_doc_in_block(&self) -> DocId {
-self.postings.block_cursor.skip_reader.last_doc_in_block()
+self.postings.block_cursor.skip_reader().last_doc_in_block()
}
}