mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-07 09:32:54 +00:00
Encode blockwand on a single byte.
This commit is contained in:
@@ -9,6 +9,10 @@ Tantivy 0.14.0
|
||||
- Bugfix in `Query::explain`
|
||||
- Removed dependency on `notify` #924. Replaced with `FileWatcher` struct that polls meta file every 500ms in background thread. (@halvorboe @guilload)
|
||||
- Added `FilterCollector`, which wraps another collector and filters docs using a predicate over a fast field (@barrotsteindev)
|
||||
- Simplified the encoding of the skip reader struct. BlockWAND max tf is now encoded over a single byte. (@pmasurel)
|
||||
|
||||
This version breaks compatibility and requires users to reindex everything.
|
||||
|
||||
|
||||
Tantivy 0.13.2
|
||||
===================
|
||||
|
||||
@@ -115,6 +115,18 @@ impl Footer {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
VersionedFooter::V3 {
|
||||
crc32: _crc,
|
||||
store_compression,
|
||||
} => {
|
||||
if &library_version.store_compression != store_compression {
|
||||
return Err(Incompatibility::CompressionMismatch {
|
||||
library_compression_format: library_version.store_compression.to_string(),
|
||||
index_compression_format: store_compression.to_string(),
|
||||
});
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
VersionedFooter::UnknownVersion => Err(Incompatibility::IndexMismatch {
|
||||
library_version: library_version.clone(),
|
||||
index_version: self.version.clone(),
|
||||
@@ -136,24 +148,31 @@ pub enum VersionedFooter {
|
||||
crc32: CrcHashU32,
|
||||
store_compression: String,
|
||||
},
|
||||
// Block wand max termfred on 1 byte
|
||||
V3 {
|
||||
crc32: CrcHashU32,
|
||||
store_compression: String,
|
||||
},
|
||||
}
|
||||
|
||||
impl BinarySerializable for VersionedFooter {
|
||||
fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
let mut buf = Vec::new();
|
||||
match self {
|
||||
VersionedFooter::V2 {
|
||||
VersionedFooter::V3 {
|
||||
crc32,
|
||||
store_compression: compression,
|
||||
} => {
|
||||
// Serializes a valid `VersionedFooter` or panics if the version is unknown
|
||||
// [ version | crc_hash | compression_mode ]
|
||||
// [ 0..4 | 4..8 | variable ]
|
||||
BinarySerializable::serialize(&2u32, &mut buf)?;
|
||||
BinarySerializable::serialize(&3u32, &mut buf)?;
|
||||
BinarySerializable::serialize(crc32, &mut buf)?;
|
||||
BinarySerializable::serialize(compression, &mut buf)?;
|
||||
}
|
||||
VersionedFooter::V1 { .. } | VersionedFooter::UnknownVersion => {
|
||||
VersionedFooter::V2 { .. }
|
||||
| VersionedFooter::V1 { .. }
|
||||
| VersionedFooter::UnknownVersion => {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"Cannot serialize an unknown versioned footer ",
|
||||
@@ -182,7 +201,7 @@ impl BinarySerializable for VersionedFooter {
|
||||
reader.read_exact(&mut buf[..])?;
|
||||
let mut cursor = &buf[..];
|
||||
let version = u32::deserialize(&mut cursor)?;
|
||||
if version != 1 && version != 2 {
|
||||
if version > 3 {
|
||||
return Ok(VersionedFooter::UnknownVersion);
|
||||
}
|
||||
let crc32 = u32::deserialize(&mut cursor)?;
|
||||
@@ -192,12 +211,17 @@ impl BinarySerializable for VersionedFooter {
|
||||
crc32,
|
||||
store_compression,
|
||||
}
|
||||
} else {
|
||||
assert_eq!(version, 2);
|
||||
} else if version == 2 {
|
||||
VersionedFooter::V2 {
|
||||
crc32,
|
||||
store_compression,
|
||||
}
|
||||
} else {
|
||||
assert_eq!(version, 3);
|
||||
VersionedFooter::V3 {
|
||||
crc32,
|
||||
store_compression,
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -205,6 +229,7 @@ impl BinarySerializable for VersionedFooter {
|
||||
impl VersionedFooter {
|
||||
pub fn crc(&self) -> Option<CrcHashU32> {
|
||||
match self {
|
||||
VersionedFooter::V3 { crc32, .. } => Some(*crc32),
|
||||
VersionedFooter::V2 { crc32, .. } => Some(*crc32),
|
||||
VersionedFooter::V1 { crc32, .. } => Some(*crc32),
|
||||
VersionedFooter::UnknownVersion { .. } => None,
|
||||
@@ -243,7 +268,7 @@ impl<W: TerminatingWrite> Write for FooterProxy<W> {
|
||||
impl<W: TerminatingWrite> TerminatingWrite for FooterProxy<W> {
|
||||
fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> {
|
||||
let crc32 = self.hasher.take().unwrap().finalize();
|
||||
let footer = Footer::new(VersionedFooter::V2 {
|
||||
let footer = Footer::new(VersionedFooter::V3 {
|
||||
crc32,
|
||||
store_compression: crate::store::COMPRESSION.to_string(),
|
||||
});
|
||||
@@ -278,7 +303,7 @@ mod tests {
|
||||
let footer = Footer::deserialize(&mut &vec[..]).unwrap();
|
||||
assert!(matches!(
|
||||
footer.versioned_footer,
|
||||
VersionedFooter::V2 { store_compression, .. }
|
||||
VersionedFooter::V3 { store_compression, .. }
|
||||
if store_compression == crate::store::COMPRESSION
|
||||
));
|
||||
assert_eq!(&footer.version, crate::version());
|
||||
@@ -288,7 +313,7 @@ mod tests {
|
||||
fn test_serialize_deserialize_footer() {
|
||||
let mut buffer = Vec::new();
|
||||
let crc32 = 123456u32;
|
||||
let footer: Footer = Footer::new(VersionedFooter::V2 {
|
||||
let footer: Footer = Footer::new(VersionedFooter::V3 {
|
||||
crc32,
|
||||
store_compression: "lz4".to_string(),
|
||||
});
|
||||
@@ -300,7 +325,7 @@ mod tests {
|
||||
#[test]
|
||||
fn footer_length() {
|
||||
let crc32 = 1111111u32;
|
||||
let versioned_footer = VersionedFooter::V2 {
|
||||
let versioned_footer = VersionedFooter::V3 {
|
||||
crc32,
|
||||
store_compression: "lz4".to_string(),
|
||||
};
|
||||
@@ -321,7 +346,7 @@ mod tests {
|
||||
// versionned footer length
|
||||
12 | 128,
|
||||
// index format version
|
||||
2,
|
||||
3,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
@@ -340,7 +365,7 @@ mod tests {
|
||||
let versioned_footer = VersionedFooter::deserialize(&mut cursor).unwrap();
|
||||
assert!(cursor.is_empty());
|
||||
let expected_crc: u32 = LittleEndian::read_u32(&v_footer_bytes[5..9]) as CrcHashU32;
|
||||
let expected_versioned_footer: VersionedFooter = VersionedFooter::V2 {
|
||||
let expected_versioned_footer: VersionedFooter = VersionedFooter::V3 {
|
||||
crc32: expected_crc,
|
||||
store_compression: "lz4".to_string(),
|
||||
};
|
||||
|
||||
@@ -174,7 +174,7 @@ use once_cell::sync::Lazy;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Index format version.
|
||||
const INDEX_FORMAT_VERSION: u32 = 2;
|
||||
const INDEX_FORMAT_VERSION: u32 = 3;
|
||||
|
||||
/// Structure version for the index.
|
||||
#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
|
||||
@@ -1,32 +1,46 @@
|
||||
use crate::common::{read_u32_vint_no_advance, serialize_vint_u32, BinarySerializable};
|
||||
use std::convert::TryInto;
|
||||
|
||||
use crate::directory::OwnedBytes;
|
||||
use crate::postings::compression::{compressed_block_size, COMPRESSION_BLOCK_SIZE};
|
||||
use crate::query::BM25Weight;
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::{DocId, Score, TERMINATED};
|
||||
|
||||
#[inline(always)]
|
||||
fn encode_block_wand_max_tf(max_tf: u32) -> u8 {
|
||||
max_tf.min(u8::MAX as u32) as u8
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn decode_block_wand_max_tf(max_tf_code: u8) -> u32 {
|
||||
if max_tf_code == u8::MAX {
|
||||
u32::MAX
|
||||
} else {
|
||||
max_tf_code as u32
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn read_u32(data: &[u8]) -> u32 {
|
||||
u32::from_le_bytes(data[..4].try_into().unwrap())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn write_u32(val: u32, buf: &mut Vec<u8>) {
|
||||
buf.extend_from_slice(&val.to_le_bytes());
|
||||
}
|
||||
|
||||
pub struct SkipSerializer {
|
||||
buffer: Vec<u8>,
|
||||
prev_doc: DocId,
|
||||
}
|
||||
|
||||
impl SkipSerializer {
|
||||
pub fn new() -> SkipSerializer {
|
||||
SkipSerializer {
|
||||
buffer: Vec::new(),
|
||||
prev_doc: 0u32,
|
||||
}
|
||||
SkipSerializer { buffer: Vec::new() }
|
||||
}
|
||||
|
||||
pub fn write_doc(&mut self, last_doc: DocId, doc_num_bits: u8) {
|
||||
assert!(
|
||||
last_doc > self.prev_doc,
|
||||
"write_doc(...) called with non-increasing doc ids. \
|
||||
Did you forget to call clear maybe?"
|
||||
);
|
||||
let delta_doc = last_doc - self.prev_doc;
|
||||
self.prev_doc = last_doc;
|
||||
delta_doc.serialize(&mut self.buffer).unwrap();
|
||||
write_u32(last_doc, &mut self.buffer);
|
||||
self.buffer.push(doc_num_bits);
|
||||
}
|
||||
|
||||
@@ -35,16 +49,13 @@ impl SkipSerializer {
|
||||
}
|
||||
|
||||
pub fn write_total_term_freq(&mut self, tf_sum: u32) {
|
||||
tf_sum
|
||||
.serialize(&mut self.buffer)
|
||||
.expect("Should never fail");
|
||||
write_u32(tf_sum, &mut self.buffer);
|
||||
}
|
||||
|
||||
pub fn write_blockwand_max(&mut self, fieldnorm_id: u8, term_freq: u32) {
|
||||
self.buffer.push(fieldnorm_id);
|
||||
let mut buf = [0u8; 8];
|
||||
let bytes = serialize_vint_u32(term_freq, &mut buf);
|
||||
self.buffer.extend_from_slice(bytes);
|
||||
let block_wand_tf = encode_block_wand_max_tf(term_freq);
|
||||
self.buffer
|
||||
.extend_from_slice(&[fieldnorm_id, block_wand_tf]);
|
||||
}
|
||||
|
||||
pub fn data(&self) -> &[u8] {
|
||||
@@ -52,7 +63,6 @@ impl SkipSerializer {
|
||||
}
|
||||
|
||||
pub fn clear(&mut self) {
|
||||
self.prev_doc = 0u32;
|
||||
self.buffer.clear();
|
||||
}
|
||||
}
|
||||
@@ -159,18 +169,13 @@ impl SkipReader {
|
||||
}
|
||||
|
||||
fn read_block_info(&mut self) {
|
||||
let doc_delta = {
|
||||
let bytes = self.owned_read.as_slice();
|
||||
let mut buf = [0; 4];
|
||||
buf.copy_from_slice(&bytes[..4]);
|
||||
u32::from_le_bytes(buf)
|
||||
};
|
||||
self.last_doc_in_block += doc_delta as DocId;
|
||||
let doc_num_bits = self.owned_read.as_slice()[4];
|
||||
|
||||
let bytes = self.owned_read.as_slice();
|
||||
let advance_len: usize;
|
||||
self.last_doc_in_block = read_u32(bytes);
|
||||
let doc_num_bits = bytes[4];
|
||||
match self.skip_info {
|
||||
IndexRecordOption::Basic => {
|
||||
self.owned_read.advance(5);
|
||||
advance_len = 5;
|
||||
self.block_info = BlockInfo::BitPacked {
|
||||
doc_num_bits,
|
||||
tf_num_bits: 0,
|
||||
@@ -180,11 +185,10 @@ impl SkipReader {
|
||||
};
|
||||
}
|
||||
IndexRecordOption::WithFreqs => {
|
||||
let bytes = self.owned_read.as_slice();
|
||||
let tf_num_bits = bytes[5];
|
||||
let block_wand_fieldnorm_id = bytes[6];
|
||||
let (block_wand_term_freq, num_bytes) = read_u32_vint_no_advance(&bytes[7..]);
|
||||
self.owned_read.advance(7 + num_bytes);
|
||||
let block_wand_term_freq = decode_block_wand_max_tf(bytes[7]);
|
||||
advance_len = 8;
|
||||
self.block_info = BlockInfo::BitPacked {
|
||||
doc_num_bits,
|
||||
tf_num_bits,
|
||||
@@ -194,16 +198,11 @@ impl SkipReader {
|
||||
};
|
||||
}
|
||||
IndexRecordOption::WithFreqsAndPositions => {
|
||||
let bytes = self.owned_read.as_slice();
|
||||
let tf_num_bits = bytes[5];
|
||||
let tf_sum = {
|
||||
let mut buf = [0; 4];
|
||||
buf.copy_from_slice(&bytes[6..10]);
|
||||
u32::from_le_bytes(buf)
|
||||
};
|
||||
let tf_sum = read_u32(&bytes[6..10]);
|
||||
let block_wand_fieldnorm_id = bytes[10];
|
||||
let (block_wand_term_freq, num_bytes) = read_u32_vint_no_advance(&bytes[11..]);
|
||||
self.owned_read.advance(11 + num_bytes);
|
||||
let block_wand_term_freq = decode_block_wand_max_tf(bytes[11]);
|
||||
advance_len = 12;
|
||||
self.block_info = BlockInfo::BitPacked {
|
||||
doc_num_bits,
|
||||
tf_num_bits,
|
||||
@@ -213,6 +212,7 @@ impl SkipReader {
|
||||
};
|
||||
}
|
||||
}
|
||||
self.owned_read.advance(advance_len);
|
||||
}
|
||||
|
||||
pub fn block_info(&self) -> BlockInfo {
|
||||
@@ -274,6 +274,24 @@ mod tests {
|
||||
use crate::directory::OwnedBytes;
|
||||
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
|
||||
|
||||
#[test]
|
||||
fn test_encode_block_wand_max_tf() {
|
||||
for tf in 0..255 {
|
||||
assert_eq!(super::encode_block_wand_max_tf(tf), tf as u8);
|
||||
}
|
||||
for &tf in &[255, 256, 1_000_000, u32::MAX] {
|
||||
assert_eq!(super::encode_block_wand_max_tf(tf), 255);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_decode_block_wand_max_tf() {
|
||||
for tf in 0..255 {
|
||||
assert_eq!(super::decode_block_wand_max_tf(tf), tf as u32);
|
||||
}
|
||||
assert_eq!(super::decode_block_wand_max_tf(255), u32::MAX);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_skip_with_freq() {
|
||||
let buf = {
|
||||
|
||||
Reference in New Issue
Block a user