diff --git a/CHANGELOG.md b/CHANGELOG.md index 2be1c1319..cd7d663fe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ Tantivy 0.14.0 - Bugfix in `Query::explain` - Removed dependency on `notify` #924. Replaced with `FileWatcher` struct that polls meta file every 500ms in background thread. (@halvorboe @guilload) - Added `FilterCollector`, which wraps another collector and filters docs using a predicate over a fast field (@barrotsteindev) +- Simplified the encoding of the skip reader struct. BlockWAND max tf is now encoded over a single byte. (@pmasurel) + +This version breaks compatibility and requires users to reindex everything. + Tantivy 0.13.2 =================== diff --git a/src/directory/footer.rs b/src/directory/footer.rs index 3a696896f..b2f495f6c 100644 --- a/src/directory/footer.rs +++ b/src/directory/footer.rs @@ -115,6 +115,18 @@ impl Footer { } Ok(()) } + VersionedFooter::V3 { + crc32: _crc, + store_compression, + } => { + if &library_version.store_compression != store_compression { + return Err(Incompatibility::CompressionMismatch { + library_compression_format: library_version.store_compression.to_string(), + index_compression_format: store_compression.to_string(), + }); + } + Ok(()) + } VersionedFooter::UnknownVersion => Err(Incompatibility::IndexMismatch { library_version: library_version.clone(), index_version: self.version.clone(), @@ -136,24 +148,31 @@ pub enum VersionedFooter { crc32: CrcHashU32, store_compression: String, }, + // Block wand max term freq on 1 byte + V3 { + crc32: CrcHashU32, + store_compression: String, + }, } impl BinarySerializable for VersionedFooter { fn serialize(&self, writer: &mut W) -> io::Result<()> { let mut buf = Vec::new(); match self { - VersionedFooter::V2 { + VersionedFooter::V3 { crc32, store_compression: compression, } => { // Serializes a valid `VersionedFooter` or panics if the version is unknown // [ version | crc_hash | compression_mode ] // [ 0..4 | 4..8 | variable ] - 
BinarySerializable::serialize(&2u32, &mut buf)?; + BinarySerializable::serialize(&3u32, &mut buf)?; BinarySerializable::serialize(crc32, &mut buf)?; BinarySerializable::serialize(compression, &mut buf)?; } - VersionedFooter::V1 { .. } | VersionedFooter::UnknownVersion => { + VersionedFooter::V2 { .. } + | VersionedFooter::V1 { .. } + | VersionedFooter::UnknownVersion => { return Err(io::Error::new( io::ErrorKind::InvalidInput, "Cannot serialize an unknown versioned footer ", @@ -182,7 +201,7 @@ impl BinarySerializable for VersionedFooter { reader.read_exact(&mut buf[..])?; let mut cursor = &buf[..]; let version = u32::deserialize(&mut cursor)?; - if version != 1 && version != 2 { + if version == 0 || version > 3 { return Ok(VersionedFooter::UnknownVersion); } let crc32 = u32::deserialize(&mut cursor)?; @@ -192,12 +211,17 @@ impl BinarySerializable for VersionedFooter { crc32, store_compression, } - } else { - assert_eq!(version, 2); + } else if version == 2 { VersionedFooter::V2 { crc32, store_compression, } + } else { + assert_eq!(version, 3); + VersionedFooter::V3 { + crc32, + store_compression, + } }) } } @@ -205,6 +229,7 @@ impl BinarySerializable for VersionedFooter { impl VersionedFooter { pub fn crc(&self) -> Option { match self { + VersionedFooter::V3 { crc32, .. } => Some(*crc32), VersionedFooter::V2 { crc32, .. } => Some(*crc32), VersionedFooter::V1 { crc32, .. } => Some(*crc32), VersionedFooter::UnknownVersion { .. } => None, @@ -243,7 +268,7 @@ impl Write for FooterProxy { impl TerminatingWrite for FooterProxy { fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> { let crc32 = self.hasher.take().unwrap().finalize(); - let footer = Footer::new(VersionedFooter::V2 { + let footer = Footer::new(VersionedFooter::V3 { crc32, store_compression: crate::store::COMPRESSION.to_string(), }); @@ -278,7 +303,7 @@ mod tests { let footer = Footer::deserialize(&mut &vec[..]).unwrap(); assert!(matches!( footer.versioned_footer, - VersionedFooter::V2 { store_compression, .. 
} + VersionedFooter::V3 { store_compression, .. } if store_compression == crate::store::COMPRESSION )); assert_eq!(&footer.version, crate::version()); @@ -288,7 +313,7 @@ mod tests { fn test_serialize_deserialize_footer() { let mut buffer = Vec::new(); let crc32 = 123456u32; - let footer: Footer = Footer::new(VersionedFooter::V2 { + let footer: Footer = Footer::new(VersionedFooter::V3 { crc32, store_compression: "lz4".to_string(), }); @@ -300,7 +325,7 @@ mod tests { #[test] fn footer_length() { let crc32 = 1111111u32; - let versioned_footer = VersionedFooter::V2 { + let versioned_footer = VersionedFooter::V3 { crc32, store_compression: "lz4".to_string(), }; @@ -321,7 +346,7 @@ mod tests { // versionned footer length 12 | 128, // index format version - 2, + 3, 0, 0, 0, @@ -340,7 +365,7 @@ mod tests { let versioned_footer = VersionedFooter::deserialize(&mut cursor).unwrap(); assert!(cursor.is_empty()); let expected_crc: u32 = LittleEndian::read_u32(&v_footer_bytes[5..9]) as CrcHashU32; - let expected_versioned_footer: VersionedFooter = VersionedFooter::V2 { + let expected_versioned_footer: VersionedFooter = VersionedFooter::V3 { crc32: expected_crc, store_compression: "lz4".to_string(), }; diff --git a/src/lib.rs b/src/lib.rs index f66b54712..33baf80d7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -174,7 +174,7 @@ use once_cell::sync::Lazy; use serde::{Deserialize, Serialize}; /// Index format version. -const INDEX_FORMAT_VERSION: u32 = 2; +const INDEX_FORMAT_VERSION: u32 = 3; /// Structure version for the index. 
#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)] diff --git a/src/postings/skip.rs b/src/postings/skip.rs index 0f90beff9..8d4310eb2 100644 --- a/src/postings/skip.rs +++ b/src/postings/skip.rs @@ -1,32 +1,46 @@ -use crate::common::{read_u32_vint_no_advance, serialize_vint_u32, BinarySerializable}; +use std::convert::TryInto; + use crate::directory::OwnedBytes; use crate::postings::compression::{compressed_block_size, COMPRESSION_BLOCK_SIZE}; use crate::query::BM25Weight; use crate::schema::IndexRecordOption; use crate::{DocId, Score, TERMINATED}; +#[inline(always)] +fn encode_block_wand_max_tf(max_tf: u32) -> u8 { + max_tf.min(u8::MAX as u32) as u8 +} + +#[inline(always)] +fn decode_block_wand_max_tf(max_tf_code: u8) -> u32 { + if max_tf_code == u8::MAX { + u32::MAX + } else { + max_tf_code as u32 + } +} + +#[inline(always)] +fn read_u32(data: &[u8]) -> u32 { + u32::from_le_bytes(data[..4].try_into().unwrap()) +} + +#[inline(always)] +fn write_u32(val: u32, buf: &mut Vec) { + buf.extend_from_slice(&val.to_le_bytes()); +} + pub struct SkipSerializer { buffer: Vec, - prev_doc: DocId, } impl SkipSerializer { pub fn new() -> SkipSerializer { - SkipSerializer { - buffer: Vec::new(), - prev_doc: 0u32, - } + SkipSerializer { buffer: Vec::new() } } pub fn write_doc(&mut self, last_doc: DocId, doc_num_bits: u8) { - assert!( - last_doc > self.prev_doc, - "write_doc(...) called with non-increasing doc ids. \ - Did you forget to call clear maybe?" 
- ); - let delta_doc = last_doc - self.prev_doc; - self.prev_doc = last_doc; - delta_doc.serialize(&mut self.buffer).unwrap(); + write_u32(last_doc, &mut self.buffer); self.buffer.push(doc_num_bits); } @@ -35,16 +49,13 @@ impl SkipSerializer { } pub fn write_total_term_freq(&mut self, tf_sum: u32) { - tf_sum - .serialize(&mut self.buffer) - .expect("Should never fail"); + write_u32(tf_sum, &mut self.buffer); } pub fn write_blockwand_max(&mut self, fieldnorm_id: u8, term_freq: u32) { - self.buffer.push(fieldnorm_id); - let mut buf = [0u8; 8]; - let bytes = serialize_vint_u32(term_freq, &mut buf); - self.buffer.extend_from_slice(bytes); + let block_wand_tf = encode_block_wand_max_tf(term_freq); + self.buffer + .extend_from_slice(&[fieldnorm_id, block_wand_tf]); } pub fn data(&self) -> &[u8] { @@ -52,7 +63,6 @@ impl SkipSerializer { } pub fn clear(&mut self) { - self.prev_doc = 0u32; self.buffer.clear(); } } @@ -159,18 +169,13 @@ impl SkipReader { } fn read_block_info(&mut self) { - let doc_delta = { - let bytes = self.owned_read.as_slice(); - let mut buf = [0; 4]; - buf.copy_from_slice(&bytes[..4]); - u32::from_le_bytes(buf) - }; - self.last_doc_in_block += doc_delta as DocId; - let doc_num_bits = self.owned_read.as_slice()[4]; - + let bytes = self.owned_read.as_slice(); + let advance_len: usize; + self.last_doc_in_block = read_u32(bytes); + let doc_num_bits = bytes[4]; match self.skip_info { IndexRecordOption::Basic => { - self.owned_read.advance(5); + advance_len = 5; self.block_info = BlockInfo::BitPacked { doc_num_bits, tf_num_bits: 0, @@ -180,11 +185,10 @@ impl SkipReader { }; } IndexRecordOption::WithFreqs => { - let bytes = self.owned_read.as_slice(); let tf_num_bits = bytes[5]; let block_wand_fieldnorm_id = bytes[6]; - let (block_wand_term_freq, num_bytes) = read_u32_vint_no_advance(&bytes[7..]); - self.owned_read.advance(7 + num_bytes); + let block_wand_term_freq = decode_block_wand_max_tf(bytes[7]); + advance_len = 8; self.block_info = BlockInfo::BitPacked 
{ doc_num_bits, tf_num_bits, @@ -194,16 +198,11 @@ impl SkipReader { }; } IndexRecordOption::WithFreqsAndPositions => { - let bytes = self.owned_read.as_slice(); let tf_num_bits = bytes[5]; - let tf_sum = { - let mut buf = [0; 4]; - buf.copy_from_slice(&bytes[6..10]); - u32::from_le_bytes(buf) - }; + let tf_sum = read_u32(&bytes[6..10]); let block_wand_fieldnorm_id = bytes[10]; - let (block_wand_term_freq, num_bytes) = read_u32_vint_no_advance(&bytes[11..]); - self.owned_read.advance(11 + num_bytes); + let block_wand_term_freq = decode_block_wand_max_tf(bytes[11]); + advance_len = 12; self.block_info = BlockInfo::BitPacked { doc_num_bits, tf_num_bits, @@ -213,6 +212,7 @@ impl SkipReader { }; } } + self.owned_read.advance(advance_len); } pub fn block_info(&self) -> BlockInfo { @@ -274,6 +274,24 @@ mod tests { use crate::directory::OwnedBytes; use crate::postings::compression::COMPRESSION_BLOCK_SIZE; + #[test] + fn test_encode_block_wand_max_tf() { + for tf in 0..255 { + assert_eq!(super::encode_block_wand_max_tf(tf), tf as u8); + } + for &tf in &[255, 256, 1_000_000, u32::MAX] { + assert_eq!(super::encode_block_wand_max_tf(tf), 255); + } + } + + #[test] + fn test_decode_block_wand_max_tf() { + for tf in 0..255 { + assert_eq!(super::decode_block_wand_max_tf(tf), tf as u32); + } + assert_eq!(super::decode_block_wand_max_tf(255), u32::MAX); + } + #[test] fn test_skip_with_freq() { let buf = {