From a00049b879eca0a5dbe1f3c53442da92f99b4902 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 16 Apr 2021 15:13:46 +0200 Subject: [PATCH] add lz4 block format compressor as default docstore compressor add lz4 block compressor using lz4_flex, add lz4-block-compression feature flag add snappy-compression feature flag for snap compressor, make snap crate optional set lz4-block-compression as default feature flag --- Cargo.toml | 7 ++++-- src/directory/footer.rs | 4 +++- src/store/compression_lz4_block.rs | 38 ++++++++++++++++++++++++++++++ src/store/mod.rs | 37 ++++++++++++++++++++++++++--- src/store/reader.rs | 2 +- 5 files changed, 81 insertions(+), 7 deletions(-) create mode 100644 src/store/compression_lz4_block.rs diff --git a/Cargo.toml b/Cargo.toml index 0d9eb85c7..9ab68aa5e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,9 +20,10 @@ once_cell = "1" regex ={version = "1", default-features = false, features = ["std"]} tantivy-fst = "0.3" memmap = {version = "0.7", optional=true} +lz4_flex = { version = "0.7", default-features = false, features = ["checked-decode"], optional=true } lz4 = {version="1", optional=true} brotli = {version="3.3.0", optional=true} -snap = "1" +snap = {version="1.0", optional=true} tempfile = {version="3", optional=true} log = "0.4" serde = {version="1", features=["derive"]} @@ -74,10 +75,12 @@ debug-assertions = true overflow-checks = true [features] -default = ["mmap"] +default = ["mmap", "lz4-block-compression" ] mmap = ["fs2", "tempfile", "memmap"] brotli-compression = ["brotli"] lz4-compression = ["lz4"] +lz4-block-compression = ["lz4_flex"] +snappy-compression = ["snap"] failpoints = ["fail/failpoints"] unstable = [] # useful for benches. 
wasm-bindgen = ["uuid/wasm-bindgen"] diff --git a/src/directory/footer.rs b/src/directory/footer.rs index b2f495f6c..10b8b6f15 100644 --- a/src/directory/footer.rs +++ b/src/directory/footer.rs @@ -297,8 +297,10 @@ mod tests { assert!(footer_proxy.terminate().is_ok()); if crate::store::COMPRESSION == "lz4" { assert_eq!(vec.len(), 158); - } else { + } else if crate::store::COMPRESSION == "snappy" { assert_eq!(vec.len(), 167); + } else if crate::store::COMPRESSION == "lz4_block" { + assert_eq!(vec.len(), 176); } let footer = Footer::deserialize(&mut &vec[..]).unwrap(); assert!(matches!( diff --git a/src/store/compression_lz4_block.rs b/src/store/compression_lz4_block.rs new file mode 100644 index 000000000..4e223e2b7 --- /dev/null +++ b/src/store/compression_lz4_block.rs @@ -0,0 +1,38 @@ +use std::io::{self}; + +use core::convert::TryInto; +use lz4_flex::{compress_into, decompress_into}; +/// Name of the compression scheme used in the doc store. +/// +/// This name is appended to the version string of tantivy. +pub const COMPRESSION: &str = "lz4_block"; + +pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> { + compressed.clear(); + + compressed.extend_from_slice(&[0, 0, 0, 0]); + compress_into(uncompressed, compressed); + let size: u32 = uncompressed.len().try_into().map_err(|_| io::ErrorKind::InvalidInput)?; + compressed[0] = size as u8; + compressed[1] = (size >> 8) as u8; + compressed[2] = (size >> 16) as u8; + compressed[3] = (size >> 24) as u8; + Ok(()) +} + +pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> { + decompressed.clear(); + //next lz4_flex version will support slice as input parameter. 
+ //this will make the usage much less ugly + let size = compressed.get(..4).ok_or(io::ErrorKind::InvalidData)?; + let size: &[u8; 4] = size.try_into().unwrap(); + let uncompressed_size = u32::from_le_bytes(*size) as usize; + // reserve more than required, because blocked writes may write out of bounds, will be improved + // with lz4_flex 1.0 + decompressed.reserve(uncompressed_size + 4 + 24); + unsafe { + decompressed.set_len(uncompressed_size); + } + decompress_into(&compressed[4..], decompressed).map_err(|_err| io::ErrorKind::InvalidData)?; + Ok(()) +} diff --git a/src/store/mod.rs b/src/store/mod.rs index 6eff6ddd7..e419c99f0 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -39,9 +39,40 @@ mod writer; pub use self::reader::StoreReader; pub use self::writer::StoreWriter; +// compile_error doesn't scale very well, enum like feature flags would be great to have in Rust #[cfg(all(feature = "lz4", feature = "brotli"))] compile_error!("feature `lz4` or `brotli` must not be enabled together."); +#[cfg(all(feature = "lz4_flex", feature = "brotli"))] +compile_error!("feature `lz4_flex` or `brotli` must not be enabled together."); + +#[cfg(all(feature = "lz4_flex", feature = "lz4"))] +compile_error!("feature `lz4_flex` or `lz4` must not be enabled together."); + +#[cfg(all(feature = "lz4_flex", feature = "snap"))] +compile_error!("feature `lz4_flex` or `snap` must not be enabled together."); + +#[cfg(all(feature = "lz4", feature = "snap"))] +compile_error!("feature `lz4` or `snap` must not be enabled together."); + +#[cfg(all(feature = "brotli", feature = "snap"))] +compile_error!("feature `brotli` or `snap` must not be enabled together."); + +#[cfg(not(any( + feature = "lz4", + feature = "brotli", + feature = "lz4_flex", + feature = "snap" +)))] +compile_error!("all compressors are deactivated via feature-flags, check Cargo.toml for available decompressors."); + +#[cfg(feature = "lz4_flex")] +mod compression_lz4_block; +#[cfg(feature = "lz4_flex")] +pub use 
self::compression_lz4_block::COMPRESSION; +#[cfg(feature = "lz4_flex")] +use self::compression_lz4_block::{compress, decompress}; + #[cfg(feature = "lz4")] mod compression_lz4; #[cfg(feature = "lz4")] @@ -56,11 +87,11 @@ pub use self::compression_brotli::COMPRESSION; #[cfg(feature = "brotli")] use self::compression_brotli::{compress, decompress}; -#[cfg(not(any(feature = "lz4", feature = "brotli")))] +#[cfg(feature = "snap")] mod compression_snap; -#[cfg(not(any(feature = "lz4", feature = "brotli")))] +#[cfg(feature = "snap")] pub use self::compression_snap::COMPRESSION; -#[cfg(not(any(feature = "lz4", feature = "brotli")))] +#[cfg(feature = "snap")] use self::compression_snap::{compress, decompress}; #[cfg(test)] diff --git a/src/store/reader.rs b/src/store/reader.rs index 0c7295305..a07f27149 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -191,7 +191,7 @@ mod tests { .unwrap() .peek_lru() .map(|(&k, _)| k as usize), - Some(18806) + Some(9249) ); Ok(())