diff --git a/bitpacker/Cargo.toml b/bitpacker/Cargo.toml index 8a3db03fd..576f2be47 100644 --- a/bitpacker/Cargo.toml +++ b/bitpacker/Cargo.toml @@ -15,3 +15,7 @@ homepage = "https://github.com/quickwit-oss/tantivy" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] + +[dev-dependencies] +rand = "0.8" +proptest = "1" diff --git a/bitpacker/benches/bench.rs b/bitpacker/benches/bench.rs index 8053c63b4..7b61ce799 100644 --- a/bitpacker/benches/bench.rs +++ b/bitpacker/benches/bench.rs @@ -4,9 +4,42 @@ extern crate test; #[cfg(test)] mod tests { + use rand::seq::IteratorRandom; + use rand::thread_rng; + use tantivy_bitpacker::BitPacker; + use tantivy_bitpacker::BitUnpacker; use tantivy_bitpacker::BlockedBitpacker; use test::Bencher; + + #[inline(never)] + fn create_bitpacked_data(bit_width: u8, num_els: u32) -> Vec { + let mut bitpacker = BitPacker::new(); + let mut buffer = Vec::new(); + for _ in 0..num_els { + // the values do not matter. + bitpacker.write(0u64, bit_width, &mut buffer).unwrap(); + bitpacker.flush(&mut buffer).unwrap(); + } + buffer + } + + #[bench] + fn bench_bitpacking_read(b: &mut Bencher) { + let bit_width = 3; + let num_els = 1_000_000u32; + let bit_unpacker = BitUnpacker::new(bit_width); + let data = create_bitpacked_data(bit_width, num_els); + let idxs: Vec = (0..num_els).choose_multiple(&mut thread_rng(), 100_000); + b.iter(|| { + let mut out = 0u64; + for &idx in &idxs { + out = out.wrapping_add(bit_unpacker.get(idx, &data[..])); + } + out + }); + } + #[bench] fn bench_blockedbitp_read(b: &mut Bencher) { let mut blocked_bitpacker = BlockedBitpacker::new(); @@ -14,9 +47,9 @@ mod tests { blocked_bitpacker.add(val * val); } b.iter(|| { - let mut out = 0; + let mut out = 0u64; for val in 0..=21500 { - out = blocked_bitpacker.get(val); + out = out.wrapping_add(blocked_bitpacker.get(val)); } out }); diff --git a/bitpacker/src/bitpacker.rs b/bitpacker/src/bitpacker.rs index 0a1469b9f..d84382b2a 100644 --- a/bitpacker/src/bitpacker.rs +++ b/bitpacker/src/bitpacker.rs @@ -56,27 +56,31 @@ impl BitPacker { pub fn close(&mut self, output: &mut TWrite) -> io::Result<()> { self.flush(output)?; - // Padding the write file to simplify reads. - output.write_all(&[0u8; 7])?; Ok(()) } } -#[derive(Clone, Debug, Default)] +#[derive(Clone, Debug, Default, Copy)] pub struct BitUnpacker { - num_bits: u64, + num_bits: u32, mask: u64, } impl BitUnpacker { + /// Creates a bit unpacker, that assumes the same bitwidth for all values. + /// + /// The bitunpacker works by doing an unaligned read of 8 bytes. + /// For this reason, values of `num_bits` between + /// [57..63] are forbidden. pub fn new(num_bits: u8) -> BitUnpacker { + assert!(num_bits <= 7 * 8 || num_bits == 64); let mask: u64 = if num_bits == 64 { !0u64 } else { (1u64 << num_bits) - 1u64 }; BitUnpacker { - num_bits: u64::from(num_bits), + num_bits: u32::from(num_bits), mask, } } @@ -87,28 +91,40 @@ impl BitUnpacker { #[inline] pub fn get(&self, idx: u32, data: &[u8]) -> u64 { - if self.num_bits == 0 { - return 0u64; - } - let addr_in_bits = idx * self.num_bits as u32; + let addr_in_bits = idx * self.num_bits; let addr = (addr_in_bits >> 3) as usize; + if addr + 8 > data.len() { + if self.num_bits == 0 { + return 0; + } + let bit_shift = addr_in_bits & 7; + return self.get_slow_path(addr, bit_shift, data); + } let bit_shift = addr_in_bits & 7; - debug_assert!( - addr + 8 <= data.len(), - "The fast field field should have been padded with 7 bytes." - ); let bytes: [u8; 8] = (&data[addr..addr + 8]).try_into().unwrap(); let val_unshifted_unmasked: u64 = u64::from_le_bytes(bytes); let val_shifted = val_unshifted_unmasked >> bit_shift; val_shifted & self.mask } + + #[inline(never)] + fn get_slow_path(&self, addr: usize, bit_shift: u32, data: &[u8]) -> u64 { + let mut bytes: [u8; 8] = [0u8; 8]; + let available_bytes = data.len() - addr; + // This function is meant to only be called if we did not have 8 bytes to load. + debug_assert!(available_bytes < 8); + bytes[..available_bytes].copy_from_slice(&data[addr..]); + let val_unshifted_unmasked: u64 = u64::from_le_bytes(bytes); + let val_shifted = val_unshifted_unmasked >> bit_shift; + val_shifted & self.mask + } } #[cfg(test)] mod test { use super::{BitPacker, BitUnpacker}; - fn create_fastfield_bitpacker(len: usize, num_bits: u8) -> (BitUnpacker, Vec, Vec) { + fn create_bitpacker(len: usize, num_bits: u8) -> (BitUnpacker, Vec, Vec) { let mut data = Vec::new(); let mut bitpacker = BitPacker::new(); let max_val: u64 = (1u64 << num_bits as u64) - 1u64; @@ -125,7 +141,7 @@ mod test { } fn test_bitpacker_util(len: usize, num_bits: u8) { - let (bitunpacker, vals, data) = create_fastfield_bitpacker(len, num_bits); + let (bitunpacker, vals, data) = create_bitpacker(len, num_bits); for (i, val) in vals.iter().enumerate() { assert_eq!(bitunpacker.get(i as u32, &data), *val); } @@ -139,4 +155,56 @@ mod test { test_bitpacker_util(6, 14); test_bitpacker_util(1000, 14); } + + use proptest::prelude::*; + + fn num_bits_strategy() -> impl Strategy { + prop_oneof!( + Just(0), + Just(1), + 2u8..56u8, + Just(56), + Just(64), + ) + } + + fn vals_strategy() -> impl Strategy)> { + (num_bits_strategy(), 0usize..100usize).prop_flat_map(|(num_bits, len)| { + let max_val = if num_bits == 64 { + u64::MAX + } else { + (1u64 << num_bits as u32) - 1 + }; + let vals = proptest::collection::vec(0..=max_val, len); + vals.prop_map(move |vals| (num_bits, vals)) + }) + } + + fn test_bitpacker_aux(num_bits: u8, vals: &[u64]) { + let mut buffer: Vec = Vec::new(); + let mut bitpacker = BitPacker::new(); + for &val in vals { + bitpacker.write(val, num_bits, &mut buffer).unwrap(); + } + bitpacker.flush(&mut buffer).unwrap(); + assert_eq!(buffer.len(), (vals.len() * num_bits as usize + 7) / 8); + let bitunpacker = BitUnpacker::new(num_bits); + let max_val = + if num_bits == 64 { + u64::MAX + } else { + (1u64 << num_bits) - 1 + }; + for (i, val) in vals.iter().copied().enumerate() { + assert!(val <= max_val); + assert_eq!(bitunpacker.get(i as u32, &buffer), val); + } + } + + proptest::proptest! { + #[test] + fn test_bitpacker_proptest((num_bits, vals) in vals_strategy()) { + test_bitpacker_aux(num_bits, &vals); + } + } } diff --git a/columnar/src/column_values/serialize.rs b/columnar/src/column_values/serialize.rs index 3ffac7142..5a5f1de2f 100644 --- a/columnar/src/column_values/serialize.rs +++ b/columnar/src/column_values/serialize.rs @@ -296,7 +296,7 @@ pub mod tests { serialize_column_values(&col, &ALL_CODEC_TYPES, &mut buffer).unwrap(); // TODO put the header as a footer so that it serves as a padding. // 5 bytes of header, 1 byte of value, 7 bytes of padding. - assert_eq!(buffer.len(), 5 + 1 + 7); + assert_eq!(buffer.len(), 5 + 1); } #[test] @@ -305,7 +305,7 @@ pub mod tests { let col = VecColumn::from(&[true][..]); serialize_column_values(&col, &ALL_CODEC_TYPES, &mut buffer).unwrap(); // 5 bytes of header, 0 bytes of value, 7 bytes of padding. - assert_eq!(buffer.len(), 5 + 7); + assert_eq!(buffer.len(), 5); } #[test] @@ -315,6 +315,6 @@ pub mod tests { let col = VecColumn::from(&vals[..]); serialize_column_values(&col, &[FastFieldCodecType::Bitpacked], &mut buffer).unwrap(); // Values are stored over 3 bits. - assert_eq!(buffer.len(), 7 + (3 * 80 / 8) + 7); + assert_eq!(buffer.len(), 7 + (3 * 80 / 8)); } } diff --git a/columnar/src/tests.rs b/columnar/src/tests.rs index b014dd5b0..816c575f6 100644 --- a/columnar/src/tests.rs +++ b/columnar/src/tests.rs @@ -17,7 +17,7 @@ fn test_dataframe_writer_str() { assert_eq!(columnar.num_columns(), 1); let cols: Vec = columnar.read_columns("my_string").unwrap(); assert_eq!(cols.len(), 1); - assert_eq!(cols[0].num_bytes(), 165); + assert_eq!(cols[0].num_bytes(), 158); } #[test] @@ -31,7 +31,7 @@ fn test_dataframe_writer_bytes() { assert_eq!(columnar.num_columns(), 1); let cols: Vec = columnar.read_columns("my_string").unwrap(); assert_eq!(cols.len(), 1); - assert_eq!(cols[0].num_bytes(), 165); + assert_eq!(cols[0].num_bytes(), 158); } #[test] @@ -45,7 +45,7 @@ fn test_dataframe_writer_bool() { assert_eq!(columnar.num_columns(), 1); let cols: Vec = columnar.read_columns("bool.value").unwrap(); assert_eq!(cols.len(), 1); - assert_eq!(cols[0].num_bytes(), 29); + assert_eq!(cols[0].num_bytes(), 22); assert_eq!(cols[0].column_type(), ColumnType::Bool); let dyn_bool_col = cols[0].open().unwrap(); let DynamicColumn::Bool(bool_col) = dyn_bool_col else { panic!(); }; @@ -68,7 +68,7 @@ fn test_dataframe_writer_u64_multivalued() { assert_eq!(columnar.num_columns(), 1); let cols: Vec = columnar.read_columns("divisor").unwrap(); assert_eq!(cols.len(), 1); - assert_eq!(cols[0].num_bytes(), 43); + assert_eq!(cols[0].num_bytes(), 29); let dyn_i64_col = cols[0].open().unwrap(); let DynamicColumn::I64(divisor_col) = dyn_i64_col else { panic!(); }; assert_eq!( @@ -89,7 +89,7 @@ fn test_dataframe_writer_ip_addr() { assert_eq!(columnar.num_columns(), 1); let cols: Vec = columnar.read_columns("ip_addr").unwrap(); assert_eq!(cols.len(), 1); - // assert_eq!(cols[0].num_bytes(), 29); + assert_eq!(cols[0].num_bytes(), 42); assert_eq!(cols[0].column_type(), ColumnType::IpAddr); let dyn_bool_col = cols[0].open().unwrap(); let DynamicColumn::IpAddr(ip_col) = dyn_bool_col else { panic!(); }; @@ -123,7 +123,7 @@ fn test_dataframe_writer_numerical() { // - header 14 bytes // - vals 8 //< due to padding? could have been 1byte?. // - null footer 6 bytes - assert_eq!(cols[0].num_bytes(), 40); + assert_eq!(cols[0].num_bytes(), 33); let column = cols[0].open().unwrap(); let DynamicColumn::I64(column_i64) = column else { panic!(); }; assert_eq!(column_i64.idx.get_cardinality(), Cardinality::Optional); diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 5ba84ce92..567800344 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -226,7 +226,7 @@ mod tests { serializer.close().unwrap(); } let file = directory.open_read(path).unwrap(); - assert_eq!(file.len(), 34); + assert_eq!(file.len(), 27); let composite_file = CompositeFile::open(&file)?; let fast_field_bytes = composite_file.open_read(*FIELD).unwrap().read_bytes()?; let fast_field_reader = open::(fast_field_bytes)?; @@ -275,7 +275,7 @@ mod tests { serializer.close()?; } let file = directory.open_read(path)?; - assert_eq!(file.len(), 62); + assert_eq!(file.len(), 55); { let fast_fields_composite = CompositeFile::open(&file)?; let data = fast_fields_composite @@ -316,7 +316,7 @@ mod tests { serializer.close().unwrap(); } let file = directory.open_read(path).unwrap(); - assert_eq!(file.len(), 35); + assert_eq!(file.len(), 28); { let fast_fields_composite = CompositeFile::open(&file).unwrap(); let data = fast_fields_composite @@ -355,7 +355,7 @@ mod tests { serializer.close().unwrap(); } let file = directory.open_read(path).unwrap(); - assert_eq!(file.len(), 80049); + assert_eq!(file.len(), 80042); { let fast_fields_composite = CompositeFile::open(&file)?; let data = fast_fields_composite @@ -397,7 +397,7 @@ mod tests { serializer.close().unwrap(); } let file = directory.open_read(path).unwrap(); - assert_eq!(file.len(), 49_usize); + assert_eq!(file.len(), 42_usize); { let fast_fields_composite = CompositeFile::open(&file)?; @@ -836,7 +836,7 @@ mod tests { serializer.close().unwrap(); } let file = directory.open_read(path).unwrap(); - assert_eq!(file.len(), 33); + assert_eq!(file.len(), 26); let composite_file = CompositeFile::open(&file)?; let data = composite_file.open_read(field).unwrap().read_bytes()?; let fast_field_reader = open::(data)?; @@ -874,7 +874,7 @@ mod tests { serializer.close().unwrap(); } let file = directory.open_read(path).unwrap(); - assert_eq!(file.len(), 45); + assert_eq!(file.len(), 38); let composite_file = CompositeFile::open(&file)?; let data = composite_file.open_read(field).unwrap().read_bytes()?; let fast_field_reader = open::(data)?; @@ -906,7 +906,7 @@ mod tests { } let file = directory.open_read(path).unwrap(); let composite_file = CompositeFile::open(&file)?; - assert_eq!(file.len(), 32); + assert_eq!(file.len(), 25); let data = composite_file.open_read(field).unwrap().read_bytes()?; let fast_field_reader = open::(data)?; assert_eq!(fast_field_reader.get_val(0), false); @@ -940,10 +940,10 @@ mod tests { pub fn test_gcd_date() -> crate::Result<()> { let size_prec_sec = test_gcd_date_with_codec(FastFieldCodecType::Bitpacked, DatePrecision::Seconds)?; - assert_eq!(size_prec_sec, 5 + 4 + 28 + (1_000 * 13) / 8); // 13 bits per val = ceil(log_2(number of seconds in 2hours); + assert_eq!(size_prec_sec, 5 + 4 + 21 + (1_000 * 13) / 8); // 13 bits per val = ceil(log_2(number of seconds in 2hours); let size_prec_micro = test_gcd_date_with_codec(FastFieldCodecType::Bitpacked, DatePrecision::Microseconds)?; - assert_eq!(size_prec_micro, 5 + 4 + 26 + (1_000 * 33) / 8); // 33 bits per val = ceil(log_2(number of microsecsseconds in 2hours); + assert_eq!(size_prec_micro, 5 + 4 + 19 + (1_000 * 33) / 8); // 33 bits per val = ceil(log_2(number of microsecsseconds in 2hours); Ok(()) } diff --git a/src/termdict/fst_termdict/term_info_store.rs b/src/termdict/fst_termdict/term_info_store.rs index 799479e7d..389bcb59d 100644 --- a/src/termdict/fst_termdict/term_info_store.rs +++ b/src/termdict/fst_termdict/term_info_store.rs @@ -312,7 +312,7 @@ mod tests { bitpack.write(51, 6, &mut buffer).unwrap(); assert_eq!(compute_num_bits(51), 6); bitpack.close(&mut buffer).unwrap(); - assert_eq!(buffer.len(), 3 + 7); + assert_eq!(buffer.len(), 3); assert_eq!(extract_bits(&buffer[..], 0, 9), 321u64); assert_eq!(extract_bits(&buffer[..], 9, 2), 2u64); assert_eq!(extract_bits(&buffer[..], 11, 6), 51u64);