diff --git a/TODO.md b/TODO.md index e2ef5304f..a3cefdbc4 100644 --- a/TODO.md +++ b/TODO.md @@ -8,3 +8,4 @@ split postings into blocks add term frequency use skip list for each blocks find a clear way to put the tokenized/untokenized thing upstream +index frequent bigrams diff --git a/cpp/encode.cpp b/cpp/encode.cpp index f4f6f24af..f61f202ea 100644 --- a/cpp/encode.cpp +++ b/cpp/encode.cpp @@ -36,14 +36,16 @@ extern "C" { return output_length; } + // returns the number of byte that have been read. size_t decode_sorted_block128_native( const uint32_t* compressed_data, const size_t compressed_size, uint32_t* uncompressed, - const size_t uncompressed_capacity) { - size_t num_ints = uncompressed_capacity; - simd_pack_sorted.decodeArray(compressed_data, compressed_size, uncompressed, num_ints); - return num_ints; + size_t& num_ints) { + // size_t num_ints = uncompressed_capacity; + const uint32_t* pointer_end = simd_pack_sorted.decodeArray(compressed_data, compressed_size, uncompressed, num_ints); + return static_cast(pointer_end - compressed_data); + } size_t encode_sorted_vint_native( diff --git a/src/compression/block128.rs b/src/compression/block128.rs index f48bd5011..e2b2f1a17 100644 --- a/src/compression/block128.rs +++ b/src/compression/block128.rs @@ -4,7 +4,7 @@ use std::iter; extern { fn encode_sorted_block128_native(data: *mut u32, output: *mut u32, output_capacity: size_t) -> size_t; - fn decode_sorted_block128_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; + fn decode_sorted_block128_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: &mut size_t) -> usize; } //------------------------- @@ -48,16 +48,18 @@ impl Block128Decoder { Block128Decoder } - pub fn decode_sorted( + pub fn decode_sorted<'a, 'b>( &self, - compressed_data: &[u32], - uncompressed_values: &mut [u32]) -> size_t { + compressed_data: &'a [u32], + uncompressed_values: &'b mut [u32]) -> (&'a[u32], &'b[u32]) { unsafe { - return decode_sorted_block128_native( + let mut uncompressed_len: usize = uncompressed_values.len(); + let consumed_num_bytes: usize = decode_sorted_block128_native( compressed_data.as_ptr(), compressed_data.len() as size_t, uncompressed_values.as_mut_ptr(), - uncompressed_values.len() as size_t); + &mut uncompressed_len); + (&compressed_data[consumed_num_bytes..], &uncompressed_values[..uncompressed_len]) } } } @@ -81,7 +83,31 @@ mod tests { assert_eq!(encoded_data.len(), expected_length); let decoder = Block128Decoder::new(); let mut decoded_data: Vec = iter::repeat(0u32).take(128).collect(); - assert_eq!(128, decoder.decode_sorted(&encoded_data[..], &mut decoded_data)); - assert_eq!(decoded_data, input); + let (remaining_input, uncompressed_values) = decoder.decode_sorted(&encoded_data[..], &mut decoded_data); + assert_eq!(remaining_input.len(), 0); + assert_eq!(128, uncompressed_values.len()); + assert_eq!(uncompressed_values, &input[..]); + } + + + #[test] + fn test_partial_decode_block() { + let mut encoder = Block128Encoder::new(); + let expected_length = 21; + let input: Vec = (0u32..128u32) + .map(|i| i * 7 / 2) + .into_iter() + .collect(); + let encoded_data: &[u32] = encoder.encode_sorted(&input); + let mut encoded_vec: Vec = encoded_data.to_vec(); + encoded_vec.push(9u32); + encoded_vec.push(14u32); + assert_eq!(encoded_data.len(), expected_length); + let decoder = Block128Decoder::new(); + let mut decoded_data: Vec = iter::repeat(0u32).take(128).collect(); + let (remaining_input, uncompressed_values) = decoder.decode_sorted(&encoded_vec[..], &mut decoded_data); + assert_eq!(remaining_input, [9u32, 14u32]); + assert_eq!(128, uncompressed_values.len()); + assert_eq!(uncompressed_values, &input[..]); } } diff --git a/src/compression/mod.rs b/src/compression/mod.rs index e4ed74ab7..45c07156f 100644 --- a/src/compression/mod.rs +++ b/src/compression/mod.rs @@ -11,6 +11,7 @@ mod vints; pub use self::vints::{SortedVIntsEncoder, SortedVIntsDecoder}; +pub const NUM_DOCS_PER_BLOCK: usize = 128; #[cfg(test)] pub mod tests { diff --git a/src/compression/s4bp128.rs b/src/compression/s4bp128.rs index 2069e53bb..f97be6572 100644 --- a/src/compression/s4bp128.rs +++ b/src/compression/s4bp128.rs @@ -2,6 +2,8 @@ use libc::size_t; use std::ptr; + + extern { // complete s4-bp128-dm fn encode_s4_bp128_dm_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t;