mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-23 19:50:42 +00:00
Moved collector out.
This commit is contained in:
32
src/collector/count_collector.rs
Normal file
32
src/collector/count_collector.rs
Normal file
@@ -0,0 +1,32 @@
|
||||
use std::io;
|
||||
use super::Collector;
|
||||
use DocId;
|
||||
use SegmentReader;
|
||||
use SegmentLocalId;
|
||||
|
||||
pub struct CountCollector {
|
||||
count: usize,
|
||||
}
|
||||
|
||||
impl CountCollector {
|
||||
pub fn new() -> CountCollector {
|
||||
CountCollector {
|
||||
count: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn count(&self,) -> usize {
|
||||
self.count
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for CountCollector {
|
||||
|
||||
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> io::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect(&mut self, _: DocId) {
|
||||
self.count += 1;
|
||||
}
|
||||
}
|
||||
40
src/collector/first_n_collector.rs
Normal file
40
src/collector/first_n_collector.rs
Normal file
@@ -0,0 +1,40 @@
|
||||
use std::io;
|
||||
use super::Collector;
|
||||
use DocId;
|
||||
use SegmentReader;
|
||||
use SegmentLocalId;
|
||||
use core::searcher::DocAddress;
|
||||
|
||||
pub struct FirstNCollector {
|
||||
docs: Vec<DocAddress>,
|
||||
current_segment: u32,
|
||||
limit: usize,
|
||||
}
|
||||
|
||||
impl FirstNCollector {
|
||||
pub fn with_limit(limit: usize) -> FirstNCollector {
|
||||
FirstNCollector {
|
||||
docs: Vec::new(),
|
||||
limit: limit,
|
||||
current_segment: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn docs(self,) -> Vec<DocAddress> {
|
||||
self.docs
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for FirstNCollector {
|
||||
|
||||
fn set_segment(&mut self, segment_local_id: SegmentLocalId, _: &SegmentReader) -> io::Result<()> {
|
||||
self.current_segment = segment_local_id;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect(&mut self, doc_id: DocId) {
|
||||
if self.docs.len() < self.limit {
|
||||
self.docs.push(DocAddress(self.current_segment.clone(), doc_id));
|
||||
}
|
||||
}
|
||||
}
|
||||
109
src/collector/mod.rs
Normal file
109
src/collector/mod.rs
Normal file
@@ -0,0 +1,109 @@
|
||||
use DocId;
|
||||
use SegmentReader;
|
||||
use SegmentLocalId;
|
||||
use fastfield::U32FastFieldReader;
|
||||
use schema::U32Field;
|
||||
use std::io;
|
||||
|
||||
|
||||
mod count_collector;
|
||||
pub use self::count_collector::CountCollector;
|
||||
|
||||
mod first_n_collector;
|
||||
pub use self::first_n_collector::FirstNCollector;
|
||||
|
||||
mod multi_collector;
|
||||
pub use self::multi_collector::MultiCollector;
|
||||
|
||||
pub trait Collector {
|
||||
fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()>;
|
||||
fn collect(&mut self, doc_id: DocId);
|
||||
}
|
||||
|
||||
pub struct TestCollector {
|
||||
offset: DocId,
|
||||
segment_max_doc: DocId,
|
||||
docs: Vec<DocId>,
|
||||
}
|
||||
|
||||
impl TestCollector {
|
||||
pub fn new() -> TestCollector {
|
||||
TestCollector {
|
||||
docs: Vec::new(),
|
||||
offset: 0,
|
||||
segment_max_doc: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn docs(self,) -> Vec<DocId> {
|
||||
self.docs
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for TestCollector {
|
||||
|
||||
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> io::Result<()> {
|
||||
self.offset += self.segment_max_doc;
|
||||
self.segment_max_doc = reader.max_doc();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect(&mut self, doc_id: DocId) {
|
||||
self.docs.push(doc_id + self.offset);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub struct FastFieldTestCollector {
|
||||
vals: Vec<u32>,
|
||||
u32_field: U32Field,
|
||||
ff_reader: Option<U32FastFieldReader>,
|
||||
}
|
||||
|
||||
impl FastFieldTestCollector {
|
||||
pub fn for_field(u32_field: U32Field) -> FastFieldTestCollector {
|
||||
FastFieldTestCollector {
|
||||
vals: Vec::new(),
|
||||
u32_field: u32_field,
|
||||
ff_reader: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn vals(&self,) -> &Vec<u32> {
|
||||
&self.vals
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for FastFieldTestCollector {
|
||||
|
||||
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> io::Result<()> {
|
||||
self.ff_reader = Some(try!(reader.get_fast_field_reader(&self.u32_field)));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect(&mut self, doc_id: DocId) {
|
||||
let val = self.ff_reader.as_ref().unwrap().get(doc_id);
|
||||
self.vals.push(val);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use test::Bencher;
|
||||
|
||||
#[bench]
|
||||
fn build_collector(b: &mut Bencher) {
|
||||
b.iter(|| {
|
||||
let mut count_collector = CountCollector::new();
|
||||
let docs: Vec<u32> = (0..1_000_000).collect();
|
||||
for doc in docs {
|
||||
count_collector.collect(doc);
|
||||
}
|
||||
count_collector.count()
|
||||
});
|
||||
}
|
||||
}
|
||||
33
src/collector/multi_collector.rs
Normal file
33
src/collector/multi_collector.rs
Normal file
@@ -0,0 +1,33 @@
|
||||
use std::io;
|
||||
use super::Collector;
|
||||
use DocId;
|
||||
use SegmentReader;
|
||||
use SegmentLocalId;
|
||||
|
||||
pub struct MultiCollector<'a> {
|
||||
collectors: Vec<&'a mut Collector>,
|
||||
}
|
||||
|
||||
impl<'a> MultiCollector<'a> {
|
||||
pub fn from(collectors: Vec<&'a mut Collector>) -> MultiCollector {
|
||||
MultiCollector {
|
||||
collectors: collectors,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Collector for MultiCollector<'a> {
|
||||
|
||||
fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()> {
|
||||
for collector in self.collectors.iter_mut() {
|
||||
try!(collector.set_segment(segment_local_id, segment));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect(&mut self, doc_id: DocId) {
|
||||
for collector in self.collectors.iter_mut() {
|
||||
collector.collect(doc_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
87
src/compression/block128.rs
Normal file
87
src/compression/block128.rs
Normal file
@@ -0,0 +1,87 @@
|
||||
use libc::size_t;
|
||||
use std::ptr;
|
||||
use std::iter;
|
||||
|
||||
extern {
|
||||
fn encode_sorted_block128_native(data: *mut u32, output: *mut u32, output_capacity: size_t) -> size_t;
|
||||
fn decode_sorted_block128_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t;
|
||||
}
|
||||
|
||||
//-------------------------
|
||||
// Block128
|
||||
|
||||
pub struct Block128Encoder {
|
||||
input_buffer: Vec<u32>,
|
||||
output_buffer: Vec<u32>,
|
||||
}
|
||||
|
||||
impl Block128Encoder {
|
||||
|
||||
pub fn new() -> Block128Encoder {
|
||||
Block128Encoder {
|
||||
input_buffer: Vec::with_capacity(128),
|
||||
output_buffer: iter::repeat(0u32).take(256).collect(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode_sorted(&mut self, input: &[u32]) -> &[u32] {
|
||||
assert_eq!(input.len(), 128);
|
||||
// TODO use clone_from when available
|
||||
let written_size: usize;
|
||||
unsafe {
|
||||
ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), 128);
|
||||
written_size = encode_sorted_block128_native(
|
||||
self.input_buffer.as_mut_ptr(),
|
||||
self.output_buffer.as_mut_ptr(),
|
||||
256,
|
||||
);
|
||||
}
|
||||
return &self.output_buffer[0..written_size];
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Block128Decoder;
|
||||
|
||||
impl Block128Decoder {
|
||||
|
||||
pub fn new() -> Block128Decoder {
|
||||
Block128Decoder
|
||||
}
|
||||
|
||||
pub fn decode_sorted(
|
||||
&self,
|
||||
compressed_data: &[u32],
|
||||
uncompressed_values: &mut [u32]) -> size_t {
|
||||
unsafe {
|
||||
return decode_sorted_block128_native(
|
||||
compressed_data.as_ptr(),
|
||||
compressed_data.len() as size_t,
|
||||
uncompressed_values.as_mut_ptr(),
|
||||
uncompressed_values.len() as size_t);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use std::iter;
|
||||
|
||||
#[test]
|
||||
fn test_encode_block() {
|
||||
let mut encoder = Block128Encoder::new();
|
||||
let expected_length = 21;
|
||||
let input: Vec<u32> = (0u32..128u32)
|
||||
.map(|i| i * 7 / 2)
|
||||
.into_iter()
|
||||
.collect();
|
||||
let encoded_data = encoder.encode_sorted(&input);
|
||||
assert_eq!(encoded_data.len(), expected_length);
|
||||
let decoder = Block128Decoder::new();
|
||||
let mut decoded_data: Vec<u32> = iter::repeat(0u32).take(128).collect();
|
||||
assert_eq!(128, decoder.decode_sorted(&encoded_data[..], &mut decoded_data));
|
||||
assert_eq!(decoded_data, input);
|
||||
}
|
||||
}
|
||||
14
src/compression/intersection.rs
Normal file
14
src/compression/intersection.rs
Normal file
@@ -0,0 +1,14 @@
|
||||
use libc::size_t;
|
||||
|
||||
extern {
|
||||
fn intersection_native(left_data: *const u32, left_size: size_t, right_data: *const u32, right_size: size_t, output: *mut u32) -> size_t;
|
||||
}
|
||||
|
||||
pub fn intersection(left: &[u32], right: &[u32], output: &mut [u32]) -> usize {
|
||||
unsafe {
|
||||
intersection_native(
|
||||
left.as_ptr(), left.len(),
|
||||
right.as_ptr(), right.len(),
|
||||
output.as_mut_ptr())
|
||||
}
|
||||
}
|
||||
@@ -1,267 +1,20 @@
|
||||
use libc::size_t;
|
||||
use std::ptr;
|
||||
use std::iter;
|
||||
mod intersection;
|
||||
pub use self::intersection::intersection;
|
||||
|
||||
extern {
|
||||
// fn encode_unsorted_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t;
|
||||
// fn decode_unsorted_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t;
|
||||
mod s4bp128;
|
||||
pub use self::s4bp128::{S4BP128Encoder, S4BP128Decoder};
|
||||
|
||||
fn intersection_native(left_data: *const u32, left_size: size_t, right_data: *const u32, right_size: size_t, output: *mut u32) -> size_t;
|
||||
mod block128;
|
||||
pub use self::block128::{Block128Encoder, Block128Decoder};
|
||||
|
||||
// complete s4-bp128-dm
|
||||
fn encode_s4_bp128_dm_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t;
|
||||
fn decode_s4_bp128_dm_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t;
|
||||
mod vints;
|
||||
pub use self::vints::{SortedVIntsEncoder, SortedVIntsDecoder};
|
||||
|
||||
// bp128, only encodes group of 128 u32 at a time
|
||||
fn encode_sorted_block128_native(data: *mut u32, output: *mut u32, output_capacity: size_t) -> size_t;
|
||||
fn decode_sorted_block128_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t;
|
||||
|
||||
// vints, used as the left over codec for the <128 remaining values
|
||||
fn encode_sorted_vint_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t;
|
||||
fn decode_sorted_vint_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t;
|
||||
|
||||
}
|
||||
|
||||
pub fn intersection(left: &[u32], right: &[u32], output: &mut [u32]) -> usize {
|
||||
unsafe {
|
||||
intersection_native(
|
||||
left.as_ptr(), left.len(),
|
||||
right.as_ptr(), right.len(),
|
||||
output.as_mut_ptr())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//-------------------------
|
||||
// Vint
|
||||
|
||||
|
||||
pub struct VIntEncoder {
|
||||
input_buffer: Vec<u32>,
|
||||
output_buffer: Vec<u32>,
|
||||
}
|
||||
|
||||
impl VIntEncoder {
|
||||
|
||||
pub fn new() -> VIntEncoder {
|
||||
VIntEncoder {
|
||||
input_buffer: Vec::with_capacity(128),
|
||||
output_buffer: iter::repeat(0u32).take(256).collect(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode_sorted(&mut self, input: &[u32]) -> &[u32] {
|
||||
assert!(input.len() < 128);
|
||||
let input_len = input.len();
|
||||
let written_size: usize;
|
||||
// TODO use clone_from when available
|
||||
unsafe {
|
||||
ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len);
|
||||
written_size = encode_sorted_vint_native(
|
||||
self.input_buffer.as_mut_ptr(),
|
||||
input_len as size_t,
|
||||
self.output_buffer.as_mut_ptr(),
|
||||
256,
|
||||
);
|
||||
}
|
||||
return &self.output_buffer[0..written_size];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
pub struct VIntDecoder;
|
||||
|
||||
impl VIntDecoder {
|
||||
|
||||
pub fn new() -> VIntDecoder {
|
||||
VIntDecoder
|
||||
}
|
||||
|
||||
pub fn decode_sorted(&self,
|
||||
compressed_data: &[u32],
|
||||
uncompressed_values: &mut [u32]) -> size_t {
|
||||
unsafe {
|
||||
return decode_sorted_vint_native(
|
||||
compressed_data.as_ptr(),
|
||||
compressed_data.len() as size_t,
|
||||
uncompressed_values.as_mut_ptr(),
|
||||
uncompressed_values.len() as size_t);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//-------------------------
|
||||
// Block128
|
||||
|
||||
pub struct Block128Encoder {
|
||||
input_buffer: Vec<u32>,
|
||||
output_buffer: Vec<u32>,
|
||||
}
|
||||
|
||||
impl Block128Encoder {
|
||||
|
||||
pub fn new() -> Block128Encoder {
|
||||
Block128Encoder {
|
||||
input_buffer: Vec::with_capacity(128),
|
||||
output_buffer: iter::repeat(0u32).take(256).collect(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode_sorted(&mut self, input: &[u32]) -> &[u32] {
|
||||
assert_eq!(input.len(), 128);
|
||||
// TODO use clone_from when available
|
||||
let written_size: usize;
|
||||
unsafe {
|
||||
ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), 128);
|
||||
written_size = encode_s4_bp128_dm_native(
|
||||
self.input_buffer.as_mut_ptr(),
|
||||
128,
|
||||
self.output_buffer.as_mut_ptr(),
|
||||
256,
|
||||
);
|
||||
}
|
||||
return &self.output_buffer[0..written_size];
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Block128Decoder;
|
||||
|
||||
impl Block128Decoder {
|
||||
|
||||
pub fn new() -> Block128Decoder {
|
||||
Block128Decoder
|
||||
}
|
||||
|
||||
pub fn decode_sorted(
|
||||
&self,
|
||||
compressed_data: &[u32],
|
||||
uncompressed_values: &mut [u32]) -> size_t {
|
||||
unsafe {
|
||||
return decode_s4_bp128_dm_native(
|
||||
compressed_data.as_ptr(),
|
||||
compressed_data.len() as size_t,
|
||||
uncompressed_values.as_mut_ptr(),
|
||||
uncompressed_values.len() as size_t);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//-------------------------
|
||||
// s4-bp128-dm
|
||||
|
||||
|
||||
pub struct S4BP128Encoder {
|
||||
input_buffer: Vec<u32>,
|
||||
output_buffer: Vec<u32>,
|
||||
}
|
||||
|
||||
impl S4BP128Encoder {
|
||||
|
||||
pub fn new() -> S4BP128Encoder {
|
||||
S4BP128Encoder {
|
||||
input_buffer: Vec::new(),
|
||||
output_buffer: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode_sorted(&mut self, input: &[u32]) -> &[u32] {
|
||||
self.input_buffer.clear();
|
||||
let input_len = input.len();
|
||||
if input_len + 10000 >= self.input_buffer.len() {
|
||||
let target_length = input_len + 1024;
|
||||
self.input_buffer.resize(target_length, 0);
|
||||
self.output_buffer.resize(target_length, 0);
|
||||
}
|
||||
// TODO use clone_from when available
|
||||
unsafe {
|
||||
ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len);
|
||||
let written_size = encode_s4_bp128_dm_native(
|
||||
self.input_buffer.as_mut_ptr(),
|
||||
input_len as size_t,
|
||||
self.output_buffer.as_mut_ptr(),
|
||||
self.output_buffer.len() as size_t,
|
||||
);
|
||||
return &self.output_buffer[0..written_size];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
pub struct S4BP128Decoder;
|
||||
|
||||
impl S4BP128Decoder {
|
||||
|
||||
pub fn new() -> S4BP128Decoder {
|
||||
S4BP128Decoder
|
||||
}
|
||||
|
||||
pub fn decode_sorted(&self,
|
||||
compressed_data: &[u32],
|
||||
uncompressed_values: &mut [u32]) -> size_t {
|
||||
unsafe {
|
||||
return decode_s4_bp128_dm_native(
|
||||
compressed_data.as_ptr(),
|
||||
compressed_data.len() as size_t,
|
||||
uncompressed_values.as_mut_ptr(),
|
||||
uncompressed_values.len() as size_t);
|
||||
}
|
||||
}
|
||||
|
||||
// pub fn decode_unsorted(&self,
|
||||
// compressed_data: &[u32],
|
||||
// uncompressed_values: &mut [u32]) -> size_t {
|
||||
// unsafe {
|
||||
// return decode_unsorted_native(
|
||||
// compressed_data.as_ptr(),
|
||||
// compressed_data.len() as size_t,
|
||||
// uncompressed_values.as_mut_ptr(),
|
||||
// uncompressed_values.len() as size_t);
|
||||
// }
|
||||
// }
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//
|
||||
// pub struct Intersector {
|
||||
// output_buffer: Vec<u32>,
|
||||
// }
|
||||
//
|
||||
// impl Intersector {
|
||||
// fn new() -> Intersector {
|
||||
// Intersector::with_capacity(1_000_000)
|
||||
// }
|
||||
// fn with_capacity(capacity: usize) -> Intersector {
|
||||
// Intersector {
|
||||
// output_buffer: iter::repeat(0u32).take(capacity).collect()
|
||||
// }
|
||||
// }
|
||||
// fn intersection(&mut self, left: &[u32], right: &[u32]) -> &[u32] {
|
||||
// let max_intersection_length = min(left.len(), right.len());
|
||||
// if self.output_buffer.len() < max_intersection_length {
|
||||
// self.output_buffer.resize(max_intersection_length, 0);
|
||||
// }
|
||||
// unsafe {
|
||||
// let intersection_len = intersection_native(
|
||||
// left.as_ptr(), left.len() as size_t,
|
||||
// right.as_ptr(), right.len() as size_t,
|
||||
// self.output_buffer.as_mut_ptr());
|
||||
// return &self.output_buffer[0..intersection_len];
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
pub mod tests {
|
||||
|
||||
use super::*;
|
||||
use test::Bencher;
|
||||
use std::iter;
|
||||
use rand::Rng;
|
||||
use rand::SeedableRng;
|
||||
use rand::XorShiftRng;
|
||||
@@ -275,116 +28,7 @@ mod tests {
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn generate_array(n: usize, ratio: f32) -> Vec<u32> {
|
||||
pub fn generate_array(n: usize, ratio: f32) -> Vec<u32> {
|
||||
generate_array_with_seed(n, ratio, 4)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_encode_big() {
|
||||
let mut encoder = S4BP128Encoder::new();
|
||||
let num_ints = 10000 as usize;
|
||||
let expected_length = 1274;
|
||||
let input: Vec<u32> = (0..num_ints as u32)
|
||||
.map(|i| i * 7 / 2)
|
||||
.into_iter().collect();
|
||||
let encoded_data = encoder.encode_sorted(&input);
|
||||
assert_eq!(encoded_data.len(), expected_length);
|
||||
let decoder = S4BP128Decoder::new();
|
||||
let mut decoded_data: Vec<u32> = (0..num_ints as u32).collect();
|
||||
assert_eq!(num_ints, decoder.decode_sorted(&encoded_data[..], &mut decoded_data));
|
||||
assert_eq!(decoded_data, input);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_encode_block() {
|
||||
let mut encoder = Block128Encoder::new();
|
||||
let expected_length = 21;
|
||||
let input: Vec<u32> = (0u32..128u32)
|
||||
.map(|i| i * 7 / 2)
|
||||
.into_iter()
|
||||
.collect();
|
||||
let encoded_data = encoder.encode_sorted(&input);
|
||||
assert_eq!(encoded_data.len(), expected_length);
|
||||
let decoder = Block128Decoder::new();
|
||||
let mut decoded_data: Vec<u32> = iter::repeat(0u32).take(128).collect();
|
||||
assert_eq!(128, decoder.decode_sorted(&encoded_data[..], &mut decoded_data));
|
||||
assert_eq!(decoded_data, input);
|
||||
}
|
||||
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_encode_vint() {
|
||||
{
|
||||
let mut encoder = VIntEncoder::new();
|
||||
let expected_length = 31;
|
||||
let input: Vec<u32> = (0u32..123u32)
|
||||
.map(|i| i * 7 / 2)
|
||||
.into_iter()
|
||||
.collect();
|
||||
let encoded_data = encoder.encode_sorted(&input);
|
||||
assert_eq!(encoded_data.len(), expected_length);
|
||||
let decoder = VIntDecoder::new();
|
||||
let mut decoded_data: Vec<u32> = iter::repeat(0u32).take(128).collect();
|
||||
assert_eq!(123, decoder.decode_sorted(&encoded_data[..], &mut decoded_data));
|
||||
assert_eq!(&decoded_data[0..123], &input[..]);
|
||||
}
|
||||
{
|
||||
let mut encoder = VIntEncoder::new();
|
||||
let input = vec!(3, 17u32, 187);
|
||||
let encoded_data = encoder.encode_sorted(&input);
|
||||
assert_eq!(encoded_data.len(), 1);
|
||||
assert_eq!(encoded_data[0], 2167049859u32);
|
||||
}
|
||||
}
|
||||
|
||||
// #[test]
|
||||
// fn test_encode_unsorted() {
|
||||
// let mut encoder = Encoder::new();
|
||||
// let num_ints = 10_000 as usize;
|
||||
// let expected_length = 4361;
|
||||
// let input: Vec<u32> = (0..num_ints as u32)
|
||||
// .map(|i| i * 213_127 % 501)
|
||||
// .into_iter().collect();
|
||||
// assert_eq!(input.len(), 10_000);
|
||||
// let encoded_data = encoder.encode_unsorted(&input);
|
||||
// assert_eq!(encoded_data.len(), expected_length);
|
||||
// let decoder = Decoder::new();
|
||||
// let mut decoded_data: Vec<u32> = (0..num_ints as u32).collect();
|
||||
// assert_eq!(num_ints, decoder.decode_unsorted(&encoded_data[..], &mut decoded_data));
|
||||
// assert_eq!(decoded_data, input);
|
||||
// }
|
||||
//
|
||||
// #[test]
|
||||
// fn test_simd_intersection() {
|
||||
// let mut intersector = Intersector::new();
|
||||
// let arr1 = generate_array_with_seed(1_000_000, 0.1, 2);
|
||||
// let arr2 = generate_array_with_seed(5_000_000, 0.5, 3);
|
||||
// let intersection = intersector.intersection(&arr1[..], &arr2[..]) ;
|
||||
// assert_eq!(intersection.len(), 500_233);
|
||||
// }
|
||||
|
||||
#[bench]
|
||||
fn bench_decode(b: &mut Bencher) {
|
||||
const TEST_SIZE: usize = 1_000_000;
|
||||
let arr = generate_array(TEST_SIZE, 0.1);
|
||||
let mut encoder = S4BP128Encoder::new();
|
||||
let encoded = encoder.encode_sorted(&arr);
|
||||
let mut uncompressed: Vec<u32> = (0..TEST_SIZE as u32).collect();
|
||||
let decoder = S4BP128Decoder;
|
||||
b.iter(|| {
|
||||
decoder.decode_sorted(&encoded, &mut uncompressed);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
// #[bench]
|
||||
// fn bench_simd_intersection(b: &mut Bencher) {
|
||||
// let mut intersector = Intersector::new();
|
||||
// let arr1 = generate_array_with_seed(1_000_000, 0.1, 2);
|
||||
// let arr2 = generate_array_with_seed(5_000_000, 0.5, 3);
|
||||
// b.iter(|| {
|
||||
// intersector.intersection(&arr1[..], &arr2[..]).len()
|
||||
// });
|
||||
// }
|
||||
}
|
||||
|
||||
122
src/compression/s4bp128.rs
Normal file
122
src/compression/s4bp128.rs
Normal file
@@ -0,0 +1,122 @@
|
||||
|
||||
use libc::size_t;
|
||||
use std::ptr;
|
||||
|
||||
extern {
|
||||
// complete s4-bp128-dm
|
||||
fn encode_s4_bp128_dm_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t;
|
||||
fn decode_s4_bp128_dm_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t;
|
||||
}
|
||||
|
||||
//-------------------------
|
||||
// s4-bp128-dm
|
||||
|
||||
|
||||
pub struct S4BP128Encoder {
|
||||
input_buffer: Vec<u32>,
|
||||
output_buffer: Vec<u32>,
|
||||
}
|
||||
|
||||
impl S4BP128Encoder {
|
||||
|
||||
pub fn new() -> S4BP128Encoder {
|
||||
S4BP128Encoder {
|
||||
input_buffer: Vec::new(),
|
||||
output_buffer: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode_sorted(&mut self, input: &[u32]) -> &[u32] {
|
||||
self.input_buffer.clear();
|
||||
let input_len = input.len();
|
||||
if input_len + 10000 >= self.input_buffer.len() {
|
||||
let target_length = input_len + 1024;
|
||||
self.input_buffer.resize(target_length, 0);
|
||||
self.output_buffer.resize(target_length, 0);
|
||||
}
|
||||
// TODO use clone_from when available
|
||||
unsafe {
|
||||
ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len);
|
||||
let written_size = encode_s4_bp128_dm_native(
|
||||
self.input_buffer.as_mut_ptr(),
|
||||
input_len as size_t,
|
||||
self.output_buffer.as_mut_ptr(),
|
||||
self.output_buffer.len() as size_t,
|
||||
);
|
||||
return &self.output_buffer[0..written_size];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub struct S4BP128Decoder;
|
||||
|
||||
impl S4BP128Decoder {
|
||||
|
||||
pub fn new() -> S4BP128Decoder {
|
||||
S4BP128Decoder
|
||||
}
|
||||
|
||||
pub fn decode_sorted(&self,
|
||||
compressed_data: &[u32],
|
||||
uncompressed_values: &mut [u32]) -> size_t {
|
||||
unsafe {
|
||||
return decode_s4_bp128_dm_native(
|
||||
compressed_data.as_ptr(),
|
||||
compressed_data.len() as size_t,
|
||||
uncompressed_values.as_mut_ptr(),
|
||||
uncompressed_values.len() as size_t);
|
||||
}
|
||||
}
|
||||
|
||||
// pub fn decode_unsorted(&self,
|
||||
// compressed_data: &[u32],
|
||||
// uncompressed_values: &mut [u32]) -> size_t {
|
||||
// unsafe {
|
||||
// return decode_unsorted_native(
|
||||
// compressed_data.as_ptr(),
|
||||
// compressed_data.len() as size_t,
|
||||
// uncompressed_values.as_mut_ptr(),
|
||||
// uncompressed_values.len() as size_t);
|
||||
// }
|
||||
// }
|
||||
}
|
||||
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use test::Bencher;
|
||||
use compression::tests::generate_array;
|
||||
|
||||
#[test]
|
||||
fn test_encode_big() {
|
||||
let mut encoder = S4BP128Encoder::new();
|
||||
let num_ints = 10000 as usize;
|
||||
let expected_length = 1274;
|
||||
let input: Vec<u32> = (0..num_ints as u32)
|
||||
.map(|i| i * 7 / 2)
|
||||
.into_iter().collect();
|
||||
let encoded_data = encoder.encode_sorted(&input);
|
||||
assert_eq!(encoded_data.len(), expected_length);
|
||||
let decoder = S4BP128Decoder::new();
|
||||
let mut decoded_data: Vec<u32> = (0..num_ints as u32).collect();
|
||||
assert_eq!(num_ints, decoder.decode_sorted(&encoded_data[..], &mut decoded_data));
|
||||
assert_eq!(decoded_data, input);
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_decode(b: &mut Bencher) {
|
||||
const TEST_SIZE: usize = 1_000_000;
|
||||
let arr = generate_array(TEST_SIZE, 0.1);
|
||||
let mut encoder = S4BP128Encoder::new();
|
||||
let encoded = encoder.encode_sorted(&arr);
|
||||
let mut uncompressed: Vec<u32> = (0..TEST_SIZE as u32).collect();
|
||||
let decoder = S4BP128Decoder;
|
||||
b.iter(|| {
|
||||
decoder.decode_sorted(&encoded, &mut uncompressed);
|
||||
});
|
||||
}
|
||||
}
|
||||
97
src/compression/vints.rs
Normal file
97
src/compression/vints.rs
Normal file
@@ -0,0 +1,97 @@
|
||||
use libc::size_t;
|
||||
use std::ptr;
|
||||
use std::iter;
|
||||
|
||||
extern {
|
||||
fn encode_sorted_vint_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t;
|
||||
fn decode_sorted_vint_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t;
|
||||
}
|
||||
|
||||
pub struct SortedVIntsEncoder {
|
||||
input_buffer: Vec<u32>,
|
||||
output_buffer: Vec<u32>,
|
||||
}
|
||||
|
||||
impl SortedVIntsEncoder {
|
||||
|
||||
pub fn new() -> SortedVIntsEncoder {
|
||||
SortedVIntsEncoder {
|
||||
input_buffer: Vec::with_capacity(128),
|
||||
output_buffer: iter::repeat(0u32).take(256).collect(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode_sorted(&mut self, input: &[u32]) -> &[u32] {
|
||||
assert!(input.len() < 128);
|
||||
let input_len = input.len();
|
||||
let written_size: usize;
|
||||
// TODO use clone_from when available
|
||||
unsafe {
|
||||
ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len);
|
||||
written_size = encode_sorted_vint_native(
|
||||
self.input_buffer.as_mut_ptr(),
|
||||
input_len as size_t,
|
||||
self.output_buffer.as_mut_ptr(),
|
||||
256,
|
||||
);
|
||||
}
|
||||
return &self.output_buffer[0..written_size];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
pub struct SortedVIntsDecoder;
|
||||
|
||||
impl SortedVIntsDecoder {
|
||||
|
||||
pub fn new() -> SortedVIntsDecoder {
|
||||
SortedVIntsDecoder
|
||||
}
|
||||
|
||||
pub fn decode_sorted(&self,
|
||||
compressed_data: &[u32],
|
||||
uncompressed_values: &mut [u32]) -> size_t {
|
||||
unsafe {
|
||||
return decode_sorted_vint_native(
|
||||
compressed_data.as_ptr(),
|
||||
compressed_data.len() as size_t,
|
||||
uncompressed_values.as_mut_ptr(),
|
||||
uncompressed_values.len() as size_t);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use std::iter;
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_encode_vint() {
|
||||
{
|
||||
let mut encoder = SortedVIntsEncoder::new();
|
||||
let expected_length = 31;
|
||||
let input: Vec<u32> = (0u32..123u32)
|
||||
.map(|i| i * 7 / 2)
|
||||
.into_iter()
|
||||
.collect();
|
||||
let encoded_data = encoder.encode_sorted(&input);
|
||||
assert_eq!(encoded_data.len(), expected_length);
|
||||
let decoder = SortedVIntsDecoder::new();
|
||||
let mut decoded_data: Vec<u32> = iter::repeat(0u32).take(128).collect();
|
||||
assert_eq!(123, decoder.decode_sorted(&encoded_data[..], &mut decoded_data));
|
||||
assert_eq!(&decoded_data[0..123], &input[..]);
|
||||
}
|
||||
{
|
||||
let mut encoder = SortedVIntsEncoder::new();
|
||||
let input = vec!(3, 17u32, 187);
|
||||
let encoded_data = encoder.encode_sorted(&input);
|
||||
assert_eq!(encoded_data.len(), 1);
|
||||
assert_eq!(encoded_data[0], 2167049859u32);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -75,7 +75,7 @@ impl SegmentPostings {
|
||||
let mut doc_ids: Vec<u32> = Vec::with_capacity(doc_freq as usize);
|
||||
unsafe { doc_ids.set_len(doc_freq as usize); }
|
||||
{
|
||||
let decoder = compression::S4BP128Decoder::new();
|
||||
let decoder = S4BP128Decoder::new();
|
||||
decoder.decode_sorted(&data_u32[1..(num_u32s+1) as usize], &mut doc_ids);
|
||||
SegmentPostings(doc_ids)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user