diff --git a/src/core/directory.rs b/src/core/directory.rs
index a018e1bc7..088717b84 100644
--- a/src/core/directory.rs
+++ b/src/core/directory.rs
@@ -30,6 +30,12 @@ impl Deref for ReadOnlySource {
 }
 
 impl ReadOnlySource {
+
+    /// Returns the number of bytes of the slice returned by `as_slice`.
+    pub fn len(&self,) -> usize {
+        self.as_slice().len()
+    }
+
     pub fn as_slice(&self,) -> &[u8] {
         match *self {
             ReadOnlySource::Mmap(ref mmap_read_only) => unsafe { mmap_read_only.as_slice() },
diff --git a/src/core/fastfield.rs b/src/core/fastfield.rs
index c612bcd3c..df023257a 100644
--- a/src/core/fastfield.rs
+++ b/src/core/fastfield.rs
@@ -1,9 +1,53 @@
 use std::io::Write;
 use std::io;
+use std::io::Cursor;
+use std::io::Seek;
+use std::io::SeekFrom;
 use core::serialize::BinarySerializable;
+use core::directory::ReadOnlySource;
+use core::schema::DocId;
+
 
+/// Buffers `u32` values in memory until `close` serializes them.
 struct IntFastFieldWriter {
-    vals: Vec<u64>,
+    vals: Vec<u32>,
+}
+
+
+/// Returns the number of bits needed to represent `amplitude`:
+/// `0` for `0`, otherwise the position of its highest set bit, counted from 1.
+pub fn compute_num_bits(amplitude: u32) -> u8 {
+    (32 - amplitude.leading_zeros()) as u8
+}
+
+/// Packs every value of `vals_it` on `num_bits` bits into `u64` words and
+/// serializes these words to `write`.
+///
+/// Values fill a word from its lowest bits upwards and never straddle two
+/// words: the current word is flushed when the next value would not fit.
+/// Nothing is written when `num_bits` is `0`.
+///
+/// NOTE(review): this function used to be commented "only works for
+/// big-endian"; presumably this is about the byte order `BinarySerializable`
+/// uses for `u64` -- confirm against that implementation.
+fn serialize_packed_ints<I: Iterator<Item=u32>>(vals_it: I, num_bits: u8, write: &mut Write) -> io::Result<()> {
+    // number of bits of `mini_buffer` that are already in use
+    let mut mini_buffer_written = 0u8;
+    let mut mini_buffer = 0u64;
+    for val in vals_it {
+        if mini_buffer_written + num_bits > 64 {
+            try!(mini_buffer.serialize(write));
+            mini_buffer = 0;
+            mini_buffer_written = 0;
+        }
+        mini_buffer |= (val as u64) << mini_buffer_written;
+        mini_buffer_written += num_bits;
+    }
+    // flush the last, possibly partially filled, word
+    if mini_buffer_written > 0 {
+        try!(mini_buffer.serialize(write));
+    }
+    Ok(())
 }
 
 impl IntFastFieldWriter {
@@ -14,49 +58,260 @@ impl IntFastFieldWriter {
         }
     }
 
-    pub fn add(&mut self, val: u64) {
+    /// Appends `val` to the buffered values.
+    pub fn add(&mut self, val: u32) {
         self.vals.push(val);
     }
 
-    pub fn compute_num_bits(&self, amplitude: u64) -> u8 {
-        if amplitude == 0 {
-            0
-        }
-        else {
-            1 + self.compute_num_bits(amplitude / 2)
-        }
-    }
-
+    /// Serializes the buffered values to `write`.
+    ///
+    /// Writes the minimum value, then the number of bits per value, then
+    /// every value minus the minimum, bit-packed. Nothing at all is written
+    /// when no value was added.
     pub fn close(&self, write: &mut Write) -> io::Result<()> {
-        try!((self.vals.len() as u32).serialize(write));
         if self.vals.is_empty() {
             return Ok(())
         }
         let min = self.vals.iter().min().unwrap();
         let max = self.vals.iter().max().unwrap();
-        let amplitude: u64 = max - min;
-        let num_bits = self.compute_num_bits(amplitude);
-        for val in self.vals.iter() {
-            try!(val.serialize(write));
-        }
-        Ok(())
+        try!(min.serialize(write));
+        let amplitude: u32 = max - min;
+        let num_bits: u8 = compute_num_bits(amplitude);
+        try!(num_bits.serialize(write));
+        let vals_it = self.vals.iter().map(|i| i-min);
+        serialize_packed_ints(vals_it, num_bits, write)
     }
 }
 
+/// Random-access reader over the data produced by `IntFastFieldWriter::close`.
+pub struct IntFastFieldReader {
+    /// the bytes that follow the 5-byte header, i.e. the packed `u64` words
+    data: ReadOnlySource,
+    /// minimum value, added back to every unpacked delta
+    min_val: u32,
+    /// number of bits used by each packed delta
+    num_bits: u32,
+    /// the `num_bits` lowest bits set
+    mask: u32,
+    /// number of deltas stored in each `u64` word (unused when `num_bits` is 0)
+    num_in_pack: u32,
+}
+
+impl IntFastFieldReader {
+    /// Reads the header of `data` (`u32` minimum, `u8` number of bits) and
+    /// keeps the remaining bytes as the packed words.
+    ///
+    /// Fails if the header cannot be deserialized or if it announces more
+    /// than 32 bits per value.
+    pub fn open(data: &ReadOnlySource) -> io::Result<IntFastFieldReader> {
+        let mut cursor: Cursor<&[u8]> = Cursor::new(&*data);
+        let min_val = try!(u32::deserialize(&mut cursor));
+        let num_bits = try!(u8::deserialize(&mut cursor));
+        if num_bits > 32 {
+            return Err(io::Error::new(io::ErrorKind::InvalidData, "fast field uses more than 32 bits per value"));
+        }
+        // Built on 64 bits: `1u32 << 32` overflows when `num_bits` is 32.
+        let mask = ((1u64 << num_bits) - 1) as u32;
+        // `num_bits` is 0 when all values are equal: nothing is packed then and
+        // `64 / 0` must be avoided. `get` does not use the value in that case.
+        let num_in_pack = if num_bits == 0 { 0u32 } else { 64u32 / (num_bits as u32) };
+        Ok(IntFastFieldReader {
+            min_val: min_val,
+            num_bits: num_bits as u32,
+            // the header takes 4 + 1 bytes
+            data: data.slice(5, data.len()),
+            mask: mask,
+            num_in_pack: num_in_pack,
+        })
+    }
+
+    /// Returns the value of document `doc`.
+    ///
+    /// Panics if the word holding `doc` cannot be read from the data.
+    pub fn get(&self, doc: DocId) -> u32 {
+        if self.num_bits == 0 {
+            // all values are equal to the minimum and no word was written
+            return self.min_val;
+        }
+        let long_addr = doc / self.num_in_pack;
+        let ord_within_long = doc - long_addr * self.num_in_pack;
+        let bit_shift = self.num_bits * ord_within_long;
+        let mut cursor = Cursor::new(&*self.data);
+        cursor.seek(SeekFrom::Start((long_addr as u64) * 8u64)).unwrap();
+        let val_unshifted_unmasked = u64::deserialize(&mut cursor).unwrap();
+        let val_shifted = (val_unshifted_unmasked >> bit_shift) as u32;
+        self.min_val + (val_shifted & self.mask)
+    }
+}
 
 #[cfg(test)]
 mod tests {
 
+    use super::compute_num_bits;
     use super::IntFastFieldWriter;
+    use super::IntFastFieldReader;
+    use core::directory::ReadOnlySource;
+    use test::Bencher;
+    use test;
+    use rand::Rng;
+    use rand::SeedableRng;
+    use rand::XorShiftRng;
 
     #[test]
-    fn test_intfastfieldwriter() {
-        let mut write: Vec<u8> = Vec::new();
-        let mut int_fast_field_writer = IntFastFieldWriter::new();
-        int_fast_field_writer.add(4u64);
-        int_fast_field_writer.add(14u64);
-        int_fast_field_writer.add(2u64);
-        int_fast_field_writer.close(&mut write).unwrap();
-        assert_eq!(write.len(), 8 * 3 + 4);
+    fn test_compute_num_bits() {
+        assert_eq!(compute_num_bits(1), 1u8);
+        assert_eq!(compute_num_bits(0), 0u8);
+        assert_eq!(compute_num_bits(2), 2u8);
+        assert_eq!(compute_num_bits(3), 2u8);
+        assert_eq!(compute_num_bits(4), 3u8);
+        assert_eq!(compute_num_bits(255), 8u8);
+        assert_eq!(compute_num_bits(256), 9u8);
+    }
+
+    #[test]
+    fn test_intfastfield_small() {
+        let mut buffer: Vec<u8> = Vec::new();
+        {
+            let mut int_fast_field_writer = IntFastFieldWriter::new();
+            int_fast_field_writer.add(4u32);
+            int_fast_field_writer.add(14u32);
+            int_fast_field_writer.add(2u32);
+            int_fast_field_writer.close(&mut buffer).unwrap();
+            assert_eq!(buffer.len(), 4 + 1 + 8 as usize);
+        }
+        {
+            let source = ReadOnlySource::Anonymous(buffer);
+            let fast_field_reader = IntFastFieldReader::open(&source).unwrap();
+            assert_eq!(fast_field_reader.get(0), 4u32);
+            assert_eq!(fast_field_reader.get(1), 14u32);
+            assert_eq!(fast_field_reader.get(2), 2u32);
+        }
+    }
+
+
+    #[test]
+    fn test_intfastfield_large() {
+        let mut buffer: Vec<u8> = Vec::new();
+        {
+            let mut int_fast_field_writer = IntFastFieldWriter::new();
+            int_fast_field_writer.add(4u32);
+            int_fast_field_writer.add(14_082_001u32);
+            int_fast_field_writer.add(3_052u32);
+            int_fast_field_writer.close(&mut buffer).unwrap();
+            assert_eq!(buffer.len(), 21 as usize);
+        }
+        {
+            let source = ReadOnlySource::Anonymous(buffer);
+            let fast_field_reader = IntFastFieldReader::open(&source).unwrap();
+            assert_eq!(fast_field_reader.get(0), 4u32);
+            assert_eq!(fast_field_reader.get(1), 14_082_001u32);
+            assert_eq!(fast_field_reader.get(2), 3_052u32);
+        }
+    }
+
+
+    #[test]
+    fn test_intfastfield_constant() {
+        let mut buffer: Vec<u8> = Vec::new();
+        {
+            let mut int_fast_field_writer = IntFastFieldWriter::new();
+            int_fast_field_writer.add(7u32);
+            int_fast_field_writer.add(7u32);
+            int_fast_field_writer.close(&mut buffer).unwrap();
+            // header only: with a null amplitude no word is written
+            assert_eq!(buffer.len(), 4 + 1 as usize);
+        }
+        {
+            let source = ReadOnlySource::Anonymous(buffer);
+            let fast_field_reader = IntFastFieldReader::open(&source).unwrap();
+            assert_eq!(fast_field_reader.get(0), 7u32);
+            assert_eq!(fast_field_reader.get(1), 7u32);
+        }
+    }
+
+
+    #[test]
+    fn test_intfastfield_full_range() {
+        let mut buffer: Vec<u8> = Vec::new();
+        {
+            let mut int_fast_field_writer = IntFastFieldWriter::new();
+            int_fast_field_writer.add(0u32);
+            int_fast_field_writer.add(4_294_967_295u32);
+            int_fast_field_writer.close(&mut buffer).unwrap();
+            // 32 bits per value: both values share a single word
+            assert_eq!(buffer.len(), 4 + 1 + 8 as usize);
+        }
+        {
+            let source = ReadOnlySource::Anonymous(buffer);
+            let fast_field_reader = IntFastFieldReader::open(&source).unwrap();
+            assert_eq!(fast_field_reader.get(0), 0u32);
+            assert_eq!(fast_field_reader.get(1), 4_294_967_295u32);
+        }
+    }
+
+
+    fn generate_permutation() -> Vec<u32> {
+        let seed: &[u32; 4] = &[1, 2, 3, 4];
+        let mut rng = XorShiftRng::from_seed(*seed);
+        let mut permutation: Vec<u32> = (0u32..1_000_000u32).collect();
+        rng.shuffle(&mut permutation);
+        permutation
+    }
+
+    #[test]
+    fn test_intfastfield_permutation() {
+        let mut buffer: Vec<u8> = Vec::new();
+        let permutation = generate_permutation();
+        {
+            let mut int_fast_field_writer = IntFastFieldWriter::new();
+            for x in permutation.iter() {
+                int_fast_field_writer.add(*x);
+            }
+            int_fast_field_writer.close(&mut buffer).unwrap();
+        }
+        let source = ReadOnlySource::Anonymous(buffer);
+        let int_fast_field_reader = IntFastFieldReader::open(&source).unwrap();
+
+        let n = test::black_box(100);
+        let mut a = 0u32;
+        for _ in 0..n {
+            assert_eq!(int_fast_field_reader.get(a as u32), permutation[a as usize]);
+            a = int_fast_field_reader.get(a as u32);
+        }
+    }
+
+
+    #[bench]
+    fn bench_intfastfield_veclookup(b: &mut Bencher) {
+        let permutation = generate_permutation();
+        b.iter(|| {
+            let n = test::black_box(100);
+            let mut a = 0u32;
+            for _ in 0..n {
+                a = permutation[a as usize];
+            }
+        });
+    }
+
+    #[bench]
+    fn bench_intfastfield_fflookup(b: &mut Bencher) {
+        let mut buffer: Vec<u8> = Vec::new();
+        {
+            let permutation = generate_permutation();
+            let mut int_fast_field_writer = IntFastFieldWriter::new();
+            for x in permutation.iter() {
+                int_fast_field_writer.add(*x);
+            }
+            int_fast_field_writer.close(&mut buffer).unwrap();
+        }
+        let source = ReadOnlySource::Anonymous(buffer);
+        let int_fast_field_reader = IntFastFieldReader::open(&source).unwrap();
+        b.iter(|| {
+            let n = test::black_box(100);
+            let mut a = 0u32;
+            for _ in 0..n {
+                a = int_fast_field_reader.get(a as u32);
+            }
+        });
     }
 }