This commit is contained in:
Paul Masurel
2016-05-01 15:23:35 +09:00
55 changed files with 2809 additions and 2430 deletions

View File

@@ -18,7 +18,6 @@ log = "0.3.5"
combine = "1.2.0"
tempdir = "0.3.4"
bincode = "0.4.0"
serde = "0.6.11"
libc = "0.2.6"
argparse = "*"
num_cpus = "0.2"

View File

@@ -4,20 +4,73 @@
#include "codecfactory.h"
#include "intersection.h"
#include "variablebyte.h"
#include "util.h"
using namespace SIMDCompressionLib;
// sorted
static shared_ptr<IntegerCODEC> codec_sorted = CODECFactory::getFromName("s4-bp128-dm");
// variable byte
static VariableByte<false> codec_unsorted = VariableByte<false>();
static SIMDBinaryPacking<SIMDIntegratedBlockPacker<Max4DeltaSIMD, true>> codec_packed_sorted = SIMDBinaryPacking<SIMDIntegratedBlockPacker<Max4DeltaSIMD, true>>();
static SIMDBinaryPacking<SIMDIntegratedBlockPacker<Max4DeltaSIMD, true>> simd_pack_sorted = SIMDBinaryPacking<SIMDIntegratedBlockPacker<Max4DeltaSIMD, true>>();
static VariableByte<true> vint_codec = VariableByte<true>();
// SIMDBinaryPacking<SIMDBlockPacker<RegularDeltaSIMD, true>
extern "C" {
size_t encode_sorted_native(
// Encodes exactly 128 sorted u32 values at a time with the SIMD
// binary-packing codec `simd_pack_sorted` (delta-integrated packer).
// `begin` is non-const because the codec's signature takes a mutable
// pointer — presumably it may scratch over its input; TODO confirm.
// Returns the number of u32 words written into `output`.
size_t encode_sorted_block128_native(
    uint32_t* begin,
    uint32_t* output,
    const size_t output_capacity) {
    // encodeArray uses the length parameter in/out: capacity on entry,
    // number of words actually written on return.
    size_t output_length = output_capacity;
    simd_pack_sorted.encodeArray(begin,
        128,  // block size is fixed at 128 values
        output,
        output_length);
    return output_length;
}
// Decodes a block previously produced by encode_sorted_block128_native.
// Returns the number of u32 values written into `uncompressed`
// (128 for a full block).
size_t decode_sorted_block128_native(
    const uint32_t* compressed_data,
    const size_t compressed_size,
    uint32_t* uncompressed,
    const size_t uncompressed_capacity) {
    // decodeArray uses the length parameter in/out: capacity on entry,
    // number of values decoded on return.
    size_t num_ints = uncompressed_capacity;
    simd_pack_sorted.decodeArray(compressed_data, compressed_size, uncompressed, num_ints);
    return num_ints;
}
// Encodes `num_els` sorted u32 values with the variable-byte codec
// `vint_codec` (VariableByte<true> — the template flag presumably enables
// delta coding for sorted input; TODO confirm against the library).
// Returns the number of u32 words written into `output`.
size_t encode_sorted_vint_native(
    uint32_t* begin,
    const size_t num_els,
    uint32_t* output,
    const size_t output_capacity) {
    // encodeArray uses the length parameter in/out: capacity on entry,
    // number of words written on return.
    size_t output_length = output_capacity;
    vint_codec.encodeArray(begin,
        num_els,
        output,
        output_length);
    return output_length;
}
// Decodes data produced by encode_sorted_vint_native.
// Returns the number of u32 values written into `uncompressed`.
size_t decode_sorted_vint_native(
    const uint32_t* compressed_data,
    const size_t compressed_size,
    uint32_t* uncompressed,
    const size_t uncompressed_capacity) {
    // decodeArray uses the length parameter in/out: capacity on entry,
    // number of values decoded on return.
    size_t num_ints = uncompressed_capacity;
    vint_codec.decodeArray(compressed_data, compressed_size, uncompressed, num_ints);
    return num_ints;
}
size_t encode_s4_bp128_dm_native(
uint32_t* begin,
const size_t num_els,
uint32_t* output,
@@ -30,6 +83,17 @@ extern "C" {
return output_length;
}
// Decodes data produced with the "s4-bp128-dm" codec (see `codec_sorted`
// obtained from CODECFactory above).
// Returns the number of u32 values written into `uncompressed`.
size_t decode_s4_bp128_dm_native(
    const uint32_t* compressed_data,
    const size_t compressed_size,
    uint32_t* uncompressed,
    const size_t uncompressed_capacity) {
    // decodeArray uses the length parameter in/out: capacity on entry,
    // number of values decoded on return.
    size_t num_ints = uncompressed_capacity;
    codec_sorted -> decodeArray(compressed_data, compressed_size, uncompressed, num_ints);
    return num_ints;
}
size_t encode_unsorted_native(
uint32_t* begin,
const size_t num_els,
@@ -43,15 +107,7 @@ extern "C" {
return output_length;
}
size_t decode_sorted_native(
const uint32_t* compressed_data,
const size_t compressed_size,
uint32_t* uncompressed,
const size_t uncompressed_capacity) {
size_t num_ints = uncompressed_capacity;
codec_sorted -> decodeArray(compressed_data, compressed_size, uncompressed, num_ints);
return num_ints;
}
size_t decode_unsorted_native(
const uint32_t* compressed_data,

View File

@@ -0,0 +1,32 @@
use std::io;
use super::Collector;
use DocId;
use SegmentReader;
use SegmentLocalId;
/// A collector that does nothing with the documents except count them.
pub struct CountCollector {
    // Number of documents collected so far, across all segments.
    count: usize,
}

impl CountCollector {
    /// Builds a counter starting at zero.
    pub fn new() -> CountCollector {
        CountCollector { count: 0 }
    }

    /// Returns the number of documents collected so far.
    pub fn count(&self,) -> usize {
        self.count
    }
}
impl Collector for CountCollector {
    /// Every collected document bumps the counter by one.
    fn collect(&mut self, _: DocId) {
        self.count += 1;
    }

    /// Counting requires no per-segment preparation.
    fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> io::Result<()> {
        Ok(())
    }
}

View File

@@ -0,0 +1,40 @@
use std::io;
use super::Collector;
use DocId;
use SegmentReader;
use SegmentLocalId;
use core::searcher::DocAddress;
/// Collector retaining the addresses of the first `limit` documents collected.
pub struct FirstNCollector {
    // Addresses (segment ordinal + segment-local doc id) kept so far.
    docs: Vec<DocAddress>,
    // Ordinal of the segment currently being collected.
    current_segment: u32,
    // Maximum number of documents to keep.
    limit: usize,
}
impl FirstNCollector {
    /// Builds a collector keeping at most `limit` document addresses.
    pub fn with_limit(limit: usize) -> FirstNCollector {
        FirstNCollector {
            current_segment: 0,
            limit: limit,
            docs: Vec::new(),
        }
    }

    /// Consumes the collector and returns the collected addresses.
    pub fn docs(self,) -> Vec<DocAddress> {
        self.docs
    }
}
impl Collector for FirstNCollector {
    /// Remembers the segment ordinal so collected doc ids can be turned
    /// into absolute `DocAddress`es.
    fn set_segment(&mut self, segment_local_id: SegmentLocalId, _: &SegmentReader) -> io::Result<()> {
        self.current_segment = segment_local_id;
        Ok(())
    }

    /// Stores the document address; documents past `limit` are ignored.
    fn collect(&mut self, doc_id: DocId) {
        if self.docs.len() < self.limit {
            // `current_segment` is a `Copy` u32 — the former `.clone()`
            // was redundant (clippy: clone_on_copy).
            self.docs.push(DocAddress(self.current_segment, doc_id));
        }
    }
}

109
src/collector/mod.rs Normal file
View File

@@ -0,0 +1,109 @@
use DocId;
use SegmentReader;
use SegmentLocalId;
use fastfield::U32FastFieldReader;
use schema::U32Field;
use std::io;
mod count_collector;
pub use self::count_collector::CountCollector;
mod first_n_collector;
pub use self::first_n_collector::FirstNCollector;
mod multi_collector;
pub use self::multi_collector::MultiCollector;
/// A `Collector` is fed the ids of the documents matching a query,
/// one segment at a time.
pub trait Collector {
    /// Called before documents of a new segment are collected.
    /// `segment_local_id` is the ordinal of the segment within the searcher.
    fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()>;
    /// Called once per matching document. `doc_id` is local to the segment
    /// declared by the most recent `set_segment` call.
    fn collect(&mut self, doc_id: DocId);
}
/// Test-only collector recording doc ids remapped to index-wide ids.
pub struct TestCollector {
    // Sum of max_doc over all previously visited segments; added to each
    // segment-local doc id to produce an index-wide id.
    offset: DocId,
    // max_doc of the segment currently being visited.
    segment_max_doc: DocId,
    // Remapped doc ids collected so far.
    docs: Vec<DocId>,
}
impl TestCollector {
    /// Creates an empty `TestCollector`.
    pub fn new() -> TestCollector {
        TestCollector {
            offset: 0,
            segment_max_doc: 0,
            docs: Vec::new(),
        }
    }

    /// Consumes the collector and returns the collected index-wide doc ids.
    pub fn docs(self,) -> Vec<DocId> {
        self.docs
    }
}
impl Collector for TestCollector {
    /// Adds the *previous* segment's max_doc to `offset` before storing the
    /// new segment's max_doc — the order matters: doc ids of the new segment
    /// must be shifted past all previously seen segments only.
    fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> io::Result<()> {
        self.offset += self.segment_max_doc;
        self.segment_max_doc = reader.max_doc();
        Ok(())
    }
    /// Records the doc id, remapped to an index-wide id.
    fn collect(&mut self, doc_id: DocId) {
        self.docs.push(doc_id + self.offset);
    }
}
/// Test-only collector recording, for each collected document, the value of
/// a given u32 fast field.
pub struct FastFieldTestCollector {
    // Fast-field values recorded so far.
    vals: Vec<u32>,
    // Field whose fast-field reader is fetched on each `set_segment`.
    u32_field: U32Field,
    // `None` until `set_segment` has been called at least once.
    ff_reader: Option<U32FastFieldReader>,
}
impl FastFieldTestCollector {
    /// Builds a collector recording the values of the fast field `u32_field`.
    pub fn for_field(u32_field: U32Field) -> FastFieldTestCollector {
        FastFieldTestCollector {
            ff_reader: None,
            u32_field: u32_field,
            vals: Vec::new(),
        }
    }

    /// Values recorded so far.
    pub fn vals(&self,) -> &Vec<u32> {
        &self.vals
    }
}
impl Collector for FastFieldTestCollector {
    /// Fetches this segment's fast-field reader; fails if the field is not
    /// a fast field of the segment.
    fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> io::Result<()> {
        self.ff_reader = Some(try!(reader.get_fast_field_reader(&self.u32_field)));
        Ok(())
    }
    fn collect(&mut self, doc_id: DocId) {
        // `unwrap()` relies on the Collector contract: `set_segment` is
        // always called before `collect`.
        let val = self.ff_reader.as_ref().unwrap().get(doc_id);
        self.vals.push(val);
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use test::Bencher;
    // Benchmarks the raw cost of CountCollector::collect over 1M doc ids
    // (includes building the 1M-element Vec inside the timed closure).
    #[bench]
    fn build_collector(b: &mut Bencher) {
        b.iter(|| {
            let mut count_collector = CountCollector::new();
            let docs: Vec<u32> = (0..1_000_000).collect();
            for doc in docs {
                count_collector.collect(doc);
            }
            count_collector.count()
        });
    }
}

View File

@@ -0,0 +1,33 @@
use std::io;
use super::Collector;
use DocId;
use SegmentReader;
use SegmentLocalId;
/// Multiplexes collection events to several wrapped collectors.
pub struct MultiCollector<'a> {
    // Each wrapped collector receives every event, in order.
    collectors: Vec<&'a mut Collector>,
}
impl<'a> MultiCollector<'a> {
    /// Wraps a set of collectors into a single `Collector`.
    pub fn from(collectors: Vec<&'a mut Collector>) -> MultiCollector {
        MultiCollector { collectors: collectors }
    }
}
impl<'a> Collector for MultiCollector<'a> {
    /// Forwards the document to every wrapped collector, in order.
    fn collect(&mut self, doc_id: DocId) {
        for collector in self.collectors.iter_mut() {
            collector.collect(doc_id);
        }
    }

    /// Forwards the segment change to every wrapped collector, failing fast
    /// on the first error.
    fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()> {
        for collector in self.collectors.iter_mut() {
            try!(collector.set_segment(segment_local_id, segment));
        }
        Ok(())
    }
}

9
src/common/mod.rs Normal file
View File

@@ -0,0 +1,9 @@
mod serialize;
mod timer;
mod vint;
pub use self::serialize::BinarySerializable;
pub use self::timer::Timing;
pub use self::timer::TimerTree;
pub use self::timer::OpenTimer;
pub use self::vint::VInt;

View File

@@ -4,9 +4,13 @@ use std::fmt;
use std::io::Write;
use std::io::Read;
use std::io;
use common::VInt;
use byteorder;
pub trait BinarySerializable : fmt::Debug + Sized {
fn serialize(&self, writer: &mut Write) -> io::Result<usize>;
fn deserialize(reader: &mut Read) -> io::Result<Self>;
}
fn convert_byte_order_error(byteorder_error: byteorder::Error) -> io::Error {
match byteorder_error {
@@ -15,11 +19,6 @@ fn convert_byte_order_error(byteorder_error: byteorder::Error) -> io::Error {
}
}
pub trait BinarySerializable : fmt::Debug + Sized {
fn serialize(&self, writer: &mut Write) -> io::Result<usize>;
fn deserialize(reader: &mut Read) -> io::Result<Self>;
}
impl BinarySerializable for () {
fn serialize(&self, _: &mut Write) -> io::Result<usize> {
Ok(0)
@@ -31,14 +30,14 @@ impl BinarySerializable for () {
impl<T: BinarySerializable> BinarySerializable for Vec<T> {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
let mut total_size = try!((self.len() as u32).serialize(writer));
let mut total_size = try!(VInt(self.len() as u64).serialize(writer));
for it in self.iter() {
total_size += try!(it.serialize(writer));
}
Ok(total_size)
}
fn deserialize(reader: &mut Read) -> io::Result<Vec<T>> {
let num_items = try!(u32::deserialize(reader));
let num_items = try!(VInt::deserialize(reader)).val();
let mut items: Vec<T> = Vec::with_capacity(num_items as usize);
for _ in 0..num_items {
let item = try!(T::deserialize(reader));
@@ -99,17 +98,15 @@ impl BinarySerializable for u8 {
impl BinarySerializable for String {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
// TODO error
let data: &[u8] = self.as_bytes();
let mut size = try!((data.len() as u32).serialize(writer));
let mut size = try!(VInt(data.len() as u64).serialize(writer));
size += data.len();
try!(writer.write_all(data));
Ok(size)
}
fn deserialize(reader: &mut Read) -> io::Result<String> {
// TODO error
let string_length = try!(u32::deserialize(reader)) as usize;
let string_length = try!(VInt::deserialize(reader)).val() as usize;
let mut result = String::with_capacity(string_length);
try!(reader.take(string_length as u64).read_to_string(&mut result));
Ok(result)
@@ -120,84 +117,55 @@ impl BinarySerializable for String {
#[cfg(test)]
mod test {
use core::serialize::BinarySerializable;
use std::io::Cursor;
use common::VInt;
use super::*;
fn serialize_test<T: BinarySerializable + Eq>(v: T, num_bytes: usize) {
let mut buffer: Vec<u8> = Vec::new();
assert_eq!(v.serialize(&mut buffer).unwrap(), num_bytes);
assert_eq!(buffer.len(), num_bytes);
let mut cursor = Cursor::new(&buffer[..]);
let deser = T::deserialize(&mut cursor).unwrap();
assert_eq!(deser, v);
}
#[test]
fn test_serialize_u8() {
let mut buffer: Vec<u8> = Vec::new();
{
let x: u8 = 3;
x.serialize(&mut buffer).unwrap();
assert_eq!(buffer.len(), 1);
}
{
let x: u8 = 5;
x.serialize(&mut buffer).unwrap();
assert_eq!(buffer.len(), 2);
}
let mut cursor = Cursor::new(&buffer[..]);
assert_eq!(3, u8::deserialize(&mut cursor).unwrap());
assert_eq!(5, u8::deserialize(&mut cursor).unwrap());
assert!(u8::deserialize(&mut cursor).is_err());
serialize_test(3u8, 1);
serialize_test(5u8, 1);
}
#[test]
fn test_serialize_u32() {
let mut buffer: Vec<u8> = Vec::new();
{
let x: u32 = 3;
x.serialize(&mut buffer).unwrap();
assert_eq!(buffer.len(), 4);
}
{
let x: u32 = 5;
x.serialize(&mut buffer).unwrap();
assert_eq!(buffer.len(), 8);
}
let mut cursor = Cursor::new(&buffer[..]);
assert_eq!(3, u32::deserialize(&mut cursor).unwrap());
assert_eq!(5, u32::deserialize(&mut cursor).unwrap());
assert!(u32::deserialize(&mut cursor).is_err());
serialize_test(3u32, 4);
serialize_test(5u32, 4);
serialize_test(u32::max_value(), 4);
}
#[test]
fn test_serialize_string() {
let mut buffer: Vec<u8> = Vec::new();
let first_length = 4 + 3 * 4;
let second_length = 4 + 3 * 8;
{
let x: String = String::from("ぽよぽよ");
assert_eq!(x.serialize(&mut buffer).unwrap(), first_length);
assert_eq!(buffer.len(), first_length);
}
{
let x: String = String::from("富士さん見える。");
assert_eq!(x.serialize(&mut buffer).unwrap(), second_length);
assert_eq!(buffer.len(), first_length + second_length);
}
let mut cursor = Cursor::new(&buffer[..]);
assert_eq!("ぽよぽよ", String::deserialize(&mut cursor).unwrap());
assert_eq!("富士さん見える。", String::deserialize(&mut cursor).unwrap());
assert!(u32::deserialize(&mut cursor).is_err());
serialize_test(String::from(""), 1);
serialize_test(String::from("ぽよぽよ"), 1 + 3*4);
serialize_test(String::from("富士さん見える。"), 1 + 3*8);
}
#[test]
fn test_serialize_vec() {
let mut buffer: Vec<u8> = Vec::new();
let first_length = 4 + 3 * 4;
let second_length = 4 + 3 * 8;
let vec = vec!(String::from("ぽよぽよ"), String::from("富士さん見える。"));
assert_eq!(vec.serialize(&mut buffer).unwrap(), first_length + second_length + 4);
let mut cursor = Cursor::new(&buffer[..]);
{
let deser: Vec<String> = Vec::deserialize(&mut cursor).unwrap();
assert_eq!(deser.len(), 2);
assert_eq!("ぽよぽよ", deser[0]);
assert_eq!("富士さん見える。", deser[1]);
}
let v: Vec<u8> = Vec::new();
serialize_test(v, 1);
serialize_test(vec!(1u32, 3u32), 1 + 4*2);
}
#[test]
fn test_serialize_vint() {
serialize_test(VInt(7u64), 1);
serialize_test(VInt(127u64), 1);
serialize_test(VInt(128u64), 2);
serialize_test(VInt(1234u64), 2);
serialize_test(VInt(16_383), 2);
serialize_test(VInt(16_384), 3);
serialize_test(VInt(u64::max_value()), 10);
}
}

84
src/common/timer.rs Normal file
View File

@@ -0,0 +1,84 @@
use time::PreciseTime;
/// RAII handle for a running timer: the elapsed time is recorded into the
/// owning `TimerTree` when the handle is dropped.
pub struct OpenTimer<'a> {
    // Label under which the timing will be recorded.
    name: &'static str,
    // Tree receiving the timing on drop.
    timer_tree: &'a mut TimerTree,
    // Instant at which this timer was opened.
    start: PreciseTime,
    // Nesting depth (0 for timers opened directly on the tree).
    depth: u32,
}
impl<'a> OpenTimer<'a> {
    /// Opens a child timer nested one level deeper. The child reborrows the
    /// same `TimerTree` mutably, so it must be dropped before `self` can be
    /// used again.
    pub fn open(&mut self, name: &'static str) -> OpenTimer {
        OpenTimer {
            name: name,
            timer_tree: self.timer_tree,
            start: PreciseTime::now(),
            depth: self.depth + 1,
        }
    }
}
impl<'a> Drop for OpenTimer<'a> {
    /// Records the elapsed time into the tree on scope exit. Because
    /// children drop before their parent, child timings are pushed first.
    fn drop(&mut self,) {
        self.timer_tree.timings.push(Timing {
            name: self.name,
            // Microsecond precision; `unwrap()` is safe unless the duration
            // overflows i64 microseconds (~292k years).
            duration: self.start.to(PreciseTime::now()).num_microseconds().unwrap(),
            depth: self.depth,
        });
    }
}
/// A single recorded timing: label, duration in microseconds, and nesting
/// depth within the tree.
#[derive(Debug)]
pub struct Timing {
    name: &'static str,
    duration: i64,
    depth: u32,
}
/// Flat list of timings produced by (possibly nested) `OpenTimer`s.
#[derive(Debug)]
pub struct TimerTree {
    timings: Vec<Timing>,
}
impl TimerTree {
    /// Creates an empty timer tree.
    pub fn new() -> TimerTree {
        TimerTree { timings: Vec::new() }
    }

    /// Starts a root timer (depth 0); its duration is recorded into the
    /// tree when the returned handle is dropped.
    pub fn open(&mut self, name: &'static str) -> OpenTimer {
        OpenTimer {
            depth: 0,
            name: name,
            timer_tree: self,
            start: PreciseTime::now(),
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    // Opens nested timers a > b > (c, d) and checks that all four scopes
    // end up recorded in the tree (children are recorded on drop).
    #[test]
    fn test_timer() {
        let mut timer_tree = TimerTree::new();
        {
            let mut a = timer_tree.open("a");
            {
                let mut ab = a.open("b");
                {
                    let _abc = ab.open("c");
                }
                {
                    let _abd = ab.open("d");
                }
            }
        }
        assert_eq!(timer_tree.timings.len(), 4);
    }
}

58
src/common/vint.rs Normal file
View File

@@ -0,0 +1,58 @@
use super::BinarySerializable;
use std::io;
use std::io::Write;
use std::io::Read;
/// Variable-length integer: a `u64` serialized in 1–10 bytes, 7 bits per
/// byte, with the high bit of each byte flagging a continuation.
#[derive(Debug, Eq, PartialEq)]
pub struct VInt(pub u64);

impl VInt {
    /// Returns the wrapped value.
    pub fn val(&self,) -> u64 {
        // `u64` is `Copy`: a plain copy replaces the former `.clone()`
        // (clippy: clone_on_copy).
        self.0
    }
}
impl BinarySerializable for VInt {
    /// Writes the value as a base-128 varint: little-endian groups of 7
    /// bits, continuation bit `0x80` set on all bytes but the last.
    /// Returns the number of bytes written (1–10).
    fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
        // `u64` is `Copy`; the former `.clone()` was redundant.
        let mut remaining = self.0;
        let mut written: usize = 0;
        // 10 bytes suffice for u64::max_value() (ceil(64 / 7)).
        let mut buffer = [0u8; 10];
        loop {
            let mut next_byte: u8 = (remaining % 128u64) as u8;
            remaining /= 128u64;
            if remaining == 0u64 {
                buffer[written] = next_byte;
                written += 1;
                break;
            }
            else {
                next_byte |= 128u8; // more bytes follow
                buffer[written] = next_byte;
                written += 1;
            }
        }
        try!(writer.write_all(&buffer[0..written]));
        Ok(written)
    }

    /// Reads a base-128 varint. Fails with `InvalidData` if the stream ends
    /// mid-value, or if the encoding carries more bits than a u64 can hold
    /// (previously such malformed input made `<< shift` overflow, which
    /// panics in debug builds).
    fn deserialize(reader: &mut Read) -> io::Result<Self> {
        let mut bytes = reader.bytes();
        let mut result = 0u64;
        let mut shift = 0u64;
        loop {
            match bytes.next() {
                Some(Ok(b)) => {
                    // An 11th continuation byte would require shift = 70,
                    // past the end of a u64: reject instead of overflowing.
                    if shift > 63 {
                        return Err(io::Error::new(io::ErrorKind::InvalidData,
                                                  "Overflowing varint"));
                    }
                    result += ((b % 128u8) as u64) << shift;
                    if b & 128 == 0u8 {
                        break;
                    }
                    shift += 7;
                }
                _ => {
                    return Err(io::Error::new(io::ErrorKind::InvalidData, "Reach end of buffer"))
                }
            }
        }
        Ok(VInt(result))
    }
}

View File

@@ -0,0 +1,87 @@
use libc::size_t;
use std::ptr;
use std::iter;
extern {
fn encode_sorted_block128_native(data: *mut u32, output: *mut u32, output_capacity: size_t) -> size_t;
fn decode_sorted_block128_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t;
}
//-------------------------
// Block128
/// Encoder for fixed blocks of exactly 128 sorted u32 values, backed by the
/// native SIMD binary-packing codec.
pub struct Block128Encoder {
    // Scratch copy of the caller's input. The native signature takes a
    // mutable pointer — presumably the codec may scratch over it; confirm.
    input_buffer: Vec<u32>,
    // Output scratch; 256 words is ample for 128 encoded values.
    output_buffer: Vec<u32>,
}
impl Block128Encoder {
    /// Creates an encoder with preallocated, fully initialized scratch
    /// buffers.
    pub fn new() -> Block128Encoder {
        Block128Encoder {
            // Fully initialized (not merely `with_capacity`) so that the
            // raw-pointer copy in `encode_sorted` never writes into
            // uninitialized Vec spare capacity while `len` is still 0.
            input_buffer: iter::repeat(0u32).take(128).collect(),
            output_buffer: iter::repeat(0u32).take(256).collect(),
        }
    }

    /// Compresses exactly 128 sorted u32 values; returns the slice of
    /// encoded words inside the internal output buffer.
    ///
    /// # Panics
    /// Panics if `input.len() != 128`.
    pub fn encode_sorted(&mut self, input: &[u32]) -> &[u32] {
        assert_eq!(input.len(), 128);
        // TODO use clone_from when available
        let written_size: usize;
        unsafe {
            // SAFETY: input_buffer holds 128 initialized u32s and
            // output_buffer holds 256, matching the lengths passed below.
            ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), 128);
            written_size = encode_sorted_block128_native(
                self.input_buffer.as_mut_ptr(),
                self.output_buffer.as_mut_ptr(),
                256,
            );
        }
        return &self.output_buffer[0..written_size];
    }
}
/// Decoder counterpart of `Block128Encoder`.
pub struct Block128Decoder;
impl Block128Decoder {
    /// Creates a (stateless) decoder.
    pub fn new() -> Block128Decoder {
        Block128Decoder
    }
    /// Decodes a compressed block into `uncompressed_values`, returning the
    /// number of u32 values decoded (128 for a full block).
    pub fn decode_sorted(
        &self,
        compressed_data: &[u32],
        uncompressed_values: &mut [u32]) -> size_t {
        unsafe {
            // The output slice length is passed as the capacity the native
            // decoder may write into.
            return decode_sorted_block128_native(
                compressed_data.as_ptr(),
                compressed_data.len() as size_t,
                uncompressed_values.as_mut_ptr(),
                uncompressed_values.len() as size_t);
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::iter;
    // Round-trips one 128-value sorted block through the native codec and
    // pins the compressed size for this particular input.
    #[test]
    fn test_encode_block() {
        let mut encoder = Block128Encoder::new();
        let expected_length = 21;
        let input: Vec<u32> = (0u32..128u32)
            .map(|i| i * 7 / 2)
            .into_iter()
            .collect();
        let encoded_data = encoder.encode_sorted(&input);
        assert_eq!(encoded_data.len(), expected_length);
        let decoder = Block128Decoder::new();
        let mut decoded_data: Vec<u32> = iter::repeat(0u32).take(128).collect();
        assert_eq!(128, decoder.decode_sorted(&encoded_data[..], &mut decoded_data));
        assert_eq!(decoded_data, input);
    }
}

View File

@@ -0,0 +1,14 @@
use libc::size_t;
extern {
fn intersection_native(left_data: *const u32, left_size: size_t, right_data: *const u32, right_size: size_t, output: *mut u32) -> size_t;
}
/// Computes the intersection of two u32 slices via the native SIMD
/// implementation, writing the common elements into `output` and returning
/// how many were written.
/// NOTE(review): the native side is not told `output`'s length — it looks
/// like `output` must be at least as large as the smaller input; confirm
/// against the C++ `intersection_native` implementation.
pub fn intersection(left: &[u32], right: &[u32], output: &mut [u32]) -> usize {
    unsafe {
        intersection_native(
            left.as_ptr(), left.len(),
            right.as_ptr(), right.len(),
            output.as_mut_ptr())
    }
}

34
src/compression/mod.rs Normal file
View File

@@ -0,0 +1,34 @@
mod intersection;
pub use self::intersection::intersection;
mod s4bp128;
pub use self::s4bp128::{S4BP128Encoder, S4BP128Decoder};
mod block128;
pub use self::block128::{Block128Encoder, Block128Decoder};
mod vints;
pub use self::vints::{SortedVIntsEncoder, SortedVIntsDecoder};
#[cfg(test)]
pub mod tests {
    use rand::Rng;
    use rand::SeedableRng;
    use rand::XorShiftRng;
    // Deterministically draws `n` strictly increasing u32s: every candidate
    // in 0..u32::MAX is kept with probability `ratio`, so the result is a
    // sorted pseudo-random doc-id list with the requested density.
    fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec<u32> {
        let seed: &[u32; 4] = &[1, 2, 3, seed_val];
        let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
        (0..u32::max_value())
            .filter(|_| rng.next_f32()< ratio)
            .take(n)
            .collect()
    }
    // Fixed-seed convenience wrapper used by the compression benchmarks.
    pub fn generate_array(n: usize, ratio: f32) -> Vec<u32> {
        generate_array_with_seed(n, ratio, 4)
    }
}

122
src/compression/s4bp128.rs Normal file
View File

@@ -0,0 +1,122 @@
use libc::size_t;
use std::ptr;
extern {
// complete s4-bp128-dm
fn encode_s4_bp128_dm_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t;
fn decode_s4_bp128_dm_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t;
}
//-------------------------
// s4-bp128-dm
/// Encoder for sorted u32 sequences using the native "s4-bp128-dm" codec.
pub struct S4BP128Encoder {
    // Scratch copy of the caller's input (the native signature takes a
    // mutable pointer); grown on demand.
    input_buffer: Vec<u32>,
    // Output scratch, grown alongside `input_buffer`.
    output_buffer: Vec<u32>,
}
impl S4BP128Encoder {
    /// Creates an encoder; scratch buffers are grown lazily on first use.
    pub fn new() -> S4BP128Encoder {
        S4BP128Encoder {
            input_buffer: Vec::new(),
            output_buffer: Vec::new(),
        }
    }

    /// Compresses a sorted u32 sequence, returning the slice of encoded
    /// words inside the internal output buffer.
    pub fn encode_sorted(&mut self, input: &[u32]) -> &[u32] {
        let input_len = input.len();
        // Grow (zero-filling) only when the scratch buffers are too small.
        // The previous version called `input_buffer.clear()` first, which
        // zeroed its length and made the size check (`input_len + 10000 >=
        // len`) trivially true, forcing a full zero-filling resize on every
        // call; it also tested against +10000 while only resizing to +1024.
        // The +1024 headroom covers the codec's 128-value block alignment.
        if self.input_buffer.len() < input_len + 1024 {
            let target_length = input_len + 1024;
            self.input_buffer.resize(target_length, 0);
            self.output_buffer.resize(target_length, 0);
        }
        // TODO use clone_from when available
        unsafe {
            // SAFETY: input_buffer holds >= input_len initialized u32s and
            // output_buffer's length is passed as the output capacity.
            ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len);
            let written_size = encode_s4_bp128_dm_native(
                self.input_buffer.as_mut_ptr(),
                input_len as size_t,
                self.output_buffer.as_mut_ptr(),
                self.output_buffer.len() as size_t,
            );
            return &self.output_buffer[0..written_size];
        }
    }
}
/// Decoder counterpart of `S4BP128Encoder`.
pub struct S4BP128Decoder;
impl S4BP128Decoder {
    /// Creates a (stateless) decoder.
    pub fn new() -> S4BP128Decoder {
        S4BP128Decoder
    }
    /// Decodes s4-bp128-dm data into `uncompressed_values`, returning the
    /// number of u32 values decoded.
    pub fn decode_sorted(&self,
        compressed_data: &[u32],
        uncompressed_values: &mut [u32]) -> size_t {
        unsafe {
            // The output slice length is passed as the capacity the native
            // decoder may write into.
            return decode_s4_bp128_dm_native(
                compressed_data.as_ptr(),
                compressed_data.len() as size_t,
                uncompressed_values.as_mut_ptr(),
                uncompressed_values.len() as size_t);
        }
    }
    // pub fn decode_unsorted(&self,
    //     compressed_data: &[u32],
    //     uncompressed_values: &mut [u32]) -> size_t {
    //     unsafe {
    //         return decode_unsorted_native(
    //             compressed_data.as_ptr(),
    //             compressed_data.len() as size_t,
    //             uncompressed_values.as_mut_ptr(),
    //             uncompressed_values.len() as size_t);
    //     }
    // }
}
#[cfg(test)]
mod tests {
    use super::*;
    use test::Bencher;
    use compression::tests::generate_array;
    // Round-trips 10k sorted values and pins the compressed size for this
    // particular (linear) input.
    #[test]
    fn test_encode_big() {
        let mut encoder = S4BP128Encoder::new();
        let num_ints = 10000 as usize;
        let expected_length = 1274;
        let input: Vec<u32> = (0..num_ints as u32)
            .map(|i| i * 7 / 2)
            .into_iter().collect();
        let encoded_data = encoder.encode_sorted(&input);
        assert_eq!(encoded_data.len(), expected_length);
        let decoder = S4BP128Decoder::new();
        let mut decoded_data: Vec<u32> = (0..num_ints as u32).collect();
        assert_eq!(num_ints, decoder.decode_sorted(&encoded_data[..], &mut decoded_data));
        assert_eq!(decoded_data, input);
    }
    // Measures raw decode throughput over a 1M-element pseudo-random array.
    #[bench]
    fn bench_decode(b: &mut Bencher) {
        const TEST_SIZE: usize = 1_000_000;
        let arr = generate_array(TEST_SIZE, 0.1);
        let mut encoder = S4BP128Encoder::new();
        let encoded = encoder.encode_sorted(&arr);
        let mut uncompressed: Vec<u32> = (0..TEST_SIZE as u32).collect();
        let decoder = S4BP128Decoder;
        b.iter(|| {
            decoder.decode_sorted(&encoded, &mut uncompressed);
        });
    }
}

97
src/compression/vints.rs Normal file
View File

@@ -0,0 +1,97 @@
use libc::size_t;
use std::ptr;
use std::iter;
extern {
fn encode_sorted_vint_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t;
fn decode_sorted_vint_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t;
}
/// Encoder for short (< 128 values) sorted u32 sequences, backed by the
/// native variable-byte codec.
pub struct SortedVIntsEncoder {
    // Scratch copy of the caller's input (the native signature takes a
    // mutable pointer).
    input_buffer: Vec<u32>,
    // Output scratch; 256 words is ample for < 128 encoded values.
    output_buffer: Vec<u32>,
}
impl SortedVIntsEncoder {
    /// Creates an encoder with preallocated, fully initialized scratch
    /// buffers.
    pub fn new() -> SortedVIntsEncoder {
        SortedVIntsEncoder {
            // Fully initialized (not merely `with_capacity`) so that the
            // raw-pointer copy in `encode_sorted` never writes into
            // uninitialized Vec spare capacity while `len` is still 0.
            input_buffer: iter::repeat(0u32).take(128).collect(),
            output_buffer: iter::repeat(0u32).take(256).collect(),
        }
    }

    /// Compresses a sorted sequence of fewer than 128 u32 values, returning
    /// the slice of encoded words inside the internal output buffer.
    ///
    /// # Panics
    /// Panics if `input.len() >= 128`.
    pub fn encode_sorted(&mut self, input: &[u32]) -> &[u32] {
        assert!(input.len() < 128);
        let input_len = input.len();
        let written_size: usize;
        // TODO use clone_from when available
        unsafe {
            // SAFETY: input_buffer holds 128 initialized u32s >= input_len;
            // output_buffer holds 256, matching the capacity passed below.
            ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len);
            written_size = encode_sorted_vint_native(
                self.input_buffer.as_mut_ptr(),
                input_len as size_t,
                self.output_buffer.as_mut_ptr(),
                256,
            );
        }
        return &self.output_buffer[0..written_size];
    }
}
/// Decoder counterpart of `SortedVIntsEncoder`.
pub struct SortedVIntsDecoder;
impl SortedVIntsDecoder {
    /// Creates a (stateless) decoder.
    pub fn new() -> SortedVIntsDecoder {
        SortedVIntsDecoder
    }
    /// Decodes variable-byte data into `uncompressed_values`, returning the
    /// number of u32 values decoded.
    pub fn decode_sorted(&self,
        compressed_data: &[u32],
        uncompressed_values: &mut [u32]) -> size_t {
        unsafe {
            // The output slice length is passed as the capacity the native
            // decoder may write into.
            return decode_sorted_vint_native(
                compressed_data.as_ptr(),
                compressed_data.len() as size_t,
                uncompressed_values.as_mut_ptr(),
                uncompressed_values.len() as size_t);
        }
    }
}
#[cfg(test)]
mod tests {
    use std::iter;
    use super::*;
    #[test]
    fn test_encode_vint() {
        // Round-trips 123 sorted values and pins the compressed size for
        // this particular input.
        {
            let mut encoder = SortedVIntsEncoder::new();
            let expected_length = 31;
            let input: Vec<u32> = (0u32..123u32)
                .map(|i| i * 7 / 2)
                .into_iter()
                .collect();
            let encoded_data = encoder.encode_sorted(&input);
            assert_eq!(encoded_data.len(), expected_length);
            let decoder = SortedVIntsDecoder::new();
            let mut decoded_data: Vec<u32> = iter::repeat(0u32).take(128).collect();
            assert_eq!(123, decoder.decode_sorted(&encoded_data[..], &mut decoded_data));
            assert_eq!(&decoded_data[0..123], &input[..]);
        }
        // Pins the exact packed word for a tiny 3-value input.
        {
            let mut encoder = SortedVIntsEncoder::new();
            let input = vec!(3, 17u32, 187);
            let encoded_data = encoder.encode_sorted(&input);
            assert_eq!(encoded_data.len(), 1);
            assert_eq!(encoded_data[0], 2167049859u32);
        }
    }
}

View File

@@ -4,11 +4,12 @@ use rustc_serialize::json;
use core::index::Segment;
use core::index::SegmentInfo;
use core::index::SegmentComponent;
use core::fastfield::FastFieldSerializer;
use core::store::StoreWriter;
use core::postings::PostingsSerializer;
use fastfield::FastFieldSerializer;
use store::StoreWriter;
use core::convert_to_ioerror;
use postings::PostingsSerializer;
pub struct SegmentSerializer {
segment: Segment,
store_writer: StoreWriter,

View File

@@ -1,190 +0,0 @@
use core::schema::DocId;
use core::reader::SegmentReader;
use core::searcher::SegmentLocalId;
use core::searcher::DocAddress;
use core::fastfield::U32FastFieldReader;
use core::schema::U32Field;
use std::io;
pub trait Collector {
fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()>;
fn collect(&mut self, doc_id: DocId);
}
pub struct FirstNCollector {
docs: Vec<DocAddress>,
current_segment: u32,
limit: usize,
}
impl FirstNCollector {
pub fn with_limit(limit: usize) -> FirstNCollector {
FirstNCollector {
docs: Vec::new(),
limit: limit,
current_segment: 0,
}
}
pub fn docs(self,) -> Vec<DocAddress> {
self.docs
}
}
impl Collector for FirstNCollector {
fn set_segment(&mut self, segment_local_id: SegmentLocalId, _: &SegmentReader) -> io::Result<()> {
self.current_segment = segment_local_id;
Ok(())
}
fn collect(&mut self, doc_id: DocId) {
if self.docs.len() < self.limit {
self.docs.push(DocAddress(self.current_segment.clone(), doc_id));
}
}
}
pub struct CountCollector {
count: usize,
}
impl CountCollector {
pub fn new() -> CountCollector {
CountCollector {
count: 0,
}
}
pub fn count(&self,) -> usize {
self.count
}
}
impl Collector for CountCollector {
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> io::Result<()> {
Ok(())
}
fn collect(&mut self, _: DocId) {
self.count += 1;
}
}
pub struct PairCollector<'a, 'b, CollectorLeft: Collector + 'a, CollectorRight: Collector + 'b> {
left: &'a mut CollectorLeft,
right: &'b mut CollectorRight,
}
impl<'a, 'b, CollectorLeft: Collector+ 'a, CollectorRight: Collector + 'b> PairCollector<'a, 'b, CollectorLeft, CollectorRight> {
pub fn from(left: &'a mut CollectorLeft, right: &'b mut CollectorRight) -> PairCollector<'a, 'b, CollectorLeft, CollectorRight> {
PairCollector {
left: left,
right: right,
}
}
}
impl<'a, 'b, CollectorLeft: Collector + 'a, CollectorRight: Collector + 'b>
Collector for PairCollector<'a, 'b, CollectorLeft, CollectorRight> {
fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()> {
try!(self.left.set_segment(segment_local_id, segment));
try!(self.right.set_segment(segment_local_id, segment));
Ok(())
}
fn collect(&mut self, doc_id: DocId) {
self.left.collect(doc_id);
self.right.collect(doc_id);
}
}
pub struct TestCollector {
offset: DocId,
segment_max_doc: DocId,
docs: Vec<DocId>,
}
impl TestCollector {
pub fn new() -> TestCollector {
TestCollector {
docs: Vec::new(),
offset: 0,
segment_max_doc: 0,
}
}
pub fn docs(self,) -> Vec<DocId> {
self.docs
}
}
impl Collector for TestCollector {
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> io::Result<()> {
self.offset += self.segment_max_doc;
self.segment_max_doc = reader.max_doc();
Ok(())
}
fn collect(&mut self, doc_id: DocId) {
self.docs.push(doc_id + self.offset);
}
}
pub struct FastFieldTestCollector {
vals: Vec<u32>,
u32_field: U32Field,
ff_reader: Option<U32FastFieldReader>,
}
impl FastFieldTestCollector {
pub fn for_field(u32_field: U32Field) -> FastFieldTestCollector {
FastFieldTestCollector {
vals: Vec::new(),
u32_field: u32_field,
ff_reader: None,
}
}
pub fn vals(&self,) -> &Vec<u32> {
&self.vals
}
}
impl Collector for FastFieldTestCollector {
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> io::Result<()> {
self.ff_reader = Some(try!(reader.get_fast_field_reader(&self.u32_field)));
Ok(())
}
fn collect(&mut self, doc_id: DocId) {
let val = self.ff_reader.as_ref().unwrap().get(doc_id);
self.vals.push(val);
}
}
#[cfg(test)]
mod tests {
use super::*;
use test::Bencher;
#[bench]
fn build_collector(b: &mut Bencher) {
b.iter(|| {
let mut count_collector = CountCollector::new();
let docs: Vec<u32> = (0..1_000_000).collect();
for doc in docs {
count_collector.collect(doc);
}
count_collector.count()
});
}
}

View File

@@ -1,305 +0,0 @@
use std::io::BufWriter;
use std::marker::Send;
use std::marker::Sync;
use std::io;
use std::io::Cursor;
use std::io::Write;
use std::io::Seek;
use std::io::SeekFrom;
use std::fs::File;
use std::fmt;
use std::collections::HashMap;
use std::collections::hash_map::Entry as HashMapEntry;
use fst::raw::MmapReadOnly;
use atomicwrites;
use std::sync::Arc;
use std::sync::RwLock;
use tempdir::TempDir;
use std::ops::Deref;
use std::path::{Path, PathBuf};
///////////////////////////////////////////////////////////////
pub enum ReadOnlySource {
Mmap(MmapReadOnly),
Anonymous(Vec<u8>),
}
impl Deref for ReadOnlySource {
type Target = [u8];
fn deref(&self) -> &[u8] {
self.as_slice()
}
}
impl ReadOnlySource {
pub fn len(&self,) -> usize {
self.as_slice().len()
}
pub fn as_slice(&self,) -> &[u8] {
match *self {
ReadOnlySource::Mmap(ref mmap_read_only) => unsafe { mmap_read_only.as_slice() },
ReadOnlySource::Anonymous(ref shared_vec) => shared_vec.as_slice(),
}
}
pub fn cursor<'a>(&'a self) -> Cursor<&'a [u8]> {
Cursor::new(&self.deref())
}
pub fn slice(&self, from_offset:usize, to_offset:usize) -> ReadOnlySource {
match *self {
ReadOnlySource::Mmap(ref mmap_read_only) => {
let sliced_mmap = mmap_read_only.range(from_offset, to_offset - from_offset);
ReadOnlySource::Mmap(sliced_mmap)
}
ReadOnlySource::Anonymous(ref shared_vec) => {
let sliced_data: Vec<u8> = Vec::from(&shared_vec[from_offset..to_offset]);
ReadOnlySource::Anonymous(sliced_data)
},
}
}
}
impl Clone for ReadOnlySource {
fn clone(&self) -> Self {
self.slice(0, self.len())
}
}
pub trait SeekableWrite: Seek + Write {}
impl<T: Seek + Write> SeekableWrite for T {}
pub type WritePtr = Box<SeekableWrite>;
//
// #[derive(Debug)]
// pub enum CreateError {
// RootDirectoryDoesNotExist,
// DirectoryAlreadyExists,
// CannotCreateTempDirectory(io::Error),
// }
pub trait Directory: fmt::Debug + Send + Sync {
fn open_read(&self, path: &Path) -> io::Result<ReadOnlySource>;
fn open_write(&mut self, path: &Path) -> io::Result<WritePtr>;
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()>;
fn sync(&self, path: &Path) -> io::Result<()>;
fn sync_directory(&self,) -> io::Result<()>;
}
////////////////////////////////////////////////////////////////
// MmapDirectory
pub struct MmapDirectory {
root_path: PathBuf,
mmap_cache: RwLock<HashMap<PathBuf, MmapReadOnly>>,
_temp_directory: Option<TempDir>,
}
impl fmt::Debug for MmapDirectory {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "MmapDirectory({:?})", self.root_path)
}
}
impl MmapDirectory {
pub fn create_from_tempdir() -> io::Result<MmapDirectory> {
// TODO error management
let tempdir = try!(TempDir::new("index"));
let tempdir_path = PathBuf::from(tempdir.path());
let directory = MmapDirectory {
root_path: PathBuf::from(tempdir_path),
mmap_cache: RwLock::new(HashMap::new()),
_temp_directory: Some(tempdir)
};
Ok(directory)
}
pub fn create(filepath: &Path) -> io::Result<MmapDirectory> {
Ok(MmapDirectory {
root_path: PathBuf::from(filepath),
mmap_cache: RwLock::new(HashMap::new()),
_temp_directory: None
})
}
fn resolve_path(&self, relative_path: &Path) -> PathBuf {
self.root_path.join(relative_path)
}
}
impl Directory for MmapDirectory {
    /// Memory-maps the file at `path` (relative to the root),
    /// caching the mmap so repeated opens share the same mapping.
    fn open_read(&self, path: &Path) -> io::Result<ReadOnlySource> {
        let full_path = self.resolve_path(path);
        // Write lock even on the read path: a cache miss inserts into the map.
        let mut mmap_cache = self.mmap_cache.write().unwrap();
        let mmap = match mmap_cache.entry(full_path.clone()) {
            HashMapEntry::Occupied(e) => e.get().clone(),
            HashMapEntry::Vacant(vacant_entry) => {
                // On failure the error propagates and nothing is cached.
                let new_mmap = try!(MmapReadOnly::open_path(full_path.clone()));
                vacant_entry.insert(new_mmap.clone());
                new_mmap
            }
        };
        Ok(ReadOnlySource::Mmap(mmap))
    }
    /// Creates (truncating if present) the file and returns a buffered writer.
    fn open_write(&mut self, path: &Path) -> io::Result<WritePtr> {
        let full_path = self.resolve_path(path);
        let file = try!(File::create(full_path));
        let buf_writer = BufWriter::new(file);
        Ok(Box::new(buf_writer))
    }
    /// Writes `data` via a temp-file-and-rename so readers never observe
    /// a partially written file. Overwrites any existing file.
    fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
        let full_path = self.resolve_path(path);
        let meta_file = atomicwrites::AtomicFile::new(full_path, atomicwrites::AllowOverwrite);
        meta_file.write(|f| {
            f.write_all(data)
        })
    }
    /// fsyncs a single file's contents to disk.
    fn sync(&self, path: &Path) -> io::Result<()> {
        let full_path = self.resolve_path(path);
        File::open(&full_path).and_then(|fd| fd.sync_all())
    }
    /// fsyncs the root directory itself (needed after creates/renames
    /// for durability of the directory entries).
    fn sync_directory(&self,) -> io::Result<()> {
        File::open(&self.root_path).and_then(|fd| fd.sync_all())
    }
}
////////////////////////////////////////////////////////////////
// RAMDirectory
// Growable in-memory buffer behind a read-write lock.
// Cloning is cheap and shares the same underlying buffer (Arc).
#[derive(Clone)]
struct SharedVec(Arc<RwLock<Cursor<Vec<u8>>>>);
/// Directory keeping all of its files in RAM — mainly useful for tests.
pub struct RAMDirectory {
    // One shared in-memory buffer per "file" path.
    fs: HashMap<PathBuf, SharedVec>,
}
impl SharedVec {
    /// Builds an empty shared buffer, cursor at position 0.
    fn new() -> SharedVec {
        let cursor = Cursor::new(Vec::new());
        SharedVec(Arc::new(RwLock::new(cursor)))
    }
}
impl Write for SharedVec {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
try!(self.0.write().unwrap().write(buf));
Ok(buf.len())
}
fn flush(&mut self) -> io::Result<()> {
Ok(())
}
}
impl Seek for SharedVec {
    // Delegates to the inner cursor under the write lock.
    fn seek(&mut self, pos: SeekFrom) -> io::Result<u64> {
        self.0.write().unwrap().seek(pos)
    }
}
impl fmt::Debug for RAMDirectory {
    // The file map is not printed; only the type name.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "RAMDirectory")
    }
}
impl RAMDirectory {
    /// Builds a new, empty in-memory directory.
    pub fn create() -> RAMDirectory {
        let files = HashMap::new();
        RAMDirectory { fs: files }
    }
}
impl Directory for RAMDirectory {
fn open_read(&self, path: &Path) -> io::Result<ReadOnlySource> {
match self.fs.get(path) {
Some(ref data) => {
let data_copy = (*data).0.read().unwrap().clone();
Ok(ReadOnlySource::Anonymous(data_copy.into_inner()))
},
None =>
Err(io::Error::new(io::ErrorKind::NotFound, format!("File has never been created. {:?}", path)))
}
}
fn open_write(&mut self, path: &Path) -> io::Result<WritePtr> {
let full_path = PathBuf::from(&path);
let data = SharedVec::new();
self.fs.insert(full_path, data.clone());
Ok(Box::new(data))
}
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
let meta_file = atomicwrites::AtomicFile::new(PathBuf::from(path), atomicwrites::AllowOverwrite);
meta_file.write(|f| {
f.write_all(data)
})
}
fn sync(&self, _: &Path) -> io::Result<()> {
Ok(())
}
fn sync_directory(&self,) -> io::Result<()> {
Ok(())
}
}
#[cfg(test)]
mod tests {

    use super::*;
    use std::path::Path;

    #[test]
    fn test_ram_directory() {
        let mut ram_directory = RAMDirectory::create();
        test_directory(&mut ram_directory);
    }

    #[test]
    fn test_mmap_directory() {
        let mut mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
        test_directory(&mut mmap_directory);
    }

    /// Writes a small file through the directory abstraction, then reads
    /// it back and checks the bytes arrive in order, across write calls.
    fn test_directory(directory: &mut Directory) {
        {
            let mut write_file = directory.open_write(Path::new("toto")).unwrap();
            write_file.write_all(&[4]).unwrap();
            write_file.write_all(&[3]).unwrap();
            write_file.write_all(&[7, 3, 5]).unwrap();
        }
        let read_file = directory.open_read(Path::new("toto")).unwrap();
        let data: &[u8] = &*read_file;
        // Same five assertions as before, collapsed into one slice compare.
        assert_eq!(data, &[4u8, 3, 7, 3, 5][..]);
    }
}

View File

@@ -1,14 +1,14 @@
use std::path::{PathBuf, Path};
use std::io;
use core::schema::Schema;
use core::schema::DocId;
use schema::Schema;
use DocId;
use std::io::Write;
use std::sync::{Arc, RwLock, RwLockWriteGuard, RwLockReadGuard};
use std::fmt;
use rustc_serialize::json;
use std::io::Read;
use std::io::ErrorKind as IOErrorKind;
use core::directory::{Directory, MmapDirectory, RAMDirectory, ReadOnlySource, WritePtr};
use directory::{Directory, MmapDirectory, RAMDirectory, ReadOnlySource, WritePtr};
use core::writer::IndexWriter;
use core::searcher::Searcher;
use uuid::Uuid;
@@ -238,7 +238,7 @@ pub struct SegmentInfo {
pub enum SegmentComponent {
INFO,
POSTINGS,
// POSITIONS,
POSITIONS,
FASTFIELDS,
TERMS,
STORE,
@@ -264,7 +264,7 @@ impl Segment {
fn path_suffix(component: &SegmentComponent)-> &'static str {
match *component {
// SegmentComponent::POSITIONS => ".pos",
SegmentComponent::POSITIONS => ".pos",
SegmentComponent::INFO => ".info",
SegmentComponent::POSTINGS => ".idx",
SegmentComponent::TERMS => ".term",

View File

@@ -1,22 +1,20 @@
use std::io;
use core::reader::SegmentReader;
use core::index::Segment;
use core::schema::DocId;
use DocId;
use core::index::SerializableSegment;
use core::codec::SegmentSerializer;
use core::postings::PostingsSerializer;
use core::postings::TermInfo;
use postings::PostingsSerializer;
use postings::TermInfo;
use std::collections::BinaryHeap;
use core::fstmap::FstMapIter;
use core::schema::Term;
use core::schema::Schema;
use core::fastfield::FastFieldSerializer;
use core::store::StoreWriter;
use datastruct::FstMapIter;
use schema::{Term, Schema, U32Field};
use fastfield::FastFieldSerializer;
use store::StoreWriter;
use core::index::SegmentInfo;
use std::cmp::Ordering;
use core::schema::U32Field;
use std::cmp::min;
use std::cmp::max;
use std::cmp::{min, max, Ordering};
struct PostingsMerger<'a> {
doc_ids: Vec<DocId>,
@@ -181,7 +179,9 @@ impl IndexMerger {
match postings_merger.next() {
Some((term, doc_ids)) => {
try!(postings_serializer.new_term(&Term::from(&term), doc_ids.len() as DocId));
try!(postings_serializer.write_docs(doc_ids));
for doc_id in doc_ids.iter() {
try!(postings_serializer.write_doc(doc_id.clone(), None));
}
}
None => { break; }
}
@@ -210,13 +210,13 @@ impl SerializableSegment for IndexMerger {
#[cfg(test)]
mod tests {
use core::schema;
use core::schema::Document;
use schema;
use schema::Document;
use schema::Term;
use core::index::Index;
use core::schema::Term;
use core::searcher::DocAddress;
use core::collector::FastFieldTestCollector;
use core::collector::TestCollector;
use collector::FastFieldTestCollector;
use collector::TestCollector;
#[test]
fn test_index_merger() {

View File

@@ -1,21 +1,9 @@
pub mod postings;
pub mod schema;
pub mod directory;
pub mod writer;
pub mod analyzer;
pub mod reader;
pub mod codec;
pub mod searcher;
pub mod collector;
pub mod serialize;
pub mod store;
pub mod simdcompression;
pub mod fstmap;
pub mod index;
pub mod fastfield;
pub mod fastdivide;
pub mod merger;
pub mod timer;
use std::error;
use std::io;

View File

@@ -1,279 +0,0 @@
use core::schema::DocId;
use std::ptr;
use core::schema::Term;
use core::fstmap::FstMapBuilder;
use core::index::Segment;
use core::directory::WritePtr;
use core::index::SegmentComponent;
use core::simdcompression;
use core::serialize::BinarySerializable;
use std::io::{Read, Write};
use std::io;
use std::collections::HashMap;
/// Per-term metadata stored in the term dictionary: the number of
/// documents containing the term and the byte offset of its postings
/// list within the postings file.
#[derive(Debug,Ord,PartialOrd,Eq,PartialEq,Clone)]
pub struct TermInfo {
    pub doc_freq: u32,
    pub postings_offset: u32,
}

impl BinarySerializable for TermInfo {
    // Writes doc_freq then postings_offset; returns total bytes written.
    fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
        Ok(
            try!(self.doc_freq.serialize(writer)) +
            try!(self.postings_offset.serialize(writer))
        )
    }
    // Reads the two fields back in the same order they were written.
    fn deserialize(reader: &mut Read) -> io::Result<Self> {
        let doc_freq = try!(u32::deserialize(reader));
        let offset = try!(u32::deserialize(reader));
        Ok(TermInfo {
            doc_freq: doc_freq,
            postings_offset: offset,
        })
    }
}
/// Accumulates, in RAM, the postings (sorted doc id lists) of a
/// segment being built.
pub struct PostingsWriter {
    // postings[i] is the doc id list of the term with unordered id i.
    postings: Vec<Vec<DocId>>,
    // Maps each term to its unordered id (an index into `postings`).
    term_index: HashMap<Term, usize>,
}

impl PostingsWriter {

    /// Creates an empty postings writer.
    pub fn new() -> PostingsWriter {
        PostingsWriter {
            postings: Vec::new(),
            term_index: HashMap::new(),
        }
    }

    /// Records that `doc` contains `term`.
    ///
    /// Doc ids are expected to arrive in non-decreasing order; a given
    /// doc id is appended at most once per term.
    pub fn suscribe(&mut self, doc: DocId, term: Term) {
        let doc_ids: &mut Vec<DocId> = self.get_term_postings(term);
        // `last()` covers both the empty case and the duplicate check,
        // replacing the manual `len() == 0 || v[len - 1]` arithmetic.
        if doc_ids.last().map(|&last_doc| last_doc < doc).unwrap_or(true) {
            doc_ids.push(doc);
        }
    }

    /// Returns the postings list of `term`, creating it if needed.
    fn get_term_postings(&mut self, term: Term) -> &mut Vec<DocId> {
        match self.term_index.get(&term) {
            Some(unord_id) => {
                return &mut self.postings[*unord_id];
            },
            None => {}
        }
        let unord_id = self.term_index.len();
        self.postings.push(Vec::new());
        // usize is Copy — the previous `.clone()` was noise.
        self.term_index.insert(term, unord_id);
        &mut self.postings[unord_id]
    }

    /// Streams every (term, postings) pair, in term order, to `serializer`.
    pub fn serialize(&self, serializer: &mut PostingsSerializer) -> io::Result<()> {
        let mut sorted_terms: Vec<(&Term, &usize)> = self.term_index.iter().collect();
        sorted_terms.sort();
        for (term, &postings_id) in sorted_terms.into_iter() {
            let doc_ids = &self.postings[postings_id];
            let term_docfreq = doc_ids.len() as u32;
            try!(serializer.new_term(&term, term_docfreq));
            try!(serializer.write_docs(&doc_ids));
        }
        Ok(())
    }
}
//////////////////////////////////
pub trait Postings: Iterator<Item=DocId> {
// after skipping position
// the iterator in such a way that the
// next call to next() will return a
// value greater or equal to target.
fn skip_next(&mut self, target: DocId) -> Option<DocId>;
}
pub struct IntersectionPostings<T: Postings> {
postings: Vec<T>,
}
impl<T: Postings> IntersectionPostings<T> {
pub fn from_postings(postings: Vec<T>) -> IntersectionPostings<T> {
IntersectionPostings {
postings: postings,
}
}
}
impl<T: Postings> Iterator for IntersectionPostings<T> {
type Item = DocId;
fn next(&mut self,) -> Option<DocId> {
let mut candidate;
match self.postings[0].next() {
Some(val) => {
candidate = val;
},
None => {
return None;
}
}
'outer: loop {
for i in 1..self.postings.len() {
let skip_result = self.postings[i].skip_next(candidate);
match skip_result {
None => {
return None;
},
Some(x) if x == candidate => {
},
Some(greater) => {
unsafe {
let pa: *mut T = &mut self.postings[i];
let pb: *mut T = &mut self.postings[0];
ptr::swap(pa, pb);
}
candidate = greater;
continue 'outer;
},
}
}
return Some(candidate);
}
}
}
/// Writes a segment's term dictionary (an fst mapping terms to
/// `TermInfo`) and its compressed postings file.
pub struct PostingsSerializer {
    terms_fst_builder: FstMapBuilder<WritePtr, TermInfo>, // TODO find an alternative to work around the "move"
    postings_write: WritePtr,
    // Running byte offset into the postings file; recorded per term
    // as `postings_offset` so readers can seek to a term's doc ids.
    written_bytes_postings: usize,
    encoder: simdcompression::Encoder,
}

impl PostingsSerializer {
    /// Opens the TERMS and POSTINGS components of `segment` for writing.
    pub fn open(segment: &Segment) -> io::Result<PostingsSerializer> {
        let terms_write = try!(segment.open_write(SegmentComponent::TERMS));
        let terms_fst_builder = try!(FstMapBuilder::new(terms_write));
        let postings_write = try!(segment.open_write(SegmentComponent::POSTINGS));
        Ok(PostingsSerializer {
            terms_fst_builder: terms_fst_builder,
            postings_write: postings_write,
            written_bytes_postings: 0,
            encoder: simdcompression::Encoder::new(),
        })
    }
    /// Registers a new term with its doc frequency; must be called in
    /// strictly increasing term order (fst builder requirement), and
    /// before `write_docs` for that term.
    pub fn new_term(&mut self, term: &Term, doc_freq: DocId) -> io::Result<()> {
        let term_info = TermInfo {
            doc_freq: doc_freq,
            postings_offset: self.written_bytes_postings as u32,
        };
        self.terms_fst_builder
            .insert(term.as_slice(), &term_info)
    }
    /// Compresses and appends the sorted doc id list of the current term:
    /// a u32 block count, followed by the compressed u32 blocks.
    pub fn write_docs(&mut self, doc_ids: &[DocId]) -> io::Result<()> {
        let docs_data = self.encoder.encode_sorted(doc_ids);
        self.written_bytes_postings += try!((docs_data.len() as u32).serialize(&mut self.postings_write));
        for num in docs_data {
            self.written_bytes_postings += try!(num.serialize(&mut self.postings_write));
        }
        Ok(())
    }
    /// Finalizes the term fst and flushes the postings file.
    pub fn close(mut self,) -> io::Result<()> {
        try!(self.terms_fst_builder.finish());
        try!(self.postings_write.flush());
        Ok(())
    }
}
#[cfg(test)]
mod tests {
use super::*;
use test::Bencher;
use core::schema::DocId;
#[derive(Debug)]
pub struct VecPostings {
doc_ids: Vec<DocId>,
cursor: usize,
}
impl VecPostings {
pub fn new(vals: Vec<DocId>) -> VecPostings {
VecPostings {
doc_ids: vals,
cursor: 0,
}
}
}
impl Postings for VecPostings {
// after skipping position
// the iterator in such a way that the
// next call to next() will return a
// value greater or equal to target.
fn skip_next(&mut self, target: DocId) -> Option<DocId> {
loop {
match Iterator::next(self) {
Some(val) if val >= target => {
return Some(val);
},
None => {
return None;
},
_ => {}
}
}
}
}
impl Iterator for VecPostings {
type Item = DocId;
fn next(&mut self,) -> Option<DocId> {
if self.cursor >= self.doc_ids.len() {
None
}
else {
self.cursor += 1;
Some(self.doc_ids[self.cursor - 1])
}
}
}
#[test]
fn test_intersection() {
{
let left = VecPostings::new(vec!(1, 3, 9));
let right = VecPostings::new(vec!(3, 4, 9, 18));
let inter = IntersectionPostings::from_postings(vec!(left, right));
let vals: Vec<DocId> = inter.collect();
assert_eq!(vals, vec!(3, 9));
}
{
let a = VecPostings::new(vec!(1, 3, 9));
let b = VecPostings::new(vec!(3, 4, 9, 18));
let c = VecPostings::new(vec!(1, 5, 9, 111));
let inter = IntersectionPostings::from_postings(vec!(a, b, c));
let vals: Vec<DocId> = inter.collect();
assert_eq!(vals, vec!(9));
}
}
#[bench]
fn bench_single_intersection(b: &mut Bencher) {
b.iter(|| {
let docs = VecPostings::new((0..1_000_000).collect());
let intersection = IntersectionPostings::from_postings(vec!(docs));
intersection.count()
});
}
}

View File

@@ -1,27 +1,27 @@
use core::index::{Segment, SegmentId};
use core::schema::Term;
use core::store::StoreReader;
use core::schema::Document;
use core::postings::IntersectionPostings;
use core::directory::ReadOnlySource;
use schema::Term;
use store::StoreReader;
use schema::Document;
use directory::ReadOnlySource;
use std::io::Cursor;
use core::schema::DocId;
use DocId;
use core::index::SegmentComponent;
use core::simdcompression::Decoder;
use std::io;
use std::str;
use core::postings::TermInfo;
use core::fstmap::FstMap;
use postings::TermInfo;
use datastruct::FstMap;
use std::fmt;
use rustc_serialize::json;
use core::index::SegmentInfo;
use core::timer::TimerHandle;
use core::schema::U32Field;
use common::TimerTree;
use common::Timing;
use common::OpenTimer;
use schema::U32Field;
use core::convert_to_ioerror;
use core::serialize::BinarySerializable;
use core::fastfield::U32FastFieldsReader;
use core::fastfield::U32FastFieldReader;
use core::simdcompression;
use common::BinarySerializable;
use fastfield::{U32FastFieldsReader, U32FastFieldReader};
use compression;
use compression::S4BP128Decoder;
use std::mem;
impl fmt::Debug for SegmentReader {
@@ -43,7 +43,7 @@ pub fn intersection(mut postings: Vec<SegmentPostings>) -> SegmentPostings {
let mut pair = (output, buffer);
for posting in postings.iter() {
pair = (pair.1, pair.0);
let output_len = simdcompression::intersection(posting.0.as_slice(), pair.0.as_slice(), pair.1.as_mut_slice());
let output_len = compression::intersection(posting.0.as_slice(), pair.0.as_slice(), pair.1.as_mut_slice());
unsafe { pair.1.set_len(output_len); }
}
SegmentPostings(pair.1)
@@ -77,8 +77,8 @@ impl SegmentPostings {
let mut doc_ids: Vec<u32> = Vec::with_capacity(doc_freq as usize);
unsafe { doc_ids.set_len(doc_freq as usize); }
{
let decoder = Decoder::new();
let num_doc_ids = decoder.decode_sorted(&data_u32[1..(num_u32s+1) as usize], &mut doc_ids);
let decoder = S4BP128Decoder::new();
decoder.decode_sorted(&data_u32[1..(num_u32s+1) as usize], &mut doc_ids);
SegmentPostings(doc_ids)
}
}
@@ -194,7 +194,7 @@ impl SegmentReader {
/// Returns the list of doc ids containing all of the
/// given terms.
pub fn search<'a>(&self, terms: &Vec<Term>, mut timer: TimerHandle<'a>) -> SegmentPostings {
pub fn search<'a>(&self, terms: &Vec<Term>, mut timer: OpenTimer<'a>) -> SegmentPostings {
if terms.len() == 1 {
match self.get_term(&terms[0]) {
Some(term_info) => {
@@ -212,7 +212,7 @@ impl SegmentReader {
for term in terms.iter() {
match self.get_term(term) {
Some(term_info) => {
let decode_one_timer = decode_timer.open("decode_one");
let _decode_one_timer = decode_timer.open("decode_one");
let segment_posting = self.read_postings(&term_info);
segment_postings.push(segment_posting);
}
@@ -224,7 +224,7 @@ impl SegmentReader {
}
}
{
let intersection_time = timer.open("intersection");
let _intersection_time = timer.open("intersection");
intersection(segment_postings)
}
}

View File

@@ -1,563 +0,0 @@
use std::io::Write;
use std::collections::HashMap;
use std::slice;
use std::fmt;
use std::io;
use std::io::Read;
use core::serialize::BinarySerializable;
use rustc_serialize::Decodable;
use rustc_serialize::Encodable;
use rustc_serialize::Decoder;
use rustc_serialize::Encoder;
use std::ops::BitOr;
use std::borrow::Borrow;
use std::convert::AsRef;
/// u32 identifying a document within a segment.
/// Document gets their doc id assigned incrementally,
/// as they are added in the segment.
pub type DocId = u32;
#[derive(Clone,Debug,PartialEq,Eq, RustcDecodable, RustcEncodable)]
pub struct TextOptions {
tokenized_indexed: bool,
stored: bool,
fast: bool,
}
#[derive(Clone,Debug,PartialEq,Eq, RustcDecodable, RustcEncodable)]
pub struct U32Options {
indexed: bool,
fast: bool,
stored: bool,
}
/// The field will be tokenized and indexed
pub const TEXT: TextOptions = TextOptions {
tokenized_indexed: true,
stored: false,
fast: false,
};
/// The field will be tokenized and indexed
pub const FAST_U32: U32Options = U32Options {
indexed: false,
stored: false,
fast: true,
};
/// A stored fields of a document can be retrieved given its DocId.
/// Stored field are stored together and LZ4 compressed.
/// Reading the stored fields of a document is relatively slow.
/// (100 microsecs)
pub const STORED: TextOptions = TextOptions {
tokenized_indexed: false,
stored: true,
fast: false,
};
/// Fast field are used for field you need to access many times during
/// collection. (e.g: for sort, aggregates).
pub const FAST: TextOptions = TextOptions {
tokenized_indexed: false,
stored: false,
fast: true
};
impl BitOr for TextOptions {
type Output = TextOptions;
fn bitor(self, other: TextOptions) -> TextOptions {
let mut res = TextOptions::new();
res.tokenized_indexed = self.tokenized_indexed || other.tokenized_indexed;
res.stored = self.stored || other.stored;
res.fast = self.fast || other.fast;
res
}
}
/// Field handle
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
pub struct U32Field(pub u8);
/// Field handle
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
pub struct TextField(pub u8);
impl U32Options {
pub fn new() -> U32Options {
U32Options {
fast: false,
indexed: false,
stored: false,
}
}
pub fn is_indexed(&self,) -> bool {
self.indexed
}
pub fn set_indexed(mut self,) -> U32Options {
self.indexed = true;
self
}
pub fn is_fast(&self,) -> bool {
self.fast
}
pub fn set_fast(mut self,) -> U32Options {
self.fast = true;
self
}
}
impl TextOptions {
pub fn is_tokenized_indexed(&self,) -> bool {
self.tokenized_indexed
}
pub fn is_stored(&self,) -> bool {
self.stored
}
pub fn is_fast(&self,) -> bool {
self.fast
}
pub fn set_stored(mut self,) -> TextOptions {
self.stored = true;
self
}
pub fn set_fast(mut self,) -> TextOptions {
self.fast = true;
self
}
pub fn set_tokenized_indexed(mut self,) -> TextOptions {
self.tokenized_indexed = true;
self
}
pub fn new() -> TextOptions {
TextOptions {
fast: false,
tokenized_indexed: false,
stored: false,
}
}
}
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq)]
pub struct U32FieldValue {
pub field: U32Field,
pub value: u32,
}
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq)]
pub struct TextFieldValue {
pub field: TextField,
pub text: String,
}
impl BinarySerializable for TextField {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
let TextField(field_id) = *self;
field_id.serialize(writer)
}
fn deserialize(reader: &mut Read) -> io::Result<TextField> {
u8::deserialize(reader).map(TextField)
}
}
impl BinarySerializable for U32Field {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
let U32Field(field_id) = *self;
field_id.serialize(writer)
}
fn deserialize(reader: &mut Read) -> io::Result<U32Field> {
u8::deserialize(reader).map(U32Field)
}
}
impl BinarySerializable for TextFieldValue {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
Ok(
try!(self.field.serialize(writer)) +
try!(self.text.serialize(writer))
)
}
fn deserialize(reader: &mut Read) -> io::Result<Self> {
let field = try!(TextField::deserialize(reader));
let text = try!(String::deserialize(reader));
Ok(TextFieldValue {
field: field,
text: text,
})
}
}
#[derive(Clone, PartialEq, PartialOrd, Ord, Eq, Hash)]
pub struct Term {
data: Vec<u8>,
}
impl AsRef<[u8]> for Term {
fn as_ref(&self) -> &[u8] {
self.data.as_ref()
}
}
#[derive(Clone, Debug, RustcDecodable, RustcEncodable)]
struct TextFieldEntry {
name: String,
option: TextOptions,
}
#[derive(Clone, Debug, RustcDecodable, RustcEncodable)]
pub struct U32FieldEntry {
pub name: String,
pub option: U32Options,
}
/// Tantivy has a very strict schema.
/// You need to specify in advance, whether a field is indexed or not,
/// stored or not, and RAM-based or not.
///
/// This is done by creating a schema object, and
/// setting up the fields one by one.
/// It is for the moment impossible to remove fields.
///
/// # Examples
///
/// ```
/// use tantivy::schema::{Schema, TextOptions};
///
/// fn create_schema() -> Schema {
/// let mut schema = Schema::new();
/// let str_fieldtype = TextOptions::new();
/// let text_fieldtype = TextOptions::new().set_tokenized_indexed();
/// let id_field = schema.add_text_field("id", &str_fieldtype);
/// let url_field = schema.add_text_field("url", &str_fieldtype);
/// let body_field = schema.add_text_field("body", &text_fieldtype);
/// let id_field = schema.add_text_field("id", &str_fieldtype);
/// let url_field = schema.add_text_field("url", &str_fieldtype);
/// let title_field = schema.add_text_field("title", &text_fieldtype);
/// let body_field = schema.add_text_field("body", &text_fieldtype);
/// schema
/// }
///
/// let schema = create_schema();
#[derive(Clone, Debug)]
pub struct Schema {
text_fields: Vec<TextFieldEntry>,
text_fields_map: HashMap<String, TextField>, // transient
u32_fields: Vec<U32FieldEntry>,
u32_fields_map: HashMap<String, U32Field>, // transient
}
impl Decodable for Schema {
    // Rebuilds a schema from a serialized sequence of text field entries.
    // NOTE(review): only *text* fields are decoded here — u32 fields are
    // silently dropped on a serialization round-trip. Confirm whether
    // this is intentional before relying on persisted schemas with
    // u32 fields.
    fn decode<D: Decoder>(d: &mut D) -> Result <Self, D::Error> {
        let mut schema = Schema::new();
        try!(d.read_seq(|d, num_fields| {
            for _ in 0..num_fields {
                let field_entry = try!(TextFieldEntry::decode(d));
                let field_options: &TextOptions = &field_entry.option;
                schema.add_text_field(&field_entry.name, field_options);
            }
            Ok(())
        }));
        Ok(schema)
    }
}
impl Encodable for Schema {
fn encode<S: Encoder>(&self, s: &mut S) -> Result<(), S::Error> {
try!(s.emit_seq(self.text_fields.len(),
|mut e| {
for (ord, field) in self.text_fields.iter().enumerate() {
try!(e.emit_seq_elt(ord, |e| field.encode(e)));
}
Ok(())
}));
Ok(())
}
}
impl Schema {
/// Creates a new, empty schema.
pub fn new() -> Schema {
Schema {
text_fields: Vec::new(),
text_fields_map: HashMap::new(),
u32_fields: Vec::new(),
u32_fields_map: HashMap::new(),
}
}
pub fn get_u32_fields(&self,) -> &Vec<U32FieldEntry> {
&self.u32_fields
}
/// Given a name, returns the field handle, as well as its associated TextOptions
pub fn get_text_field(&self, field_name: &str) -> Option<(TextField, TextOptions)> {
self.text_fields_map
.get(field_name)
.map(|&TextField(field_id)| {
let field_options = self.text_fields[field_id as usize].option.clone();
(TextField(field_id), field_options)
})
}
pub fn get_u32_field(&self, field_name: &str) -> Option<(U32Field, U32Options)> {
self.u32_fields_map
.get(field_name)
.map(|&U32Field(field_id)| {
let u32_field_options = self.u32_fields[field_id as usize].option.clone();
(U32Field(field_id), u32_field_options)
})
}
/// Returns the field options associated with a given name.
///
/// # Panics
/// Panics if the field name does not exist.
/// It is meant as an helper for user who created
/// and control the content of their schema.
///
/// If panicking is not an option for you,
/// you may use `get(&self, field_name: &str)`.
pub fn text_field(&self, fieldname: &str) -> TextField {
self.text_fields_map.get(fieldname).map(|field| field.clone()).unwrap()
}
pub fn u32_field(&self, fieldname: &str) -> U32Field {
self.u32_fields_map.get(fieldname).map(|field| field.clone()).unwrap()
}
/// Returns the field options associated to a field handle.
pub fn text_field_options(&self, field: &TextField) -> TextOptions {
let TextField(field_id) = *field;
self.text_fields[field_id as usize].option.clone()
}
pub fn u32_field_options(&self, field: &U32Field) -> U32Options {
let U32Field(field_id) = *field;
self.u32_fields[field_id as usize].option.clone()
}
/// Creates a new field.
/// Return the associated field handle.
pub fn add_text_field<RefTextOptions: Borrow<TextOptions>>(&mut self, field_name_str: &str, field_options: RefTextOptions) -> TextField {
let field = TextField(self.text_fields.len() as u8);
// TODO case if field already exists
let field_name = String::from(field_name_str);
self.text_fields.push(TextFieldEntry {
name: field_name.clone(),
option: field_options.borrow().clone(),
});
self.text_fields_map.insert(field_name, field.clone());
field
}
/// Creates a new field.
/// Return the associated field handle.
pub fn add_u32_field<RefU32Options: Borrow<U32Options>>(&mut self, field_name_str: &str, field_options: RefU32Options) -> U32Field {
let field = U32Field(self.u32_fields.len() as u8);
// TODO case if field already exists
let field_name = String::from(field_name_str);
self.u32_fields.push(U32FieldEntry {
name: field_name.clone(),
option: field_options.borrow().clone(),
});
self.u32_fields_map.insert(field_name, field.clone());
field
}
}
impl Term {

    /// Builds the term for value `val` in the given u32 field.
    ///
    /// Byte layout: one header byte (`128 | field_id` — the high bit
    /// distinguishes u32 fields from text fields) followed by the
    /// serialized u32 value.
    pub fn from_field_u32(field: &U32Field, val: u32) -> Term {
        let U32Field(field_idx) = *field;
        // `with_capacity` already yields an empty vec; the previous
        // `buffer.clear()` right after it was dead code.
        let mut buffer = Vec::with_capacity(1 + 4);
        buffer.push(128 | field_idx);
        val.serialize(&mut buffer).unwrap();
        Term {
            data: buffer,
        }
    }

    /// Builds the term for `text` in the given text field.
    ///
    /// Byte layout: one field id byte followed by the raw UTF-8 bytes.
    pub fn from_field_text(field: &TextField, text: &str) -> Term {
        let TextField(field_idx) = *field;
        let mut buffer = Vec::with_capacity(1 + text.len());
        buffer.push(field_idx);
        buffer.extend(text.as_bytes());
        Term {
            data: buffer,
        }
    }

    /// Builds a term directly from its raw byte representation.
    pub fn from(data: &[u8]) -> Term {
        Term {
            data: Vec::from(data),
        }
    }

    /// Returns the raw byte representation of the term.
    pub fn as_slice(&self,)->&[u8] {
        &self.data
    }
}
impl fmt::Debug for Term {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "Term({})", self.data[0])
}
}
///
/// Document are really just a list of field values.
///
/// # Examples
///
/// ```
/// use tantivy::schema::Schema;
/// use tantivy::schema::TEXT;
///
/// let mut schema = Schema::new();
/// schema.add_text_field("body", &TEXT);
/// let field_text = schema.text_field("body");
/// ```
///
#[derive(Debug)]
pub struct Document {
pub text_field_values: Vec<TextFieldValue>,
pub u32_field_values: Vec<U32FieldValue>,
}
impl Document {
pub fn new() -> Document {
Document {
text_field_values: Vec::new(),
u32_field_values: Vec::new(),
}
}
pub fn from(text_field_values: Vec<TextFieldValue>,
u32_field_values: Vec<U32FieldValue>) -> Document {
Document {
text_field_values: text_field_values,
u32_field_values: u32_field_values
}
}
pub fn len(&self,) -> usize {
self.text_field_values.len()
}
pub fn set(&mut self, field: &TextField, text: &str) {
self.add(TextFieldValue {
field: field.clone(),
text: String::from(text)
});
}
pub fn set_u32(&mut self, field: &U32Field, value: u32) {
self.u32_field_values.push(U32FieldValue {
field: field.clone(),
value: value
});
}
pub fn add(&mut self, field_value: TextFieldValue) {
self.text_field_values.push(field_value);
}
pub fn text_fields<'a>(&'a self,) -> slice::Iter<'a, TextFieldValue> {
self.text_field_values.iter()
}
pub fn u32_fields<'a>(&'a self,) -> slice::Iter<'a, U32FieldValue> {
self.u32_field_values.iter()
}
pub fn get_u32(&self, field: &U32Field) -> Option<u32> {
self.u32_field_values
.iter()
.filter(|field_value| field_value.field == *field)
.map(|field_value| &field_value.value)
.cloned()
.next()
}
pub fn get_texts<'a>(&'a self, field: &TextField) -> Vec<&'a String> {
self.text_field_values
.iter()
.filter(|field_value| field_value.field == *field)
.map(|field_value| &field_value.text)
.collect()
}
pub fn get_first_text<'a>(&'a self, field: &TextField) -> Option<&'a String> {
self.text_field_values
.iter()
.filter(|field_value| field_value.field == *field)
.map(|field_value| &field_value.text)
.next()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_field_options() {
{
let field_options = STORED | FAST;
assert!(field_options.is_stored());
assert!(field_options.is_fast());
assert!(!field_options.is_tokenized_indexed());
}
{
let field_options = STORED | TEXT;
assert!(field_options.is_stored());
assert!(!field_options.is_fast());
assert!(field_options.is_tokenized_indexed());
}
}
#[test]
fn test_schema() {
{
let mut schema = Schema::new();
schema.add_text_field("body", &TEXT);
let field = schema.text_field("body");
assert!(schema.text_field_options(&field).is_tokenized_indexed());
}
}
}

View File

@@ -1,12 +1,11 @@
use core::reader::SegmentReader;
use core::index::Index;
use core::index::Segment;
use core::schema::DocId;
use core::schema::Document;
use core::collector::Collector;
use DocId;
use schema::{Document, Term};
use collector::Collector;
use std::io;
use core::timer::TimerTree;
use core::schema::Term;
use common::TimerTree;
#[derive(Debug)]
pub struct Searcher {
@@ -56,12 +55,12 @@ impl Searcher {
for (segment_ord, segment) in self.segments.iter().enumerate() {
let mut segment_search_timer = search_timer.open("segment_search");
{
let set_segment_timer = segment_search_timer.open("set_segment");
let _ = segment_search_timer.open("set_segment");
try!(collector.set_segment(segment_ord as SegmentLocalId, &segment));
}
let postings = segment.search(terms, segment_search_timer.open("get_postings"));
{
let collection_timer = segment_search_timer.open("collection");
let _collection_timer = segment_search_timer.open("collection");
for doc_id in postings {
collector.collect(doc_id);
}

View File

@@ -1,234 +0,0 @@
use libc::size_t;
use std::ptr;
extern {
// fn encode_unsorted_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t;
// fn decode_unsorted_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t;
fn intersection_native(left_data: *const u32, left_size: size_t, right_data: *const u32, right_size: size_t, output: *mut u32) -> size_t;
fn encode_sorted_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t;
fn decode_sorted_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t;
}
pub fn intersection(left: &[u32], right: &[u32], output: &mut [u32]) -> usize {
unsafe {
intersection_native(
left.as_ptr(), left.len(),
right.as_ptr(), right.len(),
output.as_mut_ptr())
}
}
pub struct Encoder {
input_buffer: Vec<u32>,
output_buffer: Vec<u32>,
}
impl Encoder {

    /// Creates an encoder with empty scratch buffers. The buffers grow
    /// on demand and are reused across calls.
    pub fn new() -> Encoder {
        Encoder {
            input_buffer: Vec::new(),
            output_buffer: Vec::new(),
        }
    }

    /// Delta/SIMD-compresses the sorted `input` and returns a slice over
    /// the internal output buffer holding the compressed u32s.
    /// The returned slice is only valid until the next call.
    pub fn encode_sorted(&mut self, input: &[u32]) -> &[u32] {
        let input_len = input.len();
        // Grow (never shrink) the scratch buffers. The previous version
        // called `input_buffer.clear()` first, which set its len to 0 and
        // made the grow test (`input_len + 10000 >= len`) always true,
        // forcing a zero-filling resize on *every* call. The 1024 slack
        // gives the codec headroom if the "compressed" form expands.
        let target_length = input_len + 1024;
        if self.input_buffer.len() < target_length {
            self.input_buffer.resize(target_length, 0);
            self.output_buffer.resize(target_length, 0);
        }
        // The native codec mutates its input in place (delta encoding),
        // so `input` must be copied into a scratch buffer first.
        self.input_buffer[..input_len].clone_from_slice(input);
        unsafe {
            // SAFETY: both pointers come from live Vecs sized above;
            // the native call writes at most `output_buffer.len()` u32s
            // and reads/writes exactly `input_len` input u32s.
            let written_size = encode_sorted_native(
                self.input_buffer.as_mut_ptr(),
                input_len as size_t,
                self.output_buffer.as_mut_ptr(),
                self.output_buffer.len() as size_t,
            );
            &self.output_buffer[..written_size]
        }
    }
}
/// Decompresses integer blocks produced by `Encoder`.
pub struct Decoder;

impl Decoder {
    pub fn new() -> Decoder {
        Decoder
    }

    /// Decodes `compressed_data` into `uncompressed_values`.
    /// Returns the number of u32 values actually decoded.
    pub fn decode_sorted(&self,
                         compressed_data: &[u32],
                         uncompressed_values: &mut [u32]) -> size_t {
        let num_compressed = compressed_data.len() as size_t;
        let output_capacity = uncompressed_values.len() as size_t;
        unsafe {
            decode_sorted_native(
                compressed_data.as_ptr(),
                num_compressed,
                uncompressed_values.as_mut_ptr(),
                output_capacity)
        }
    }

    // Unsorted decoding is not wired up yet; kept for reference.
    // pub fn decode_unsorted(&self,
    //                        compressed_data: &[u32],
    //                        uncompressed_values: &mut [u32]) -> size_t {
    //     unsafe {
    //         return decode_unsorted_native(
    //             compressed_data.as_ptr(),
    //             compressed_data.len() as size_t,
    //             uncompressed_values.as_mut_ptr(),
    //             uncompressed_values.len() as size_t);
    //     }
    // }
}
//
// pub struct Intersector {
// output_buffer: Vec<u32>,
// }
//
// impl Intersector {
// fn new() -> Intersector {
// Intersector::with_capacity(1_000_000)
// }
// fn with_capacity(capacity: usize) -> Intersector {
// Intersector {
// output_buffer: iter::repeat(0u32).take(capacity).collect()
// }
// }
// fn intersection(&mut self, left: &[u32], right: &[u32]) -> &[u32] {
// let max_intersection_length = min(left.len(), right.len());
// if self.output_buffer.len() < max_intersection_length {
// self.output_buffer.resize(max_intersection_length, 0);
// }
// unsafe {
// let intersection_len = intersection_native(
// left.as_ptr(), left.len() as size_t,
// right.as_ptr(), right.len() as size_t,
// self.output_buffer.as_mut_ptr());
// return &self.output_buffer[0..intersection_len];
// }
// }
// }
#[cfg(test)]
mod tests {
    use super::*;
    use test::Bencher;
    use rand::Rng;
    use rand::SeedableRng;
    use rand::XorShiftRng;

    /// Generates `n` strictly increasing u32 values: each candidate in
    /// 0..u32::MAX is kept with probability `ratio`, so the output is both
    /// sorted and unique. Deterministic for a given `seed_val`.
    fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec<u32> {
        let seed: &[u32; 4] = &[1, 2, 3, seed_val];
        let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
        (0..u32::max_value())
            .filter(|_| rng.next_f32()< ratio)
            .take(n)
            .collect()
    }

    /// Same as `generate_array_with_seed`, with a fixed seed of 4.
    fn generate_array(n: usize, ratio: f32) -> Vec<u32> {
        generate_array_with_seed(n, ratio, 4)
    }

    #[test]
    fn test_encode_big() {
        let mut encoder = Encoder::new();
        let num_ints = 10000 as usize;
        // Pinned compressed size (in u32 words) for this exact input;
        // catches silent changes to the codec's on-disk format.
        let expected_length = 1274;
        let input: Vec<u32> = (0..num_ints as u32)
            .map(|i| i * 7 / 2)
            .into_iter().collect();
        let encoded_data = encoder.encode_sorted(&input);
        assert_eq!(encoded_data.len(), expected_length);
        let decoder = Decoder::new();
        // Pre-sized output buffer; its initial contents are overwritten.
        let mut decoded_data: Vec<u32> = (0..num_ints as u32).collect();
        assert_eq!(num_ints, decoder.decode_sorted(&encoded_data[..], &mut decoded_data));
        assert_eq!(decoded_data, input);
    }

    // Unsorted round-trip test, disabled until encode_unsorted is wired up.
    // #[test]
    // fn test_encode_unsorted() {
    //     let mut encoder = Encoder::new();
    //     let num_ints = 10_000 as usize;
    //     let expected_length = 4361;
    //     let input: Vec<u32> = (0..num_ints as u32)
    //         .map(|i| i * 213_127 % 501)
    //         .into_iter().collect();
    //     assert_eq!(input.len(), 10_000);
    //     let encoded_data = encoder.encode_unsorted(&input);
    //     assert_eq!(encoded_data.len(), expected_length);
    //     let decoder = Decoder::new();
    //     let mut decoded_data: Vec<u32> = (0..num_ints as u32).collect();
    //     assert_eq!(num_ints, decoder.decode_unsorted(&encoded_data[..], &mut decoded_data));
    //     assert_eq!(decoded_data, input);
    // }
    //
    // #[test]
    // fn test_simd_intersection() {
    //     let mut intersector = Intersector::new();
    //     let arr1 = generate_array_with_seed(1_000_000, 0.1, 2);
    //     let arr2 = generate_array_with_seed(5_000_000, 0.5, 3);
    //     let intersection = intersector.intersection(&arr1[..], &arr2[..]) ;
    //     assert_eq!(intersection.len(), 500_233);
    // }

    /// Measures decode throughput on a 1M-element sorted array.
    #[bench]
    fn bench_decode(b: &mut Bencher) {
        const TEST_SIZE: usize = 1_000_000;
        let arr = generate_array(TEST_SIZE, 0.1);
        let mut encoder = Encoder::new();
        let encoded = encoder.encode_sorted(&arr);
        let mut uncompressed: Vec<u32> = (0..TEST_SIZE as u32).collect();
        let decoder = Decoder;
        b.iter(|| {
            decoder.decode_sorted(&encoded, &mut uncompressed);
        });
    }

    // #[bench]
    // fn bench_simd_intersection(b: &mut Bencher) {
    //     let mut intersector = Intersector::new();
    //     let arr1 = generate_array_with_seed(1_000_000, 0.1, 2);
    //     let arr2 = generate_array_with_seed(5_000_000, 0.5, 3);
    //     b.iter(|| {
    //         intersector.intersection(&arr1[..], &arr2[..]).len()
    //     });
    // }
}

View File

@@ -1,286 +0,0 @@
use core::directory::WritePtr;
use std::cell::RefCell;
use core::schema::DocId;
use core::schema::Document;
use core::schema::TextFieldValue;
use core::serialize::BinarySerializable;
use core::directory::ReadOnlySource;
use std::io::Write;
use std::io::Read;
use std::io::Cursor;
use std::io;
use std::io::SeekFrom;
use std::io::Seek;
use std::cmp::Ordering;
use lz4;
// TODO cache uncompressed pages
const BLOCK_SIZE: usize = 131_072;
// One block-index entry: (number of docs written when the block closed,
// total bytes written at that point) — i.e. the doc id and byte offset at
// which the *next* block starts. The first entry is implicitly (0, 0).
#[derive(Debug, Clone, PartialEq, Eq, Ord, PartialOrd)]
struct OffsetIndex(DocId, u64);
/// Writes documents' stored text fields as LZ4-compressed blocks,
/// terminated by an offset table and a pointer to it (see `close`).
pub struct StoreWriter {
    doc: DocId,                   // number of documents written so far
    offsets: Vec<OffsetIndex>, // TODO have a better index.
    written: u64,                 // bytes written to `writer` so far
    writer: WritePtr,
    intermediary_buffer: Vec<u8>, // scratch: one serialized doc / one compressed block
    current_block: Vec<u8>,       // uncompressed block currently being filled
}
impl BinarySerializable for OffsetIndex {
    /// Writes the doc id followed by the byte offset;
    /// returns the total number of bytes written.
    fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
        let OffsetIndex(doc, offset) = *self;
        let doc_bytes = try!(doc.serialize(writer));
        let offset_bytes = try!(offset.serialize(writer));
        Ok(doc_bytes + offset_bytes)
    }

    /// Reads back a doc id then an offset, in the order `serialize` wrote them.
    fn deserialize(reader: &mut Read) -> io::Result<OffsetIndex> {
        let doc = try!(DocId::deserialize(reader));
        let offset = try!(u64::deserialize(reader));
        Ok(OffsetIndex(doc, offset))
    }
}
impl StoreWriter {
    /// Creates a store writer that appends compressed blocks to `writer`.
    pub fn new(writer: WritePtr) -> StoreWriter {
        StoreWriter {
            doc: 0,
            written: 0,
            offsets: Vec::new(),
            writer: writer,
            intermediary_buffer: Vec::new(),
            current_block: Vec::new(),
        }
    }

    /// Appends the raw (already compressed) body of another store, shifting
    /// its offset entries by what has been written here so far.
    pub fn stack_reader(&mut self, reader: &StoreReader) -> io::Result<()> {
        // Flush any partially filled block first so blocks never interleave.
        if self.current_block.len() > 0 {
            try!(self.write_and_compress_block());
        }
        match reader.offsets.last() {
            // The reader's last entry holds (its total docs, its body size).
            Some(&OffsetIndex(ref num_docs, ref body_size)) => {
                try!(self.writer.write_all(&reader.data.as_slice()[0..*body_size as usize]));
                for &OffsetIndex(doc, offset) in reader.offsets.iter() {
                    self.offsets.push(OffsetIndex(self.doc + doc, self.written + offset));
                }
                self.written += *body_size;
                self.doc += *num_docs;
                Ok(())
            },
            None => {
                Err(io::Error::new(io::ErrorKind::Other, "No offset for reader"))
            }
        }
    }

    /// Serializes one document's stored text fields into the current block.
    /// The block is flushed once it grows beyond BLOCK_SIZE.
    pub fn store<'a>(&mut self, field_values: &Vec<&'a TextFieldValue>) -> io::Result<()> {
        self.intermediary_buffer.clear();
        // Doc payload layout: field count (u32), then each field value.
        try!((field_values.len() as u32).serialize(&mut self.intermediary_buffer));
        for field_value in field_values.iter() {
            try!((*field_value).serialize(&mut self.intermediary_buffer));
        }
        // Each doc is prefixed in the block by its serialized byte length.
        try!((self.intermediary_buffer.len() as u32).serialize(&mut self.current_block));
        try!(self.current_block.write_all(&self.intermediary_buffer[..]));
        self.doc += 1;
        if self.current_block.len() > BLOCK_SIZE {
            try!(self.write_and_compress_block());
        }
        Ok(())
    }

    /// LZ4-compresses the current block and writes it as
    /// (compressed length: u32, compressed bytes), then records an offset entry.
    fn write_and_compress_block(&mut self,) -> io::Result<()> {
        self.intermediary_buffer.clear();
        {
            let mut encoder = lz4::EncoderBuilder::new()
                .build(&mut self.intermediary_buffer)
                .unwrap();
            try!(encoder.write_all(&self.current_block));
            let (_, encoder_result) = encoder.finish();
            try!(encoder_result);
        }
        let compressed_block_size = self.intermediary_buffer.len() as u64;
        // `serialize` returns the number of bytes it wrote (the u32 prefix).
        self.written += try!((compressed_block_size as u32).serialize(&mut self.writer)) as u64;
        try!(self.writer.write_all(&self.intermediary_buffer));
        self.written += compressed_block_size;
        // Record (docs so far, bytes so far): the start of the *next* block.
        self.offsets.push(OffsetIndex(self.doc, self.written));
        self.current_block.clear();
        Ok(())
    }

    /// Flushes the last block, then appends the offset table and, at the very
    /// end, the 8-byte position of that table (read back by the StoreReader).
    pub fn close(&mut self,) -> io::Result<()> {
        if self.current_block.len() > 0 {
            try!(self.write_and_compress_block());
        }
        let header_offset: u64 = self.written;
        try!(self.offsets.serialize(&mut self.writer));
        try!(header_offset.serialize(&mut self.writer));
        self.writer.flush()
    }
}
/// Reads documents back from a store written by `StoreWriter`.
pub struct StoreReader {
    data: ReadOnlySource,
    offsets: Vec<OffsetIndex>,       // block index, loaded eagerly from the footer
    current_block: RefCell<Vec<u8>>, // last decompressed block (interior mutability for `get`)
}
impl StoreReader {
    /// Reads the offset table: the last 8 bytes of the data hold the table's
    /// position, written by `StoreWriter::close`.
    fn read_header(data: &ReadOnlySource) -> Vec<OffsetIndex> {
        // TODO err
        // the first offset is implicitely (0, 0)
        let mut offsets = vec!(OffsetIndex(0, 0));
        let mut cursor = Cursor::new(data.as_slice());
        cursor.seek(SeekFrom::End(-8)).unwrap();
        let offset = u64::deserialize(&mut cursor).unwrap();
        cursor.seek(SeekFrom::Start(offset)).unwrap();
        offsets.append(&mut Vec::deserialize(&mut cursor).unwrap());
        offsets
    }

    /// Binary search for the entry of the block containing `seek`:
    /// (first doc id of that block, byte offset of that block).
    /// NOTE(review): assumes `seek` lies within the stored doc id range.
    fn block_offset(&self, seek: &DocId) -> OffsetIndex {
        fn search(offsets: &[OffsetIndex], seek: &DocId) -> OffsetIndex {
            let m = offsets.len() / 2;
            let pivot_offset = &offsets[m];
            if offsets.len() <= 1 {
                return pivot_offset.clone()
            }
            match pivot_offset.0.cmp(seek) {
                // Pivot is kept in the right half so the greatest entry
                // with doc id <= seek is eventually returned.
                Ordering::Less => search(&offsets[m..], seek),
                Ordering::Equal => pivot_offset.clone(),
                Ordering::Greater => search(&offsets[..m], seek),
            }
        }
        search(&self.offsets, seek)
    }

    /// Decompresses the block at `block_offset` into `current_block`.
    /// Block layout: (compressed length: u32, lz4-compressed bytes).
    fn read_block(&self, block_offset: usize) -> io::Result<()> {
        let mut current_block_mut = self.current_block.borrow_mut();
        current_block_mut.clear();
        let total_buffer = self.data.as_slice();
        let mut cursor = Cursor::new(&total_buffer[block_offset..]);
        let block_length = u32::deserialize(&mut cursor).unwrap();
        let block_array: &[u8] = &total_buffer[(block_offset + 4 as usize)..(block_offset + 4 + block_length as usize)];
        let mut lz4_decoder = lz4::Decoder::new(Cursor::new(block_array)).unwrap();
        lz4_decoder.read_to_end(&mut current_block_mut).map(|_| ())
    }

    /// Fetches one document: locates and decompresses its block, skips over
    /// the length-prefixed documents that precede it, then deserializes it.
    pub fn get(&self, doc_id: &DocId) -> io::Result<Document> {
        let OffsetIndex(first_doc_id, block_offset) = self.block_offset(doc_id);
        try!(self.read_block(block_offset as usize));
        let mut current_block_mut = self.current_block.borrow_mut();
        let mut cursor = Cursor::new(&mut current_block_mut[..]);
        // Skip earlier docs in the block using their length prefixes.
        for _ in first_doc_id..*doc_id {
            let block_length = try!(u32::deserialize(&mut cursor));
            try!(cursor.seek(SeekFrom::Current(block_length as i64)));
        }
        // Skip the target doc's own length prefix.
        try!(u32::deserialize(&mut cursor));
        let mut text_field_values = Vec::new();
        let num_fields = try!(u32::deserialize(&mut cursor));
        for _ in 0..num_fields {
            let text_field_value = try!(TextFieldValue::deserialize(&mut cursor));
            text_field_values.push(text_field_value);
        }
        // u32 stored fields are not persisted in the store (yet).
        let u32_field_values = Vec::new();
        Ok(Document {
            text_field_values: text_field_values,
            u32_field_values: u32_field_values,
        })
    }

    /// Opens a reader over `data`, eagerly loading the block index.
    pub fn new(data: ReadOnlySource) -> StoreReader {
        let offsets = StoreReader::read_header(&data);
        StoreReader {
            data: data,
            offsets: offsets,
            current_block: RefCell::new(Vec::new()),
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use test::Bencher;
    use std::path::PathBuf;
    use core::schema::Schema;
    use core::schema::TextOptions;
    use core::schema::TextFieldValue;
    use core::directory::{RAMDirectory, Directory, MmapDirectory, WritePtr};

    /// Writes 1000 two-field documents ("body" = fixed lorem text,
    /// "title" = "Doc {i}") through a StoreWriter and returns the schema.
    fn write_lorem_ipsum_store(writer: WritePtr) -> Schema {
        let mut schema = Schema::new();
        let field_body = schema.add_text_field("body", &TextOptions::new().set_stored());
        let field_title = schema.add_text_field("title", &TextOptions::new().set_stored());
        let lorem = String::from("Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.");
        {
            let mut store_writer = StoreWriter::new(writer);
            for i in 0..1000 {
                let mut fields: Vec<TextFieldValue> = Vec::new();
                {
                    let field_value = TextFieldValue {
                        field: field_body.clone(),
                        text: lorem.clone(),
                    };
                    fields.push(field_value);
                }
                {
                    let title_text = format!("Doc {}", i);
                    let field_value = TextFieldValue {
                        field: field_title.clone(),
                        text: title_text,
                    };
                    fields.push(field_value);
                }
                let fields_refs: Vec<&TextFieldValue> = fields.iter().collect();
                store_writer.store(&fields_refs).unwrap();
            }
            store_writer.close().unwrap();
        }
        schema
    }

    /// Round trip: a sample of doc ids must read back their own title.
    #[test]
    fn test_store() {
        let path = PathBuf::from("store");
        let mut directory = RAMDirectory::create();
        let store_file = directory.open_write(&path).unwrap();
        let schema = write_lorem_ipsum_store(store_file);
        let field_title = schema.text_field("title");
        let store_source = directory.open_read(&path).unwrap();
        let store = StoreReader::new(store_source);
        for i in (0..10).map(|i| i * 3 / 2) {
            assert_eq!(*store.get(&i).unwrap().get_first_text(&field_title).unwrap(), format!("Doc {}", i));
        }
    }

    /// Measures full store write throughput (1000 docs per iteration).
    #[bench]
    fn bench_store_encode(b: &mut Bencher) {
        let mut directory = MmapDirectory::create_from_tempdir().unwrap();
        let path = PathBuf::from("store");
        b.iter(|| {
            write_lorem_ipsum_store(directory.open_write(&path).unwrap());
        });
    }

    /// Measures single-document random access (block decompress + skip).
    #[bench]
    fn bench_store_decode(b: &mut Bencher) {
        let mut directory = MmapDirectory::create_from_tempdir().unwrap();
        let path = PathBuf::from("store");
        write_lorem_ipsum_store(directory.open_write(&path).unwrap());
        let store_source = directory.open_read(&path).unwrap();
        let store = StoreReader::new(store_source);
        b.iter(|| {
            store.get(&12).unwrap();
        });
    }
}

View File

@@ -1,119 +0,0 @@
use time::PreciseTime;
use rustc_serialize::json::ToJson;
use rustc_serialize::json::Json;
use std::collections::BTreeMap;
/// RAII guard for one timing span: records its duration into the owning
/// `TimerTree` when it goes out of scope.
pub struct TimerHandle<'a> {
    name: &'static str,
    timer_tree: &'a mut TimerTree,
    start: PreciseTime,
    depth: u32,
}

impl<'a> TimerHandle<'a> {
    /// Starts a child span, one nesting level deeper than `self`.
    pub fn open(&mut self, name: &'static str) -> TimerHandle {
        let child_depth = self.depth + 1;
        TimerHandle {
            name: name,
            timer_tree: self.timer_tree,
            start: PreciseTime::now(),
            depth: child_depth,
        }
    }
}

impl<'a> Drop for TimerHandle<'a> {
    /// Closes the span: pushes its measured duration (in microseconds)
    /// onto the tree's flat timing list.
    fn drop(&mut self,) {
        let elapsed_micros = self.start.to(PreciseTime::now()).num_microseconds().unwrap();
        self.timer_tree.timings.push(Timing {
            name: self.name,
            duration: elapsed_micros,
            depth: self.depth,
        });
    }
}
// One closed timing span: `duration` is in microseconds (see
// `Drop for TimerHandle`), `depth` is the nesting level (0 = root).
#[derive(Debug)]
pub struct Timing {
    name: &'static str,
    duration: i64,
    depth: u32,
}

// Flat list of timings in drop (close) order; the nested structure is
// rebuilt from the `depth` fields by `to_json_array`.
#[derive(Debug)]
pub struct TimerTree {
    timings: Vec<Timing>,
}
impl TimerTree {
    /// An empty tree with no recorded timings.
    pub fn new() -> TimerTree {
        let timings = Vec::new();
        TimerTree { timings: timings }
    }

    /// Starts a root-level (depth 0) timing span.
    pub fn open(&mut self, name: &'static str) -> TimerHandle {
        TimerHandle {
            name: name,
            depth: 0,
            start: PreciseTime::now(),
            timer_tree: self,
        }
    }
}
/// Builds the JSON object for the span ending at the last element of
/// `timings`; all earlier elements become its children.
fn to_json_obj(timings: &[Timing], root_depth: u32) -> Json {
    let (children, last) = timings.split_at(timings.len() - 1);
    let last_timing = &last[0];
    let mut d = BTreeMap::new();
    d.insert("name".to_string(), last_timing.name.to_json());
    d.insert("duration".to_string(), last_timing.duration.to_json());
    if !children.is_empty() {
        d.insert("children".to_string(), to_json_array(children, root_depth + 1));
    }
    Json::Object(d)
}
/// Converts the flat `timings` list into a JSON array of nested spans.
///
/// Fix: the `filter` closure only inspects the timing, so the enumeration
/// index is bound as `_` to silence the unused-variable warning.
fn to_json_array(timings: &[Timing], root_depth: u32) -> Json {
    // Boundaries (exclusive end indices) of each span at `root_depth`.
    let mut offsets: Vec<usize> = vec!(0);
    for offset in timings.iter()
        .enumerate()
        .filter(|&(_, timing)| timing.depth == root_depth)
        .map(|(offset, _)| offset) {
        offsets.push(offset + 1);
    }
    // Each consecutive pair of boundaries delimits one span slice.
    let items: Vec<Json> = offsets.iter()
        .zip(offsets[1..].iter())
        .map(|(&start, &stop)| to_json_obj(&timings[start..stop], root_depth))
        .collect();
    Json::Array(items)
}
impl ToJson for TimerTree {
    /// Renders the whole tree as a JSON array of nested timing objects.
    fn to_json(&self) -> Json {
        to_json_array(&self.timings, 0)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_timer() {
        let mut timer_tree = TimerTree::new();
        {
            let mut a = timer_tree.open("a");
            {
                let mut ab = a.open("b");
                {
                    // Leading underscores: the guards are held for their
                    // Drop side effect only.
                    let _abc = ab.open("c");
                }
                {
                    let _abd = ab.open("d");
                }
            }
        }
        // "c", "d", "b", "a" are recorded, in drop order.
        assert_eq!(timer_tree.timings.len(), 4);
    }
}

View File

@@ -1,13 +1,17 @@
use core::schema::*;
use DocId;
use schema::Schema;
use schema::Document;
use schema::Term;
use schema::TextFieldValue;
use core::codec::*;
use core::index::Index;
use core::analyzer::SimpleTokenizer;
use analyzer::SimpleTokenizer;
use core::index::SerializableSegment;
use core::analyzer::StreamingIterator;
use analyzer::StreamingIterator;
use core::index::Segment;
use core::index::SegmentInfo;
use core::postings::PostingsWriter;
use core::fastfield::U32FastFieldsWriter;
use postings::PostingsWriter;
use fastfield::U32FastFieldsWriter;
use std::clone::Clone;
use std::sync::mpsc;
use std::thread;
@@ -158,11 +162,13 @@ impl SegmentWriter {
let field_options = schema.text_field_options(&field_value.field);
if field_options.is_tokenized_indexed() {
let mut tokens = self.tokenizer.tokenize(&field_value.text);
let mut pos = 0u32;
loop {
match tokens.next() {
Some(token) => {
let term = Term::from_field_text(&field_value.field, token);
self.postings_writer.suscribe(doc_id, term);
self.postings_writer.suscribe(doc_id, pos.clone(), term);
pos += 1;
},
None => { break; }
}
@@ -173,7 +179,7 @@ impl SegmentWriter {
let field_options = schema.u32_field_options(&field_value.field);
if field_options.is_indexed() {
let term = Term::from_field_u32(&field_value.field, field_value.value);
self.postings_writer.suscribe(doc_id, term);
self.postings_writer.suscribe(doc_id, 0.clone(), term);
}
}
self.fast_field_writers.add_document(&doc);

View File

@@ -5,8 +5,9 @@ use std::io::Cursor;
use fst;
use fst::raw::Fst;
use fst::Streamer;
use core::directory::ReadOnlySource;
use core::serialize::BinarySerializable;
use directory::ReadOnlySource;
use common::BinarySerializable;
use std::marker::PhantomData;
fn convert_fst_error(e: fst::Error) -> io::Error {
@@ -125,9 +126,8 @@ impl<V: BinarySerializable> FstMap<V> {
#[cfg(test)]
mod tests {
use super::*;
use core::directory::{RAMDirectory, Directory};
use directory::{RAMDirectory, Directory};
use std::path::PathBuf;
use fst::Streamer;
#[test]

4
src/datastruct/mod.rs Normal file
View File

@@ -0,0 +1,4 @@
mod fstmap;
pub use self::fstmap::FstMapBuilder;
pub use self::fstmap::FstMap;
pub use self::fstmap::FstMapIter;

View File

@@ -0,0 +1,23 @@
use std::marker::Send;
use std::marker::Sync;
use std::io;
use std::fmt;
use std::path::Path;
use directory::{ReadOnlySource, WritePtr};
///////////////////////////////////////////////////////////////
//
// #[derive(Debug)]
// pub enum CreateError {
// RootDirectoryDoesNotExist,
// DirectoryAlreadyExists,
// CannotCreateTempDirectory(io::Error),
// }
/// Abstraction over the index's storage: files are written through
/// `open_write`, then read back as read-only sources.
pub trait Directory: fmt::Debug + Send + Sync {
    /// Opens the file at `path` for reading.
    fn open_read(&self, path: &Path) -> io::Result<ReadOnlySource>;
    /// Creates the file at `path` and returns a seekable writer to it.
    fn open_write(&mut self, path: &Path) -> io::Result<WritePtr>;
    /// Atomically replaces the content of the file at `path` with `data`.
    fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()>;
    /// Flushes the file at `path` to durable storage.
    fn sync(&self, path: &Path) -> io::Result<()>;
    /// Flushes the directory entry itself (file creations / renames).
    fn sync_directory(&self,) -> io::Result<()>;
}

View File

@@ -0,0 +1,99 @@
use std::path::{Path, PathBuf};
use tempdir::TempDir;
use std::collections::HashMap;
use std::collections::hash_map::Entry as HashMapEntry;
use fst::raw::MmapReadOnly;
use std::fs::File;
use atomicwrites;
use std::sync::RwLock;
use std::fmt;
use std::io::Write;
use std::io;
use directory::Directory;
use directory::ReadOnlySource;
use directory::WritePtr;
use std::io::BufWriter;
////////////////////////////////////////////////////////////////
// MmapDirectory
/// `Directory` implementation backed by memory-mapped files on disk.
pub struct MmapDirectory {
    root_path: PathBuf,
    // Cache of open mmaps keyed by absolute path, shared across readers.
    mmap_cache: RwLock<HashMap<PathBuf, MmapReadOnly>>,
    // Keeps the temp directory handle alive when the directory was built via
    // `create_from_tempdir` (presumably cleaned up on drop — TempDir behavior).
    _temp_directory: Option<TempDir>,
}

impl fmt::Debug for MmapDirectory {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "MmapDirectory({:?})", self.root_path)
    }
}
impl MmapDirectory {
    /// Creates a directory rooted in a fresh temporary directory.
    /// The temp dir handle is kept alive inside the returned value.
    pub fn create_from_tempdir() -> io::Result<MmapDirectory> {
        // TODO error management
        let tempdir = try!(TempDir::new("index"));
        // Cleanup: build the root PathBuf once instead of the previous
        // PathBuf -> PathBuf double conversion.
        let tempdir_path = PathBuf::from(tempdir.path());
        let directory = MmapDirectory {
            root_path: tempdir_path,
            mmap_cache: RwLock::new(HashMap::new()),
            _temp_directory: Some(tempdir)
        };
        Ok(directory)
    }

    /// Creates a directory rooted at an existing `filepath`.
    pub fn create(filepath: &Path) -> io::Result<MmapDirectory> {
        Ok(MmapDirectory {
            root_path: PathBuf::from(filepath),
            mmap_cache: RwLock::new(HashMap::new()),
            _temp_directory: None
        })
    }

    /// Resolves a path relative to the directory root.
    fn resolve_path(&self, relative_path: &Path) -> PathBuf {
        self.root_path.join(relative_path)
    }
}
impl Directory for MmapDirectory {
    /// Opens (and caches) a read-only mmap of the file.
    fn open_read(&self, path: &Path) -> io::Result<ReadOnlySource> {
        let full_path = self.resolve_path(path);
        // NOTE(review): takes the write lock even on cache hits; fine as long
        // as open_read is not on a hot path.
        let mut mmap_cache = self.mmap_cache.write().unwrap();
        let mmap = match mmap_cache.entry(full_path.clone()) {
            HashMapEntry::Occupied(e) => e.get().clone(),
            HashMapEntry::Vacant(vacant_entry) => {
                let new_mmap = try!(MmapReadOnly::open_path(full_path.clone()));
                vacant_entry.insert(new_mmap.clone());
                new_mmap
            }
        };
        Ok(ReadOnlySource::Mmap(mmap))
    }

    /// Creates (truncating) the file and returns a buffered writer to it.
    fn open_write(&mut self, path: &Path) -> io::Result<WritePtr> {
        let full_path = self.resolve_path(path);
        let file = try!(File::create(full_path));
        let buf_writer = BufWriter::new(file);
        Ok(Box::new(buf_writer))
    }

    /// Atomically replaces the file's content (via the atomicwrites crate,
    /// presumably write-to-temp-then-rename).
    fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
        let full_path = self.resolve_path(path);
        let meta_file = atomicwrites::AtomicFile::new(full_path, atomicwrites::AllowOverwrite);
        meta_file.write(|f| {
            f.write_all(data)
        })
    }

    /// fsyncs the given file.
    fn sync(&self, path: &Path) -> io::Result<()> {
        let full_path = self.resolve_path(path);
        File::open(&full_path).and_then(|fd| fd.sync_all())
    }

    /// fsyncs the root directory entry itself.
    fn sync_directory(&self,) -> io::Result<()> {
        File::open(&self.root_path).and_then(|fd| fd.sync_all())
    }
}

113
src/directory/mod.rs Normal file
View File

@@ -0,0 +1,113 @@
mod mmap_directory;
mod ram_directory;
mod directory;
use std::ops::Deref;
use std::io::{Seek, Write, Cursor};
use fst::raw::MmapReadOnly;
pub use self::directory::Directory;
pub use self::ram_directory::RAMDirectory;
pub use self::mmap_directory::MmapDirectory;
////////////////////////////////////////
// WritePtr
// Anything both writable and seekable — blanket-implemented for every
// `Seek + Write` type so it can be boxed behind one trait object.
pub trait SeekableWrite: Seek + Write {}
impl<T: Seek + Write> SeekableWrite for T {}
// Boxed writer handed out by `Directory::open_write`.
pub type WritePtr = Box<SeekableWrite>;
////////////////////////////////////////
// Read only source.
/// Read-only view over a file's bytes: either a shared memory map
/// or an owned in-memory buffer.
pub enum ReadOnlySource {
    Mmap(MmapReadOnly),
    Anonymous(Vec<u8>),
}
// Lets a ReadOnlySource be used directly wherever a byte slice is expected.
impl Deref for ReadOnlySource {
    type Target = [u8];
    fn deref(&self) -> &[u8] {
        self.as_slice()
    }
}
impl ReadOnlySource {
    /// Total number of bytes in the source.
    pub fn len(&self,) -> usize {
        self.as_slice().len()
    }

    /// Borrows the full content as a byte slice.
    pub fn as_slice(&self,) -> &[u8] {
        match *self {
            ReadOnlySource::Mmap(ref mmap) => unsafe { mmap.as_slice() },
            ReadOnlySource::Anonymous(ref bytes) => &bytes[..],
        }
    }

    /// Cursor over the full content, handy for deserialization.
    pub fn cursor<'a>(&'a self) -> Cursor<&'a [u8]> {
        Cursor::new(self.as_slice())
    }

    /// Returns a source restricted to `[from_offset, to_offset)`.
    /// Mmap sources stay zero-copy; anonymous sources copy the range.
    pub fn slice(&self, from_offset:usize, to_offset:usize) -> ReadOnlySource {
        match *self {
            ReadOnlySource::Mmap(ref mmap) => {
                let sub_mmap = mmap.range(from_offset, to_offset - from_offset);
                ReadOnlySource::Mmap(sub_mmap)
            }
            ReadOnlySource::Anonymous(ref bytes) => {
                let sub_bytes = bytes[from_offset..to_offset].to_vec();
                ReadOnlySource::Anonymous(sub_bytes)
            },
        }
    }
}
impl Clone for ReadOnlySource {
    /// Clones by re-slicing the full range
    /// (cheap for mmaps, a full copy for anonymous buffers).
    fn clone(&self) -> Self {
        let num_bytes = self.len();
        self.slice(0, num_bytes)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::Path;

    #[test]
    fn test_ram_directory() {
        let mut ram_directory = RAMDirectory::create();
        test_directory(&mut ram_directory);
    }

    #[test]
    fn test_mmap_directory() {
        let mut mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
        test_directory(&mut mmap_directory);
    }

    /// Writes a few bytes through any `Directory` implementation in three
    /// separate calls, then reads them back and checks the round trip.
    fn test_directory(directory: &mut Directory) {
        {
            let mut write_file = directory.open_write(Path::new("toto")).unwrap();
            write_file.write_all(&[4]).unwrap();
            write_file.write_all(&[3]).unwrap();
            write_file.write_all(&[7, 3, 5]).unwrap();
        }
        let read_file = directory.open_read(Path::new("toto")).unwrap();
        let data: &[u8] = &*read_file;
        assert_eq!(data, &[4u8, 3, 7, 3, 5][..]);
    }
}

View File

@@ -0,0 +1,87 @@
use directory::{Directory, ReadOnlySource};
use std::io::{Cursor, Write, Seek, SeekFrom};
use std::io;
use atomicwrites;
use std::fmt;
use std::sync::{Arc, RwLock};
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use directory::WritePtr;
// A cursor over a byte buffer that can be shared between the writer handed
// to clients and the directory's own file map.
#[derive(Clone)]
struct SharedVec(Arc<RwLock<Cursor<Vec<u8>>>>);

/// In-memory, single-process implementation of `Directory`.
pub struct RAMDirectory {
    // "file system": one shared growable buffer per path.
    fs: HashMap<PathBuf, SharedVec>,
}

impl SharedVec {
    // Fresh, empty shared buffer with its cursor at position 0.
    fn new() -> SharedVec {
        SharedVec(Arc::new( RwLock::new(Cursor::new(Vec::new())) ))
    }
}
impl Write for SharedVec {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
try!(self.0.write().unwrap().write(buf));
Ok(buf.len())
}
fn flush(&mut self) -> io::Result<()> {
Ok(())
}
}
impl Seek for SharedVec {
    /// Delegates seeking to the inner cursor.
    fn seek(&mut self, pos: SeekFrom) -> io::Result<u64> {
        let mut cursor = self.0.write().unwrap();
        cursor.seek(pos)
    }
}

impl fmt::Debug for RAMDirectory {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str("RAMDirectory")
    }
}
impl RAMDirectory {
    /// Creates an empty in-memory directory.
    pub fn create() -> RAMDirectory {
        let fs = HashMap::new();
        RAMDirectory { fs: fs }
    }
}
impl Directory for RAMDirectory {
    /// Returns a snapshot *copy* of the file's current content;
    /// later writes to the shared buffer are not reflected in it.
    fn open_read(&self, path: &Path) -> io::Result<ReadOnlySource> {
        match self.fs.get(path) {
            Some(ref data) => {
                let data_copy = (*data).0.read().unwrap().clone();
                Ok(ReadOnlySource::Anonymous(data_copy.into_inner()))
            },
            None =>
                Err(io::Error::new(io::ErrorKind::NotFound, format!("File has never been created. {:?}", path)))
        }
    }

    /// Registers a fresh shared buffer under `path` and returns a writer to
    /// it. Any existing buffer at the same path is replaced.
    fn open_write(&mut self, path: &Path) -> io::Result<WritePtr> {
        let full_path = PathBuf::from(&path);
        let data = SharedVec::new();
        self.fs.insert(full_path, data.clone());
        Ok(Box::new(data))
    }

    // NOTE(review): despite being a RAM directory, this writes to the *real*
    // filesystem at `path` — confirm this is intentional (e.g. for metadata).
    fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
        let meta_file = atomicwrites::AtomicFile::new(PathBuf::from(path), atomicwrites::AllowOverwrite);
        meta_file.write(|f| {
            f.write_all(data)
        })
    }

    /// No-op: in-memory storage has nothing to fsync.
    fn sync(&self, _: &Path) -> io::Result<()> {
        Ok(())
    }

    /// No-op: in-memory storage has nothing to fsync.
    fn sync_directory(&self,) -> io::Result<()> {
        Ok(())
    }
}

View File

@@ -5,6 +5,7 @@ use std::num::Wrapping;
// ported from libdivide.h by ridiculous_fish
const LIBDIVIDE_32_SHIFT_MASK: u8 = 0x1F;
const LIBDIVIDE_ADD_MARKER: u8 = 0x40;
const LIBDIVIDE_U32_SHIFT_PATH: u8 = 0x80;

View File

@@ -1,300 +1,39 @@
use std::io::Write;
use std::io;
use std::io::SeekFrom;
use std::io::Seek;
use core::directory::WritePtr;
use core::serialize::BinarySerializable;
use core::directory::ReadOnlySource;
use std::collections::HashMap;
use core::schema::DocId;
use core::schema::Schema;
use core::schema::Document;
use std::ops::Deref;
use core::fastdivide::count_leading_zeros;
use core::fastdivide::DividerU32;
use core::schema::U32Field;
mod fastdivide;
mod reader;
mod writer;
mod serializer;
pub fn compute_num_bits(amplitude: u32) -> u8 {
pub use self::fastdivide::DividerU32;
pub use self::writer::{U32FastFieldsWriter, U32FastFieldWriter};
pub use self::reader::{U32FastFieldsReader, U32FastFieldReader};
pub use self::serializer::FastFieldSerializer;
use self::fastdivide::count_leading_zeros;
// Number of bits needed to represent `amplitude`
// (e.g. amplitude 1 -> 1 bit — see the unit test below).
fn compute_num_bits(amplitude: u32) -> u8 {
    32u8 - count_leading_zeros(amplitude)
}
/// Streams bit-packed u32 fast fields into a writer.
/// Layout: [header offset: u32] [per-field sections] [field table].
pub struct FastFieldSerializer {
    write: WritePtr,
    written_size: usize,          // bytes written so far
    fields: Vec<(U32Field, u32)>, // (field, start offset) table, written on close
    num_bits: u8,                 // bit width of the field currently open
    min_value: u32,               // values are stored as deltas from this
    field_open: bool,             // guards against interleaved field sections
    mini_buffer_written: usize,   // bits used in the 64-bit staging buffer
    mini_buffer: u64,             // staging buffer for bit packing
}
impl FastFieldSerializer {
    /// Starts a serializer over `write`. Reserves 4 bytes at the front for
    /// the offset of the field table that `close` writes.
    pub fn new(mut write: WritePtr) -> io::Result<FastFieldSerializer> {
        // just making room for the pointer to header.
        let written_size: usize = try!(0u32.serialize(&mut write));
        Ok(FastFieldSerializer {
            write: write,
            written_size: written_size,
            fields: Vec::new(),
            num_bits: 0u8,
            field_open: false,
            mini_buffer_written: 0,
            mini_buffer: 0,
            min_value: 0,
        })
    }

    /// Opens a new field section: writes (min_value, amplitude) and derives
    /// the per-value bit width from the amplitude.
    pub fn new_u32_fast_field(&mut self, field: U32Field, min_value: u32, max_value: u32) -> io::Result<()> {
        if self.field_open {
            return Err(io::Error::new(io::ErrorKind::Other, "Previous field not closed"));
        }
        self.min_value = min_value;
        self.field_open = true;
        // Remember where this field starts, for the table written by close().
        self.fields.push((field, self.written_size as u32));
        let write: &mut Write = &mut self.write;
        self.written_size += try!(min_value.serialize(write));
        let amplitude = max_value - min_value;
        self.written_size += try!(amplitude.serialize(write));
        self.num_bits = compute_num_bits(amplitude);
        Ok(())
    }

    /// Bit-packs `val - min_value` into the 64-bit mini buffer, flushing the
    /// buffer first when the value would not fit in the remaining bits.
    pub fn add_val(&mut self, val: u32) -> io::Result<()> {
        let write: &mut Write = &mut self.write;
        if self.mini_buffer_written + (self.num_bits as usize) > 64 {
            self.written_size += try!(self.mini_buffer.serialize(write));
            self.mini_buffer = 0;
            self.mini_buffer_written = 0;
        }
        self.mini_buffer |= ((val - self.min_value) as u64) << self.mini_buffer_written;
        self.mini_buffer_written += self.num_bits as usize;
        Ok(())
    }

    /// Flushes the last partial mini buffer and closes the field section.
    pub fn close_field(&mut self,) -> io::Result<()> {
        if !self.field_open {
            return Err(io::Error::new(io::ErrorKind::Other, "Current field is already closed"));
        }
        self.field_open = false;
        if self.mini_buffer_written > 0 {
            self.mini_buffer_written = 0;
            self.written_size += try!(self.mini_buffer.serialize(&mut self.write));
        }
        self.mini_buffer = 0;
        Ok(())
    }

    /// Writes the (field, start offset) table, then backpatches the reserved
    /// leading u32 with the table's offset. Returns total bytes written.
    pub fn close(mut self,) -> io::Result<usize> {
        if self.field_open {
            return Err(io::Error::new(io::ErrorKind::Other, "Last field not closed"));
        }
        let header_offset: usize = self.written_size;
        self.written_size += try!(self.fields.serialize(&mut self.write));
        try!(self.write.seek(SeekFrom::Start(0)));
        try!((header_offset as u32).serialize(&mut self.write));
        Ok(self.written_size)
    }
}
/// Fans one document out to a per-field `U32FastFieldWriter` for every
/// u32 field that must be serialized as a fast field.
pub struct U32FastFieldsWriter {
    field_writers: Vec<U32FastFieldWriter>,
}
impl U32FastFieldsWriter {
    /// Creates one writer per u32 field flagged as "fast" in the schema.
    pub fn from_schema(schema: &Schema) -> U32FastFieldsWriter {
        let u32_fields: Vec<U32Field> = schema.get_u32_fields()
            .iter()
            .enumerate()
            .filter_map(|(field_id, field_entry)| {
                if field_entry.option.is_fast() {
                    Some(U32Field(field_id as u8))
                } else {
                    None
                }
            })
            .collect();
        U32FastFieldsWriter::new(u32_fields)
    }

    /// Creates writers for the given fields.
    pub fn new(fields: Vec<U32Field>) -> U32FastFieldsWriter {
        let field_writers = fields
            .iter()
            .map(|field| U32FastFieldWriter::new(&field))
            .collect();
        U32FastFieldsWriter {
            field_writers: field_writers,
        }
    }

    /// Records the document's value in every per-field writer.
    pub fn add_document(&mut self, doc: &Document) {
        for field_writer in self.field_writers.iter_mut() {
            field_writer.add_document(doc);
        }
    }

    /// Serializes every field, in declaration order.
    pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> {
        for field_writer in self.field_writers.iter() {
            try!(field_writer.serialize(serializer));
        }
        Ok(())
    }
}
/// Buffers all values of one u32 fast field until serialization
/// (min/max must be known up front to pick the bit width).
pub struct U32FastFieldWriter {
    field: U32Field,
    vals: Vec<u32>,
}
impl U32FastFieldWriter {
    /// Creates an empty writer for `field`.
    pub fn new(field: &U32Field) -> U32FastFieldWriter {
        U32FastFieldWriter {
            field: field.clone(),
            vals: Vec::new(),
        }
    }

    /// Records one value.
    pub fn add_val(&mut self, val: u32) {
        self.vals.push(val);
    }

    /// Records the document's value for this field (0 when absent).
    pub fn add_document(&mut self, doc: &Document) {
        let val = doc.get_u32(&self.field).unwrap_or(0u32);
        self.add_val(val);
    }

    /// Opens a field section sized by the observed (min, max), writes every
    /// buffered value, and closes the section.
    ///
    /// Cleanup: u32 is `Copy`, so we iterate by value (`.cloned()` / `&val`)
    /// instead of the previous `unwrap_or(&zero).clone()` reference dance.
    pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> {
        let min = self.vals.iter().cloned().min().unwrap_or(0);
        let max = self.vals.iter().cloned().max().unwrap_or(min);
        try!(serializer.new_u32_fast_field(self.field.clone(), min, max));
        for &val in self.vals.iter() {
            try!(serializer.add_val(val));
        }
        serializer.close_field()
    }
}
/// Random-access reader over one bit-packed u32 fast field.
pub struct U32FastFieldReader {
    _data: ReadOnlySource,  // keeps the underlying bytes alive for `data_ptr`
    data_ptr: *const u64,   // start of the packed payload, past the 8-byte header
    min_val: u32,           // packed values are deltas from this
    max_val: u32,           // min_val + amplitude
    num_bits: u8,           // bits per packed value
    mask: u32,              // (1 << num_bits) - 1
    num_in_pack: u32,       // values per 64-bit word
    divider: DividerU32,    // fast division by num_in_pack
}
impl U32FastFieldReader {
    /// Smallest value stored in the field.
    pub fn min_val(&self,) -> u32 {
        self.min_val
    }

    /// Largest value stored in the field.
    pub fn max_val(&self,) -> u32 {
        self.max_val
    }

    /// Opens a reader over one field's data, as laid out by the serializer:
    /// min value (u32), amplitude (u32), then 64-bit words of bit-packed
    /// deltas from the min value.
    pub fn open(data: ReadOnlySource) -> io::Result<U32FastFieldReader> {
        let min_val;
        let amplitude;
        {
            let mut cursor = data.cursor();
            min_val = try!(u32::deserialize(&mut cursor));
            amplitude = try!(u32::deserialize(&mut cursor));
        }
        let num_bits = compute_num_bits(amplitude);
        let mask = (1 << num_bits) - 1;
        let num_in_pack = 64u32 / (num_bits as u32);
        // Raw pointer just past the 8-byte header; `_data` below keeps the
        // bytes alive for the pointer's lifetime.
        // NOTE(review): the u64 reads in `get` assume the payload is 8-byte
        // aligned — confirm, especially for the Vec-backed Anonymous source.
        let ptr: *const u8 = &(data.deref()[8 as usize]);
        Ok(U32FastFieldReader {
            _data: data,
            data_ptr: ptr as *const u64,
            min_val: min_val,
            max_val: min_val + amplitude,
            num_bits: num_bits,
            mask: mask,
            num_in_pack: num_in_pack,
            divider: DividerU32::divide_by(num_in_pack),
        })
    }

    /// Random access: locates the 64-bit word holding `doc`'s value,
    /// shifts and masks the delta out, then adds back `min_val`.
    pub fn get(&self, doc: DocId) -> u32 {
        let long_addr = self.divider.divide(doc);
        let ord_within_long = doc - long_addr * self.num_in_pack;
        let bit_shift = (self.num_bits as u32) * ord_within_long;
        let val_unshifted_unmasked: u64 = unsafe { *self.data_ptr.offset(long_addr as isize) };
        let val_shifted = (val_unshifted_unmasked >> bit_shift) as u32;
        return self.min_val + (val_shifted & self.mask);
    }
}
/// Container over all serialized u32 fast fields of a segment.
pub struct U32FastFieldsReader {
    source: ReadOnlySource,
    // (start, end) byte range of each field's section within `source`.
    field_offsets: HashMap<U32Field, (u32, u32)>,
}
impl U32FastFieldsReader {
    /// Opens the multi-field container: reads the header offset from the
    /// leading u32, then the (field, start offset) table it points to, and
    /// derives each field's end offset from its successor's start.
    pub fn open(source: ReadOnlySource) -> io::Result<U32FastFieldsReader> {
        let header_offset;
        let field_offsets: Vec<(U32Field, u32)>;
        {
            let mut cursor = source.cursor();
            header_offset = try!(u32::deserialize(&mut cursor));
            try!(cursor.seek(SeekFrom::Start(header_offset as u64)));
            field_offsets = try!(Vec::deserialize(&mut cursor));
        }
        // Each field ends where the next begins; the last ends at the header.
        let mut end_offsets: Vec<u32> = field_offsets
            .iter()
            .map(|&(_, offset)| offset.clone())
            .collect();
        end_offsets.push(header_offset);
        let mut field_offsets_map: HashMap<U32Field, (u32, u32)> = HashMap::new();
        for (field_start_offsets, stop_offset) in field_offsets.iter().zip(end_offsets.iter().skip(1)) {
            let (field, start_offset) = field_start_offsets.clone();
            field_offsets_map.insert(field.clone(), (start_offset.clone(), stop_offset.clone()));
        }
        Ok(U32FastFieldsReader {
            field_offsets: field_offsets_map,
            source: source,
        })
    }

    /// Returns a single-field reader restricted to the field's byte range.
    pub fn get_field(&self, field: &U32Field) -> io::Result<U32FastFieldReader> {
        match self.field_offsets.get(field) {
            Some(&(start, stop)) => {
                let field_source = self.source.slice(start as usize, stop as usize);
                U32FastFieldReader::open(field_source)
            }
            None => {
                Err(io::Error::new(io::ErrorKind::InvalidInput, "Could not find field, has it been set as a fast field?"))
            }
        }
    }
}
#[cfg(test)]
mod tests {
use super::compute_num_bits;
use super::U32FastFieldsReader;
use super::U32FastFieldsWriter;
use core::schema::U32Field;
use super::FastFieldSerializer;
use schema::U32Field;
use std::path::Path;
use core::directory::WritePtr;
use core::directory::Directory;
use core::schema::Document;
use core::directory::RAMDirectory;
use core::schema::Schema;
use core::schema::FAST_U32;
use core::fastfield::FastFieldSerializer;
use directory::{Directory, WritePtr, RAMDirectory};
use schema::Document;
use schema::Schema;
use schema::FAST_U32;
use test::Bencher;
use test;
use rand::Rng;
use rand::SeedableRng;
use rand::XorShiftRng;
#[test]
fn test_compute_num_bits() {
assert_eq!(compute_num_bits(1), 1u8);
@@ -330,7 +69,7 @@ mod tests {
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 29 as usize);
assert_eq!(source.len(), 26 as usize);
}
{
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
@@ -365,7 +104,7 @@ mod tests {
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 61 as usize);
assert_eq!(source.len(), 58 as usize);
}
{
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();

113
src/fastfield/reader.rs Normal file
View File

@@ -0,0 +1,113 @@
use std::io;
use std::io::{SeekFrom, Seek};
use std::collections::HashMap;
use std::ops::Deref;
use directory::ReadOnlySource;
use fastfield::DividerU32;
use common::BinarySerializable;
use DocId;
use schema::U32Field;
use super::compute_num_bits;
/// Reader for a single bit-packed u32 fast field.
///
/// On-disk layout (written by `FastFieldSerializer`): `min_val: u32`,
/// `amplitude: u32`, then the values stored as `(val - min_val)` on
/// `num_bits` bits each, packed `num_in_pack` per u64 word.
pub struct U32FastFieldReader {
    // Kept alive so that `data_ptr` stays valid.
    _data: ReadOnlySource,
    // Points at the first packed u64 word (byte 8 of the source).
    data_ptr: *const u64,
    min_val: u32,
    max_val: u32,
    num_bits: u8,
    mask: u32,
    num_in_pack: u32,
    divider: DividerU32,
}

impl U32FastFieldReader {
    /// Smallest value stored in the field.
    pub fn min_val(&self,) -> u32 {
        self.min_val
    }

    /// Largest value stored in the field (`min_val + amplitude`).
    pub fn max_val(&self,) -> u32 {
        self.max_val
    }

    /// Opens a reader on a source produced by the fast field serializer.
    /// Reads the `(min_val, amplitude)` header and precomputes the packing
    /// parameters used by `get`.
    pub fn open(data: ReadOnlySource) -> io::Result<U32FastFieldReader> {
        let min_val;
        let amplitude;
        {
            let mut cursor = data.cursor();
            min_val = try!(u32::deserialize(&mut cursor));
            amplitude = try!(u32::deserialize(&mut cursor));
        }
        let num_bits = compute_num_bits(amplitude);
        let mask = (1 << num_bits) - 1;
        // NOTE(review): if compute_num_bits can return 0 (amplitude == 0),
        // this division panics — confirm compute_num_bits(0) >= 1.
        let num_in_pack = 64u32 / (num_bits as u32);
        // Skip the 8-byte header (the two u32s read above).
        let ptr: *const u8 = &(data.deref()[8 as usize]);
        Ok(U32FastFieldReader {
            _data: data,
            data_ptr: ptr as *const u64,
            min_val: min_val,
            max_val: min_val + amplitude,
            num_bits: num_bits,
            mask: mask,
            num_in_pack: num_in_pack,
            divider: DividerU32::divide_by(num_in_pack),
        })
    }

    /// Returns the value stored for `doc`.
    pub fn get(&self, doc: DocId) -> u32 {
        // Word index = doc / num_in_pack (precomputed divider).
        let long_addr = self.divider.divide(doc);
        let ord_within_long = doc - long_addr * self.num_in_pack;
        let bit_shift = (self.num_bits as u32) * ord_within_long;
        // SAFETY-NOTE(review): assumes the source holds at least
        // (long_addr + 1) u64 words — confirm callers never pass a doc id
        // >= the number of stored values.
        let val_unshifted_unmasked: u64 = unsafe { *self.data_ptr.offset(long_addr as isize) };
        let val_shifted = (val_unshifted_unmasked >> bit_shift) as u32;
        return self.min_val + (val_shifted & self.mask);
    }
}
/// Reader for the whole fast-field file: yields one `U32FastFieldReader`
/// per stored field.
pub struct U32FastFieldsReader {
    source: ReadOnlySource,
    // Maps each field to the (start, stop) byte range of its data.
    field_offsets: HashMap<U32Field, (u32, u32)>,
}

impl U32FastFieldsReader {
    /// Opens the fast-field file.
    ///
    /// File layout: a leading u32 giving the offset of the footer; the
    /// footer is a serialized `Vec<(U32Field, start_offset)>`. Each field's
    /// data runs from its start offset to the next field's start offset
    /// (the last field ends where the footer begins).
    pub fn open(source: ReadOnlySource) -> io::Result<U32FastFieldsReader> {
        let header_offset;
        let field_offsets: Vec<(U32Field, u32)>;
        {
            let mut cursor = source.cursor();
            header_offset = try!(u32::deserialize(&mut cursor));
            try!(cursor.seek(SeekFrom::Start(header_offset as u64)));
            field_offsets = try!(Vec::deserialize(&mut cursor));
        }
        // end_offsets[i + 1] is the stop offset of field i: the start of
        // the next field, or the footer offset for the last field.
        let mut end_offsets: Vec<u32> = field_offsets
            .iter()
            .map(|&(_, offset)| offset.clone())
            .collect();
        end_offsets.push(header_offset);
        let mut field_offsets_map: HashMap<U32Field, (u32, u32)> = HashMap::new();
        for (field_start_offsets, stop_offset) in field_offsets.iter().zip(end_offsets.iter().skip(1)) {
            let (field, start_offset) = field_start_offsets.clone();
            field_offsets_map.insert(field.clone(), (start_offset.clone(), stop_offset.clone()));
        }
        Ok(U32FastFieldsReader {
            field_offsets: field_offsets_map,
            source: source,
        })
    }

    /// Returns a reader over the byte slice holding `field`'s data.
    ///
    /// Fails with `InvalidInput` if the field was not serialized
    /// (i.e. not declared as a fast field).
    pub fn get_field(&self, field: &U32Field) -> io::Result<U32FastFieldReader> {
        match self.field_offsets.get(field) {
            Some(&(start, stop)) => {
                let field_source = self.source.slice(start as usize, stop as usize);
                U32FastFieldReader::open(field_source)
            }
            None => {
                Err(io::Error::new(io::ErrorKind::InvalidInput, "Could not find field, has it been set as a fast field?"))
            }
        }
    }
}

View File

@@ -0,0 +1,87 @@
use common::BinarySerializable;
use directory::WritePtr;
use schema::U32Field;
use std::io;
use std::io::{SeekFrom, Write};
use super::compute_num_bits;
/// Serializer for the fast-field file.
///
/// File layout:
/// - a leading u32, patched on `close` with the footer's offset,
/// - per field: `min_value: u32`, `amplitude: u32`, then the values
///   bit-packed into u64 words,
/// - a footer: serialized `Vec<(U32Field, start_offset)>`.
pub struct FastFieldSerializer {
    write: WritePtr,
    // Total bytes written so far (also the current file offset).
    written_size: usize,
    // (field, start offset) pairs, serialized as the footer on close.
    fields: Vec<(U32Field, u32)>,
    // Bits per value for the field currently open.
    num_bits: u8,
    min_value: u32,
    // True between new_u32_fast_field() and close_field().
    field_open: bool,
    // Number of bits already used in `mini_buffer`.
    mini_buffer_written: usize,
    // Current 64-bit word being filled with packed values.
    mini_buffer: u64,
}

impl FastFieldSerializer {
    /// Creates the serializer and reserves 4 bytes at the start of the
    /// file for the footer pointer, patched in `close`.
    pub fn new(mut write: WritePtr) -> io::Result<FastFieldSerializer> {
        // just making room for the pointer to header.
        let written_size: usize = try!(0u32.serialize(&mut write));
        Ok(FastFieldSerializer {
            write: write,
            written_size: written_size,
            fields: Vec::new(),
            num_bits: 0u8,
            field_open: false,
            mini_buffer_written: 0,
            mini_buffer: 0,
            min_value: 0,
        })
    }

    /// Starts a new field section: records its start offset, writes the
    /// `(min_value, amplitude)` header and derives the bit width.
    pub fn new_u32_fast_field(&mut self, field: U32Field, min_value: u32, max_value: u32) -> io::Result<()> {
        if self.field_open {
            return Err(io::Error::new(io::ErrorKind::Other, "Previous field not closed"));
        }
        self.min_value = min_value;
        self.field_open = true;
        self.fields.push((field, self.written_size as u32));
        let write: &mut Write = &mut self.write;
        self.written_size += try!(min_value.serialize(write));
        let amplitude = max_value - min_value;
        self.written_size += try!(amplitude.serialize(write));
        self.num_bits = compute_num_bits(amplitude);
        Ok(())
    }

    /// Appends one value to the open field, packing it into the current
    /// 64-bit word; flushes the word first when the value would not fit.
    ///
    /// NOTE(review): assumes `val >= min_value` (the offset subtraction
    /// would underflow otherwise) and that a field is open — confirm
    /// callers uphold both.
    pub fn add_val(&mut self, val: u32) -> io::Result<()> {
        let write: &mut Write = &mut self.write;
        if self.mini_buffer_written + (self.num_bits as usize) > 64 {
            self.written_size += try!(self.mini_buffer.serialize(write));
            self.mini_buffer = 0;
            self.mini_buffer_written = 0;
        }
        // Values are stored relative to the field's minimum.
        self.mini_buffer |= ((val - self.min_value) as u64) << self.mini_buffer_written;
        self.mini_buffer_written += self.num_bits as usize;
        Ok(())
    }

    /// Closes the open field, flushing the last partially-filled word.
    pub fn close_field(&mut self,) -> io::Result<()> {
        if !self.field_open {
            return Err(io::Error::new(io::ErrorKind::Other, "Current field is already closed"));
        }
        self.field_open = false;
        if self.mini_buffer_written > 0 {
            self.mini_buffer_written = 0;
            self.written_size += try!(self.mini_buffer.serialize(&mut self.write));
        }
        self.mini_buffer = 0;
        Ok(())
    }

    /// Writes the footer (field -> start-offset table), patches the
    /// leading u32 with the footer's offset, and returns the total
    /// number of bytes written.
    pub fn close(mut self,) -> io::Result<usize> {
        if self.field_open {
            return Err(io::Error::new(io::ErrorKind::Other, "Last field not closed"));
        }
        let header_offset: usize = self.written_size;
        self.written_size += try!(self.fields.serialize(&mut self.write));
        try!(self.write.seek(SeekFrom::Start(0)));
        // Patching the reserved 4 bytes does not add to written_size.
        try!((header_offset as u32).serialize(&mut self.write));
        Ok(self.written_size)
    }
}

76
src/fastfield/writer.rs Normal file
View File

@@ -0,0 +1,76 @@
use schema::{Schema, U32Field, Document};
use fastfield::FastFieldSerializer;
use std::io;
/// Holds one `U32FastFieldWriter` per fast field of the schema.
pub struct U32FastFieldsWriter {
    field_writers: Vec<U32FastFieldWriter>,
}

impl U32FastFieldsWriter {
    /// Builds the writer set from a schema, keeping only the u32 fields
    /// flagged as "fast".
    pub fn from_schema(schema: &Schema) -> U32FastFieldsWriter {
        let fast_fields: Vec<U32Field> = schema
            .get_u32_fields()
            .iter()
            .enumerate()
            .filter_map(|(field_id, field_entry)| {
                if field_entry.option.is_fast() {
                    Some(U32Field(field_id as u8))
                } else {
                    None
                }
            })
            .collect();
        U32FastFieldsWriter::new(fast_fields)
    }

    /// Creates one per-field writer for each of the given fields.
    pub fn new(fields: Vec<U32Field>) -> U32FastFieldsWriter {
        let mut field_writers = Vec::with_capacity(fields.len());
        for field in fields.iter() {
            field_writers.push(U32FastFieldWriter::new(field));
        }
        U32FastFieldsWriter {
            field_writers: field_writers,
        }
    }

    /// Records the document's value in every per-field writer.
    pub fn add_document(&mut self, doc: &Document) {
        for field_writer in &mut self.field_writers {
            field_writer.add_document(doc);
        }
    }

    /// Serializes every field, one after the other.
    pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> {
        for field_writer in &self.field_writers {
            try!(field_writer.serialize(serializer));
        }
        Ok(())
    }
}
/// Buffers the values of a single u32 fast field until serialization.
pub struct U32FastFieldWriter {
    field: U32Field,
    vals: Vec<u32>,
}

impl U32FastFieldWriter {
    /// Creates a writer for the given field.
    pub fn new(field: &U32Field) -> U32FastFieldWriter {
        U32FastFieldWriter {
            field: field.clone(),
            vals: Vec::new(),
        }
    }

    /// Appends a raw value.
    pub fn add_val(&mut self, val: u32) {
        self.vals.push(val);
    }

    /// Records the document's value for this field.
    /// Documents that do not define the field are recorded as 0.
    pub fn add_document(&mut self, doc: &Document) {
        let val = doc.get_u32(&self.field).unwrap_or(0u32);
        self.add_val(val);
    }

    /// Writes the buffered values as one field section of the
    /// fast-field file.
    pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> {
        // Idiomatic min/max over owned u32s; an empty writer serializes
        // as (min, max) = (0, 0), exactly like the original sentinel code.
        let min = self.vals.iter().cloned().min().unwrap_or(0);
        let max = self.vals.iter().cloned().max().unwrap_or(min);
        try!(serializer.new_u32_fast_field(self.field.clone(), min, max));
        for &val in self.vals.iter() {
            try!(serializer.add_val(val));
        }
        serializer.close_field()
    }
}

View File

@@ -19,7 +19,6 @@ extern crate atomicwrites;
extern crate tempdir;
extern crate bincode;
extern crate time;
extern crate serde;
extern crate libc;
extern crate lz4;
extern crate uuid;
@@ -29,20 +28,31 @@ extern crate num_cpus;
#[cfg(test)] extern crate rand;
mod core;
mod datastruct;
mod postings;
mod directory;
mod compression;
mod fastfield;
mod store;
mod common;
pub mod analyzer;
pub mod collector;
pub use core::analyzer;
pub use core::directory::Directory;
pub mod schema;
pub use directory::Directory;
pub use core::searcher::Searcher;
pub use core::index::Index;
pub use core::schema;
pub use core::schema::Term;
pub use core::schema::Document;
pub use core::collector;
pub use core::schema::DocId;
pub use schema::Term;
pub use schema::Document;
pub use core::reader::SegmentReader;
pub use core::searcher::SegmentLocalId;
pub use core::timer::TimerTree;
pub use self::common::TimerTree;
/// u32 identifying a document within a segment.
/// Document gets their doc id assigned incrementally,
/// as they are added in the segment.
pub type DocId = u32;
#[cfg(test)]
mod tests {

132
src/postings/mod.rs Normal file
View File

@@ -0,0 +1,132 @@
// pub mod postings;
// pub mod schema;
// pub mod directory;
// pub mod writer;
// pub mod analyzer;
// pub mod reader;
// pub mod codec;
// pub mod searcher;
// pub mod collector;
// pub mod serialize;
// pub mod store;
// pub mod simdcompression;
// pub mod fstmap;
// pub mod index;
// pub mod fastfield;
// pub mod fastdivide;
// pub mod merger;
// pub mod timer;
// use std::error;
// use std::io;
// pub fn convert_to_ioerror<E: 'static + error::Error + Send + Sync>(err: E) -> io::Error {
// io::Error::new(
// io::ErrorKind::InvalidData,
// err
// )
// }
mod serializer;
mod writer;
mod term_info;
use DocId;
pub use self::serializer::PostingsSerializer;
pub use self::writer::PostingsWriter;
pub use self::term_info::TermInfo;
/// A posting list: an iterator over the `DocId`s associated with a term.
pub trait Postings: Iterator<Item=DocId> {
    /// Advances the iterator so that the next value returned is
    /// greater than or equal to `target`, and returns that value.
    /// Returns `None` if the posting list is exhausted first.
    fn skip_next(&mut self, target: DocId) -> Option<DocId>;
}
#[cfg(test)]
mod tests {
use super::*;
use DocId;
#[derive(Debug)]
pub struct VecPostings {
doc_ids: Vec<DocId>,
cursor: usize,
}
impl VecPostings {
pub fn new(vals: Vec<DocId>) -> VecPostings {
VecPostings {
doc_ids: vals,
cursor: 0,
}
}
}
impl Postings for VecPostings {
// after skipping position
// the iterator in such a way that the
// next call to next() will return a
// value greater or equal to target.
fn skip_next(&mut self, target: DocId) -> Option<DocId> {
loop {
match Iterator::next(self) {
Some(val) if val >= target => {
return Some(val);
},
None => {
return None;
},
_ => {}
}
}
}
}
impl Iterator for VecPostings {
type Item = DocId;
fn next(&mut self,) -> Option<DocId> {
if self.cursor >= self.doc_ids.len() {
None
}
else {
self.cursor += 1;
Some(self.doc_ids[self.cursor - 1])
}
}
}
// use test::Bencher;
// #[test]
// fn test_intersection() {
// {
// let left = VecPostings::new(vec!(1, 3, 9));
// let right = VecPostings::new(vec!(3, 4, 9, 18));
// let inter = IntersectionPostings::from_postings(vec!(left, right));
// let vals: Vec<DocId> = inter.collect();
// assert_eq!(vals, vec!(3, 9));
// }
// {
// let a = VecPostings::new(vec!(1, 3, 9));
// let b = VecPostings::new(vec!(3, 4, 9, 18));
// let c = VecPostings::new(vec!(1, 5, 9, 111));
// let inter = IntersectionPostings::from_postings(vec!(a, b, c));
// let vals: Vec<DocId> = inter.collect();
// assert_eq!(vals, vec!(9));
// }
// }
//
// #[bench]
// fn bench_single_intersection(b: &mut Bencher) {
// b.iter(|| {
// let docs = VecPostings::new((0..1_000_000).collect());
// let intersection = IntersectionPostings::from_postings(vec!(docs));
// intersection.count()
// });
// }
}

View File

@@ -0,0 +1,74 @@
use datastruct::FstMapBuilder;
use super::TermInfo;
use schema::Term;
use directory::WritePtr;
use compression::S4BP128Encoder;
use DocId;
use core::index::Segment;
use std::io;
use core::index::SegmentComponent;
use common::BinarySerializable;
/// Writes the postings of a segment: a term dictionary (fst) mapping
/// each term to its `TermInfo`, plus a postings file holding the
/// encoded doc ids.
pub struct PostingsSerializer {
    terms_fst_builder: FstMapBuilder<WritePtr, TermInfo>, // TODO find an alternative to work around the "move"
    postings_write: WritePtr,
    positions_write: WritePtr,
    written_bytes_postings: usize,
    // NOTE(review): never incremented — positions are not serialized yet.
    written_bytes_positions: usize,
    encoder: S4BP128Encoder,
    // Doc ids buffered for the term currently open.
    doc_ids: Vec<DocId>,
}

impl PostingsSerializer {
    /// Opens the TERMS, POSTINGS and POSITIONS files of the segment.
    pub fn open(segment: &Segment) -> io::Result<PostingsSerializer> {
        let terms_write = try!(segment.open_write(SegmentComponent::TERMS));
        let terms_fst_builder = try!(FstMapBuilder::new(terms_write));
        let postings_write = try!(segment.open_write(SegmentComponent::POSTINGS));
        let positions_write = try!(segment.open_write(SegmentComponent::POSITIONS));
        Ok(PostingsSerializer {
            terms_fst_builder: terms_fst_builder,
            postings_write: postings_write,
            positions_write: positions_write,
            written_bytes_postings: 0,
            written_bytes_positions: 0,
            encoder: S4BP128Encoder::new(),
            doc_ids: Vec::new(),
        })
    }

    /// Starts a new term: flushes the previous term's postings, then
    /// registers the term in the fst with the current postings offset.
    /// Terms must be fed in increasing order (fst requirement upheld by
    /// the caller).
    pub fn new_term(&mut self, term: &Term, doc_freq: DocId) -> io::Result<()> {
        try!(self.close_term());
        self.doc_ids.clear();
        let term_info = TermInfo {
            doc_freq: doc_freq,
            postings_offset: self.written_bytes_postings as u32,
        };
        self.terms_fst_builder
            .insert(term.as_slice(), &term_info)
    }

    /// Flushes the buffered doc ids of the current term:
    /// a u32 word count followed by the encoded u32 words.
    /// `doc_ids` is cleared by `new_term`, not here.
    pub fn close_term(&mut self,) -> io::Result<()> {
        if !self.doc_ids.is_empty() {
            let docs_data = self.encoder.encode_sorted(&self.doc_ids);
            self.written_bytes_postings += try!((docs_data.len() as u32).serialize(&mut self.postings_write));
            for num in docs_data {
                self.written_bytes_postings += try!(num.serialize(&mut self.postings_write));
            }
        }
        Ok(())
    }

    /// Buffers one document of the current term.
    /// NOTE(review): the `positions` argument is currently ignored —
    /// position serialization is not implemented; confirm intended.
    pub fn write_doc(&mut self, doc_id: DocId, positions: Option<&[u32]>) -> io::Result<()> {
        self.doc_ids.push(doc_id);
        Ok(())
    }

    /// Flushes the last term, finishes the fst and the postings file.
    pub fn close(mut self,) -> io::Result<()> {
        try!(self.close_term());
        try!(self.terms_fst_builder.finish());
        try!(self.postings_write.flush());
        Ok(())
    }
}

26
src/postings/term_info.rs Normal file
View File

@@ -0,0 +1,26 @@
use common::BinarySerializable;
use std::io;
/// Postings metadata attached to a term in the term dictionary:
/// the number of documents containing the term, and the byte offset
/// of its posting list within the postings file.
#[derive(Debug,Ord,PartialOrd,Eq,PartialEq,Clone)]
pub struct TermInfo {
    pub doc_freq: u32,
    pub postings_offset: u32,
}

impl BinarySerializable for TermInfo {
    /// Writes `doc_freq` then `postings_offset`; returns the byte count.
    fn serialize(&self, writer: &mut io::Write) -> io::Result<usize> {
        Ok(
            try!(self.doc_freq.serialize(writer)) +
            try!(self.postings_offset.serialize(writer))
        )
    }

    /// Reads the fields back in the same order as `serialize`.
    fn deserialize(reader: &mut io::Read) -> io::Result<Self> {
        let doc_freq = try!(u32::deserialize(reader));
        let offset = try!(u32::deserialize(reader));
        Ok(TermInfo {
            doc_freq: doc_freq,
            postings_offset: offset,
        })
    }
}

141
src/postings/writer.rs Normal file
View File

@@ -0,0 +1,141 @@
use DocId;
use std::collections::BTreeMap;
use schema::Term;
use postings::PostingsSerializer;
use std::io;
/// Sink for a stream of u32 values (term frequencies, position deltas).
pub trait U32sRecorder {
    /// Creates an empty recorder.
    fn new() -> Self;
    /// Accepts one value.
    fn record(&mut self, val: u32);
}
/// Recorder that keeps every value in a `Vec`.
pub struct VecRecorder(Vec<u32>);

impl U32sRecorder for VecRecorder {
    fn new() -> VecRecorder {
        VecRecorder(Vec::new())
    }
    fn record(&mut self, val: u32) {
        self.0.push(val);
    }
}
/// Recorder that discards every value, for when frequencies or
/// positions do not need to be kept.
pub struct ObliviousRecorder;

impl U32sRecorder for ObliviousRecorder {
    fn new() -> ObliviousRecorder {
        ObliviousRecorder
    }
    fn record(&mut self, _: u32) {
    }
}
/// Accumulates the posting list of a single term: doc ids, per-document
/// term frequencies, and delta-encoded positions.
///
/// The recorder type parameters decide whether frequencies / positions
/// are actually kept (`VecRecorder`) or thrown away (`ObliviousRecorder`).
struct TermPostingsWriter<TermFreqsRec: U32sRecorder, PositionsRec: U32sRecorder> {
    doc_ids: Vec<DocId>,
    term_freqs: TermFreqsRec,
    positions: PositionsRec,
    // Last position recorded for the current doc (positions are deltas).
    current_position: u32,
    // Term frequency of the document currently being filled.
    current_freq: u32,
}

impl<TermFreqsRec: U32sRecorder, PositionsRec: U32sRecorder> TermPostingsWriter<TermFreqsRec, PositionsRec> {
    pub fn new() -> TermPostingsWriter<TermFreqsRec, PositionsRec> {
        TermPostingsWriter {
            doc_ids: Vec::new(),
            term_freqs: TermFreqsRec::new(),
            positions: PositionsRec::new(),
            current_position: 0u32,
            current_freq: 0u32,
        }
    }

    /// Flushes the current document: records its term frequency and
    /// resets the per-document state.
    fn close_doc(&mut self,) {
        self.term_freqs.record(self.current_freq);
        self.current_freq = 0;
        self.current_position = 0;
    }

    /// Closes the last pending document, if any.
    fn close(&mut self,) {
        if self.current_freq > 0 {
            self.close_doc();
        }
    }

    /// True if `doc` differs from the last doc seen (or if none was seen).
    fn is_new_doc(&self, doc: &DocId) -> bool {
        match self.doc_ids.last() {
            Some(&last_doc) => last_doc != *doc,
            None => true,
        }
    }

    /// Number of documents containing the term.
    pub fn doc_freq(&self) -> u32 {
        self.doc_ids.len() as u32
    }

    /// Records one occurrence of the term in document `doc` at position
    /// `pos`. Occurrences must be fed in (doc, position) order.
    pub fn suscribe(&mut self, doc: DocId, pos: u32) {
        if self.is_new_doc(&doc) {
            // First time we meet this term for this document: close the
            // previous document and record its term frequency.
            // Bug fix: only close when a previous document exists — the
            // original unconditionally recorded a spurious frequency of 0
            // before the very first document.
            if !self.doc_ids.is_empty() {
                self.close_doc();
            }
            self.doc_ids.push(doc);
        }
        self.current_freq += 1;
        // Positions are recorded as deltas from the previous position.
        self.positions.record(pos - self.current_position);
        self.current_position = pos;
    }
}
/// In-memory inverted index under construction: maps each term to the
/// `TermPostingsWriter` accumulating its posting list.
pub struct PostingsWriter {
    // Indexed by the per-term ids handed out in term_index.
    postings: Vec<TermPostingsWriter<ObliviousRecorder, ObliviousRecorder>>,
    // BTreeMap keeps terms sorted, as required by the fst builder
    // at serialization time.
    term_index: BTreeMap<Term, usize>,
}

impl PostingsWriter {
    /// Creates an empty writer.
    pub fn new() -> PostingsWriter {
        PostingsWriter {
            postings: Vec::new(),
            term_index: BTreeMap::new(),
        }
    }

    /// Records one occurrence of `term` in document `doc` at position `pos`.
    pub fn suscribe(&mut self, doc: DocId, pos: u32, term: Term) {
        self.get_term_postings(term).suscribe(doc, pos);
    }

    /// Returns the posting writer for `term`, creating it on first use.
    fn get_term_postings(&mut self, term: Term) -> &mut TermPostingsWriter<ObliviousRecorder, ObliviousRecorder> {
        // Single BTreeMap traversal via the entry API; the original did a
        // get() followed by a second lookup + insert() on the miss path.
        let next_id = self.term_index.len();
        let unord_id = *self.term_index.entry(term).or_insert(next_id);
        if unord_id == next_id {
            // Fresh term: allocate its posting writer.
            self.postings.push(TermPostingsWriter::new());
        }
        &mut self.postings[unord_id]
    }

    /// Streams every term (in sorted order) and its doc ids into the
    /// serializer.
    pub fn serialize(&self, serializer: &mut PostingsSerializer) -> io::Result<()> {
        for (term, postings_id) in self.term_index.iter() {
            let term_postings_writer = &self.postings[*postings_id];
            let term_docfreq = term_postings_writer.doc_freq();
            try!(serializer.new_term(term, term_docfreq));
            for &doc in term_postings_writer.doc_ids.iter() {
                try!(serializer.write_doc(doc, None));
            }
        }
        Ok(())
    }
}

97
src/schema/document.rs Normal file
View File

@@ -0,0 +1,97 @@
use std::slice;
use super::*;
///
/// Document are really just a list of field values.
///
/// # Examples
///
/// ```
/// use tantivy::schema::Schema;
/// use tantivy::schema::TEXT;
///
/// let mut schema = Schema::new();
/// schema.add_text_field("body", &TEXT);
/// let field_text = schema.text_field("body");
/// ```
///
#[derive(Debug)]
pub struct Document {
    pub text_field_values: Vec<TextFieldValue>,
    pub u32_field_values: Vec<U32FieldValue>,
}

impl Document {
    /// Creates an empty document.
    pub fn new() -> Document {
        Document {
            text_field_values: Vec::new(),
            u32_field_values: Vec::new(),
        }
    }

    /// Builds a document directly from its field values.
    pub fn from(text_field_values: Vec<TextFieldValue>,
                u32_field_values: Vec<U32FieldValue>) -> Document {
        Document {
            text_field_values: text_field_values,
            u32_field_values: u32_field_values,
        }
    }

    /// Number of text field values (u32 values are not counted).
    pub fn len(&self,) -> usize {
        self.text_field_values.len()
    }

    /// Appends a text value for the given field.
    pub fn set(&mut self, field: &TextField, text: &str) {
        let field_value = TextFieldValue {
            field: field.clone(),
            text: text.to_string(),
        };
        self.add(field_value);
    }

    /// Appends a u32 value for the given field.
    pub fn set_u32(&mut self, field: &U32Field, value: u32) {
        let field_value = U32FieldValue {
            field: field.clone(),
            value: value,
        };
        self.u32_field_values.push(field_value);
    }

    /// Appends an already-built text field value.
    pub fn add(&mut self, field_value: TextFieldValue) {
        self.text_field_values.push(field_value);
    }

    /// Iterates over every text field value.
    pub fn text_fields<'a>(&'a self,) -> slice::Iter<'a, TextFieldValue> {
        self.text_field_values.iter()
    }

    /// Iterates over every u32 field value.
    pub fn u32_fields<'a>(&'a self,) -> slice::Iter<'a, U32FieldValue> {
        self.u32_field_values.iter()
    }

    /// First u32 value recorded for `field`, if any.
    pub fn get_u32(&self, field: &U32Field) -> Option<u32> {
        self.u32_field_values
            .iter()
            .find(|field_value| field_value.field == *field)
            .map(|field_value| field_value.value)
    }

    /// All text values recorded for `field`.
    pub fn get_texts<'a>(&'a self, field: &TextField) -> Vec<&'a String> {
        let mut texts = Vec::new();
        for field_value in self.text_field_values.iter() {
            if field_value.field == *field {
                texts.push(&field_value.text);
            }
        }
        texts
    }

    /// First text value recorded for `field`, if any.
    pub fn get_first_text<'a>(&'a self, field: &TextField) -> Option<&'a String> {
        self.text_field_values
            .iter()
            .find(|field_value| field_value.field == *field)
            .map(|field_value| &field_value.text)
    }
}

21
src/schema/mod.rs Normal file
View File

@@ -0,0 +1,21 @@
mod schema;
mod term;
mod document;
mod text_field;
mod u32_field;
pub use self::schema::Schema;
pub use self::document::Document;
pub use self::term::Term;
pub use self::text_field::TextField;
pub use self::text_field::TextFieldValue;
pub use self::text_field::TextOptions;
pub use self::text_field::FAST;
pub use self::text_field::TEXT;
pub use self::text_field::STORED;
pub use self::u32_field::U32Field;
pub use self::u32_field::U32FieldValue;
pub use self::u32_field::U32Options;
pub use self::u32_field::FAST_U32;

180
src/schema/schema.rs Normal file
View File

@@ -0,0 +1,180 @@
use std::collections::HashMap;
use rustc_serialize::Decodable;
use rustc_serialize::Encodable;
use rustc_serialize::Decoder;
use rustc_serialize::Encoder;
use std::borrow::Borrow;
use super::*;
#[derive(Clone, Debug, RustcDecodable, RustcEncodable)]
pub struct TextFieldEntry {
name: String,
option: TextOptions,
}
#[derive(Clone, Debug, RustcDecodable, RustcEncodable)]
pub struct U32FieldEntry {
pub name: String,
pub option: U32Options,
}
/// Tantivy has a very strict schema.
/// You need to specify in advance, whether a field is indexed or not,
/// stored or not, and RAM-based or not.
///
/// This is done by creating a schema object, and
/// setting up the fields one by one.
/// It is for the moment impossible to remove fields.
///
/// # Examples
///
/// ```
/// use tantivy::schema::{Schema, TextOptions};
///
/// fn create_schema() -> Schema {
/// let mut schema = Schema::new();
/// let str_fieldtype = TextOptions::new();
/// let text_fieldtype = TextOptions::new().set_tokenized_indexed();
/// let id_field = schema.add_text_field("id", &str_fieldtype);
/// let url_field = schema.add_text_field("url", &str_fieldtype);
/// let body_field = schema.add_text_field("body", &text_fieldtype);
/// let id_field = schema.add_text_field("id", &str_fieldtype);
/// let url_field = schema.add_text_field("url", &str_fieldtype);
/// let title_field = schema.add_text_field("title", &text_fieldtype);
/// let body_field = schema.add_text_field("body", &text_fieldtype);
/// schema
/// }
///
/// let schema = create_schema();
#[derive(Clone, Debug)]
pub struct Schema {
text_fields: Vec<TextFieldEntry>,
text_fields_map: HashMap<String, TextField>, // transient
u32_fields: Vec<U32FieldEntry>,
u32_fields_map: HashMap<String, U32Field>, // transient
}
impl Decodable for Schema {
    /// Rebuilds a schema from a serialized sequence of `TextFieldEntry`.
    ///
    /// NOTE(review): only text fields are decoded here (and the
    /// `Encodable` impl likewise emits only text fields), so u32 fields
    /// do not survive a serialization round trip — confirm intended.
    fn decode<D: Decoder>(d: &mut D) -> Result <Self, D::Error> {
        let mut schema = Schema::new();
        try!(d.read_seq(|d, num_fields| {
            for _ in 0..num_fields {
                let field_entry = try!(TextFieldEntry::decode(d));
                let field_options: &TextOptions = &field_entry.option;
                // Going through add_text_field rebuilds the transient
                // name -> field map as a side effect.
                schema.add_text_field(&field_entry.name, field_options);
            }
            Ok(())
        }));
        Ok(schema)
    }
}
impl Encodable for Schema {
    /// Serializes the schema as a sequence of `TextFieldEntry`.
    ///
    /// NOTE(review): u32 field entries are not emitted, mirroring the
    /// `Decodable` impl above — confirm intended.
    fn encode<S: Encoder>(&self, s: &mut S) -> Result<(), S::Error> {
        try!(s.emit_seq(self.text_fields.len(),
            |mut e| {
                for (ord, field) in self.text_fields.iter().enumerate() {
                    try!(e.emit_seq_elt(ord, |e| field.encode(e)));
                }
                Ok(())
            }));
        Ok(())
    }
}
impl Schema {
    /// Creates a new, empty schema.
    pub fn new() -> Schema {
        Schema {
            text_fields: Vec::new(),
            text_fields_map: HashMap::new(),
            u32_fields: Vec::new(),
            u32_fields_map: HashMap::new(),
        }
    }

    /// Returns all u32 field entries, ordered by field id.
    pub fn get_u32_fields(&self,) -> &Vec<U32FieldEntry> {
        &self.u32_fields
    }

    /// Given a name, returns the field handle together with its `TextOptions`.
    pub fn get_text_field(&self, field_name: &str) -> Option<(TextField, TextOptions)> {
        match self.text_fields_map.get(field_name) {
            Some(&TextField(field_id)) => {
                let field_options = self.text_fields[field_id as usize].option.clone();
                Some((TextField(field_id), field_options))
            }
            None => None,
        }
    }

    /// Given a name, returns the field handle together with its `U32Options`.
    pub fn get_u32_field(&self, field_name: &str) -> Option<(U32Field, U32Options)> {
        match self.u32_fields_map.get(field_name) {
            Some(&U32Field(field_id)) => {
                let u32_field_options = self.u32_fields[field_id as usize].option.clone();
                Some((U32Field(field_id), u32_field_options))
            }
            None => None,
        }
    }

    /// Returns the handle of the text field with the given name.
    ///
    /// # Panics
    /// Panics if the field name does not exist. It is meant as a helper
    /// for users who created and control the content of their schema.
    /// If panicking is not an option for you, use `get_text_field`.
    pub fn text_field(&self, fieldname: &str) -> TextField {
        self.text_fields_map.get(fieldname).cloned().unwrap()
    }

    /// Returns the handle of the u32 field with the given name.
    ///
    /// # Panics
    /// Panics if the field name does not exist.
    pub fn u32_field(&self, fieldname: &str) -> U32Field {
        self.u32_fields_map.get(fieldname).cloned().unwrap()
    }

    /// Returns the options associated with a text field handle.
    pub fn text_field_options(&self, field: &TextField) -> TextOptions {
        let &TextField(field_id) = field;
        self.text_fields[field_id as usize].option.clone()
    }

    /// Returns the options associated with a u32 field handle.
    pub fn u32_field_options(&self, field: &U32Field) -> U32Options {
        let &U32Field(field_id) = field;
        self.u32_fields[field_id as usize].option.clone()
    }

    /// Registers a new text field and returns its handle.
    /// The field id is the entry's index in `text_fields`.
    pub fn add_text_field<RefTextOptions: Borrow<TextOptions>>(&mut self, field_name_str: &str, field_options: RefTextOptions) -> TextField {
        // TODO case if field already exists
        let field = TextField(self.text_fields.len() as u8);
        let field_name = field_name_str.to_string();
        let entry = TextFieldEntry {
            name: field_name.clone(),
            option: field_options.borrow().clone(),
        };
        self.text_fields.push(entry);
        self.text_fields_map.insert(field_name, field.clone());
        field
    }

    /// Registers a new u32 field and returns its handle.
    /// The field id is the entry's index in `u32_fields`.
    pub fn add_u32_field<RefU32Options: Borrow<U32Options>>(&mut self, field_name_str: &str, field_options: RefU32Options) -> U32Field {
        // TODO case if field already exists
        let field = U32Field(self.u32_fields.len() as u8);
        let field_name = field_name_str.to_string();
        let entry = U32FieldEntry {
            name: field_name.clone(),
            option: field_options.borrow().clone(),
        };
        self.u32_fields.push(entry);
        self.u32_fields_map.insert(field_name, field.clone());
        field
    }
}

54
src/schema/term.rs Normal file
View File

@@ -0,0 +1,54 @@
use std::io::Write;
use std::fmt;
use common::BinarySerializable;
use super::U32Field;
use super::TextField;
/// A `Term` is the unit of search: a one-byte field id prefix followed
/// by the serialized value searched in that field.
#[derive(Clone, PartialEq, PartialOrd, Ord, Eq, Hash)]
pub struct Term {
    data: Vec<u8>,
}
impl Term {
    /// Builds the term for value `val` of u32 field `field`.
    ///
    /// Layout: one byte `128 | field_id` (the high bit distinguishes u32
    /// fields from text fields), followed by the serialized u32 value.
    pub fn from_field_u32(field: &U32Field, val: u32) -> Term {
        let U32Field(field_idx) = *field;
        // Vec::with_capacity already yields an empty buffer; the original
        // performed a redundant clear() here.
        let mut buffer = Vec::with_capacity(1 + 4);
        buffer.push(128 | field_idx);
        val.serialize(&mut buffer).unwrap();
        Term {
            data: buffer,
        }
    }

    /// Builds the term for token `text` of text field `field`.
    ///
    /// Layout: one byte for the field id, followed by the UTF-8 bytes
    /// of the token.
    pub fn from_field_text(field: &TextField, text: &str) -> Term {
        let TextField(field_idx) = *field;
        // Same redundant clear() removed here.
        let mut buffer = Vec::with_capacity(1 + text.len());
        buffer.push(field_idx);
        buffer.extend(text.as_bytes());
        Term {
            data: buffer,
        }
    }

    /// Builds a term directly from its raw byte representation.
    pub fn from(data: &[u8]) -> Term {
        Term {
            data: Vec::from(data),
        }
    }

    /// Raw bytes of the term (field id prefix + value).
    pub fn as_slice(&self,)->&[u8] {
        &self.data
    }
}
impl fmt::Debug for Term {
    // Only the leading byte (the field id) is printed; the value bytes
    // are left out of the debug representation.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Term({})", self.data[0])
    }
}

167
src/schema/text_field.rs Normal file
View File

@@ -0,0 +1,167 @@
use std::io::Write;
use std::io;
use std::io::Read;
use common::BinarySerializable;
use rustc_serialize::Encodable;
use rustc_serialize::Decoder;
use rustc_serialize::Encoder;
use std::ops::BitOr;
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
pub struct TextField(pub u8);
#[derive(Clone,Debug,PartialEq,Eq, RustcDecodable, RustcEncodable)]
pub struct TextOptions {
tokenized_indexed: bool,
stored: bool,
fast: bool,
}
impl TextOptions {
pub fn is_tokenized_indexed(&self,) -> bool {
self.tokenized_indexed
}
pub fn is_stored(&self,) -> bool {
self.stored
}
pub fn is_fast(&self,) -> bool {
self.fast
}
pub fn set_stored(mut self,) -> TextOptions {
self.stored = true;
self
}
pub fn set_fast(mut self,) -> TextOptions {
self.fast = true;
self
}
pub fn set_tokenized_indexed(mut self,) -> TextOptions {
self.tokenized_indexed = true;
self
}
pub fn new() -> TextOptions {
TextOptions {
fast: false,
tokenized_indexed: false,
stored: false,
}
}
}
impl BinarySerializable for TextField {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
let TextField(field_id) = *self;
field_id.serialize(writer)
}
fn deserialize(reader: &mut Read) -> io::Result<TextField> {
u8::deserialize(reader).map(TextField)
}
}
impl BinarySerializable for TextFieldValue {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
Ok(
try!(self.field.serialize(writer)) +
try!(self.text.serialize(writer))
)
}
fn deserialize(reader: &mut Read) -> io::Result<Self> {
let field = try!(TextField::deserialize(reader));
let text = try!(String::deserialize(reader));
Ok(TextFieldValue {
field: field,
text: text,
})
}
}
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq)]
pub struct TextFieldValue {
pub field: TextField,
pub text: String,
}
/// The field will be tokenized and indexed
pub const TEXT: TextOptions = TextOptions {
tokenized_indexed: true,
stored: false,
fast: false,
};
/// A stored fields of a document can be retrieved given its DocId.
/// Stored field are stored together and LZ4 compressed.
/// Reading the stored fields of a document is relatively slow.
/// (100 microsecs)
pub const STORED: TextOptions = TextOptions {
tokenized_indexed: false,
stored: true,
fast: false,
};
/// Fast field are used for field you need to access many times during
/// collection. (e.g: for sort, aggregates).
pub const FAST: TextOptions = TextOptions {
tokenized_indexed: false,
stored: false,
fast: true
};
impl BitOr for TextOptions {
type Output = TextOptions;
fn bitor(self, other: TextOptions) -> TextOptions {
let mut res = TextOptions::new();
res.tokenized_indexed = self.tokenized_indexed || other.tokenized_indexed;
res.stored = self.stored || other.stored;
res.fast = self.fast || other.fast;
res
}
}
#[cfg(test)]
mod tests {
use schema::Schema;
use super::*;
#[test]
fn test_field_options() {
{
let field_options = STORED | FAST;
assert!(field_options.is_stored());
assert!(field_options.is_fast());
assert!(!field_options.is_tokenized_indexed());
}
{
let field_options = STORED | TEXT;
assert!(field_options.is_stored());
assert!(!field_options.is_fast());
assert!(field_options.is_tokenized_indexed());
}
{
let mut schema = Schema::new();
let _body_field: TextField = schema.add_text_field("body", &TEXT);
let field = schema.text_field("body");
assert!(schema.text_field_options(&field).is_tokenized_indexed());
}
}
}

76
src/schema/u32_field.rs Normal file
View File

@@ -0,0 +1,76 @@
use std::io;
use std::io::Write;
use std::io::Read;
use common::BinarySerializable;
use rustc_serialize::Encodable;
use rustc_serialize::Decoder;
use rustc_serialize::Encoder;
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
pub struct U32Field(pub u8);
impl BinarySerializable for U32Field {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
let U32Field(field_id) = *self;
field_id.serialize(writer)
}
fn deserialize(reader: &mut Read) -> io::Result<U32Field> {
u8::deserialize(reader).map(U32Field)
}
}
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq)]
pub struct U32FieldValue {
pub field: U32Field,
pub value: u32,
}
#[derive(Clone,Debug,PartialEq,Eq, RustcDecodable, RustcEncodable)]
pub struct U32Options {
indexed: bool,
fast: bool,
stored: bool,
}
impl U32Options {
    /// All flags default to false.
    pub fn new() -> U32Options {
        U32Options {
            fast: false,
            indexed: false,
            stored: false,
        }
    }
    pub fn is_indexed(&self,) -> bool {
        self.indexed
    }
    /// Marks the field as indexed (builder style, consumes self).
    pub fn set_indexed(mut self,) -> U32Options {
        self.indexed = true;
        self
    }
    pub fn is_fast(&self,) -> bool {
        self.fast
    }
    /// Marks the field as a fast field (builder style, consumes self).
    pub fn set_fast(mut self,) -> U32Options {
        self.fast = true;
        self
    }
}
/// The field will be tokenized and indexed
pub const FAST_U32: U32Options = U32Options {
indexed: false,
stored: false,
fast: true,
};

91
src/store/mod.rs Normal file
View File

@@ -0,0 +1,91 @@
mod reader;
mod writer;
use DocId;
pub use self::reader::StoreReader;
pub use self::writer::StoreWriter;
#[derive(Debug, Clone, PartialEq, Eq, Ord, PartialOrd)]
pub struct OffsetIndex(DocId, u64);
#[cfg(test)]
mod tests {
    use super::*;
    use test::Bencher;
    use std::path::PathBuf;
    use schema::Schema;
    use schema::TextOptions;
    use schema::TextFieldValue;
    use directory::{RAMDirectory, Directory, MmapDirectory, WritePtr};

    // Writes 1000 documents, each with a stored "body" (lorem ipsum) and a
    // stored "title" ("Doc {i}"), through a StoreWriter, then closes it.
    // Returns the schema so callers can look the fields up again.
    fn write_lorem_ipsum_store(writer: WritePtr) -> Schema {
        let mut schema = Schema::new();
        let field_body = schema.add_text_field("body", &TextOptions::new().set_stored());
        let field_title = schema.add_text_field("title", &TextOptions::new().set_stored());
        let lorem = String::from("Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.");
        {
            let mut store_writer = StoreWriter::new(writer);
            for i in 0..1000 {
                let mut fields: Vec<TextFieldValue> = Vec::new();
                {
                    let field_value = TextFieldValue {
                        field: field_body.clone(),
                        text: lorem.clone(),
                    };
                    fields.push(field_value);
                }
                {
                    let title_text = format!("Doc {}", i);
                    let field_value = TextFieldValue {
                        field: field_title.clone(),
                        text: title_text,
                    };
                    fields.push(field_value);
                }
                let fields_refs: Vec<&TextFieldValue> = fields.iter().collect();
                store_writer.store(&fields_refs).unwrap();
            }
            // close() flushes the last pending block and writes the skip index.
            store_writer.close().unwrap();
        }
        schema
    }

    // Round-trip: every looked-up doc must yield its own "Doc {i}" title.
    #[test]
    fn test_store() {
        let path = PathBuf::from("store");
        let mut directory = RAMDirectory::create();
        let store_file = directory.open_write(&path).unwrap();
        let schema = write_lorem_ipsum_store(store_file);
        let field_title = schema.text_field("title");
        let store_source = directory.open_read(&path).unwrap();
        let store = StoreReader::new(store_source);
        // Samples non-contiguous doc ids (0, 1, 3, 4, 6, ...) so both
        // in-block skipping and block lookup are exercised.
        for i in (0..10).map(|i| i * 3 / 2) {
            assert_eq!(*store.get(&i).unwrap().get_first_text(&field_title).unwrap(), format!("Doc {}", i));
        }
    }

    // Benchmarks writing the full 1000-doc store into an mmap directory.
    #[bench]
    fn bench_store_encode(b: &mut Bencher) {
        let mut directory = MmapDirectory::create_from_tempdir().unwrap();
        let path = PathBuf::from("store");
        b.iter(|| {
            write_lorem_ipsum_store(directory.open_write(&path).unwrap());
        });
    }

    // Benchmarks fetching a single document from an already-written store.
    #[bench]
    fn bench_store_decode(b: &mut Bencher) {
        let mut directory = MmapDirectory::create_from_tempdir().unwrap();
        let path = PathBuf::from("store");
        write_lorem_ipsum_store(directory.open_write(&path).unwrap());
        let store_source = directory.open_read(&path).unwrap();
        let store = StoreReader::new(store_source);
        b.iter(|| {
            store.get(&12).unwrap();
        });
    }
}

96
src/store/reader.rs Normal file
View File

@@ -0,0 +1,96 @@
use directory::ReadOnlySource;
use std::cell::RefCell;
use DocId;
use schema::Document;
use schema::TextFieldValue;
use common::BinarySerializable;
use std::io::Read;
use std::io::Cursor;
use std::io;
use std::io::SeekFrom;
use std::io::Seek;
use std::cmp::Ordering;
use lz4;
use super::OffsetIndex;
/// Reader over the document store (the stored field values of documents).
pub struct StoreReader {
    // Raw bytes of the store file: compressed blocks followed by the skip index.
    pub data: ReadOnlySource,
    // Skip index entries, starting with the implicit OffsetIndex(0, 0).
    pub offsets: Vec<OffsetIndex>,
    // Scratch buffer holding the most recently decompressed block; RefCell so
    // that `get(&self)` can refill it through a shared reference.
    current_block: RefCell<Vec<u8>>,
}
impl StoreReader {
    /// Reads the skip index serialized at the end of the store file.
    ///
    /// Layout: [blocks...][offsets: Vec<OffsetIndex>][header_offset: u64].
    /// The trailing 8 bytes locate the serialized offsets; the implicit
    /// first entry (0, 0) is prepended here.
    // TODO err — propagating these errors requires a fallible constructor.
    fn read_header(data: &ReadOnlySource) -> Vec<OffsetIndex> {
        // the first offset is implicitely (0, 0)
        let mut offsets = vec!(OffsetIndex(0, 0));
        let mut cursor = Cursor::new(data.as_slice());
        cursor.seek(SeekFrom::End(-8)).unwrap();
        let offset = u64::deserialize(&mut cursor).unwrap();
        cursor.seek(SeekFrom::Start(offset)).unwrap();
        offsets.append(&mut Vec::deserialize(&mut cursor).unwrap());
        offsets
    }

    /// Binary-searches the skip index for the block containing `seek`.
    ///
    /// Returns the entry with the greatest first-doc-id <= `seek`.
    /// `self.offsets` always holds at least the implicit (0, 0) entry,
    /// so the indexing below cannot panic.
    fn block_offset(&self, seek: &DocId) -> OffsetIndex {
        fn search(offsets: &[OffsetIndex], seek: &DocId) -> OffsetIndex {
            if offsets.len() <= 1 {
                return offsets[0].clone();
            }
            let m = offsets.len() / 2;
            let pivot_offset = &offsets[m];
            match pivot_offset.0.cmp(seek) {
                Ordering::Less => search(&offsets[m..], seek),
                Ordering::Equal => pivot_offset.clone(),
                Ordering::Greater => search(&offsets[..m], seek),
            }
        }
        search(&self.offsets, seek)
    }

    /// Decompresses the lz4 block starting at `block_offset` into
    /// `self.current_block`.
    ///
    /// Block layout: [length: u32][lz4 payload of `length` bytes].
    /// Fixed: errors are now propagated with `try!` instead of unwrapped,
    /// honoring the `io::Result` return type.
    fn read_block(&self, block_offset: usize) -> io::Result<()> {
        let mut current_block_mut = self.current_block.borrow_mut();
        current_block_mut.clear();
        let total_buffer = self.data.as_slice();
        let mut cursor = Cursor::new(&total_buffer[block_offset..]);
        let block_length = try!(u32::deserialize(&mut cursor));
        let block_array: &[u8] = &total_buffer[(block_offset + 4)..(block_offset + 4 + block_length as usize)];
        let mut lz4_decoder = try!(lz4::Decoder::new(Cursor::new(block_array)));
        lz4_decoder.read_to_end(&mut current_block_mut).map(|_| ())
    }

    /// Fetches document `doc_id` from the store.
    ///
    /// Decompresses the block containing the document, skips the preceding
    /// documents within the block (each serialized as [len: u32][payload]),
    /// then deserializes the text field values.
    // NOTE(review): every call decompresses its block from scratch, even if
    // `current_block` already holds it — caching the last block offset would
    // help sequential access; confirm intent before changing.
    pub fn get(&self, doc_id: &DocId) -> io::Result<Document> {
        let OffsetIndex(first_doc_id, block_offset) = self.block_offset(doc_id);
        try!(self.read_block(block_offset as usize));
        let mut current_block_mut = self.current_block.borrow_mut();
        let mut cursor = Cursor::new(&mut current_block_mut[..]);
        // Skip the documents preceding `doc_id` within the block.
        for _ in first_doc_id..*doc_id {
            let doc_length = try!(u32::deserialize(&mut cursor));
            try!(cursor.seek(SeekFrom::Current(doc_length as i64)));
        }
        // Skip the target document's own length prefix.
        try!(u32::deserialize(&mut cursor));
        let num_fields = try!(u32::deserialize(&mut cursor));
        let mut text_field_values = Vec::with_capacity(num_fields as usize);
        for _ in 0..num_fields {
            let text_field_value = try!(TextFieldValue::deserialize(&mut cursor));
            text_field_values.push(text_field_value);
        }
        Ok(Document {
            text_field_values: text_field_values,
            // u32 values are not persisted in the store yet.
            u32_field_values: Vec::new(),
        })
    }

    /// Opens a store reader over `data`, eagerly loading the skip index.
    pub fn new(data: ReadOnlySource) -> StoreReader {
        let offsets = StoreReader::read_header(&data);
        StoreReader {
            data: data,
            offsets: offsets,
            current_block: RefCell::new(Vec::new()),
        }
    }
}

112
src/store/writer.rs Normal file
View File

@@ -0,0 +1,112 @@
use directory::WritePtr;
use DocId;
use schema::TextFieldValue;
use common::BinarySerializable;
use std::io::Write;
use std::io::Read;
use std::io;
use lz4;
use super::StoreReader;
use super::OffsetIndex;
// Uncompressed size threshold (bytes, 128 KiB) above which the pending
// block is compressed and flushed.
const BLOCK_SIZE: usize = 131_072;
/// Writer for the document store: documents are buffered into
/// `current_block` and lz4-compressed once the block exceeds BLOCK_SIZE.
pub struct StoreWriter {
    // Number of documents written so far (doc id of the next document).
    doc: DocId,
    offsets: Vec<OffsetIndex>, // TODO have a better index.
    // Total bytes written to `writer` so far.
    written: u64,
    writer: WritePtr,
    // Scratch buffer reused for per-document serialization and compression.
    intermediary_buffer: Vec<u8>,
    // Uncompressed pending block.
    current_block: Vec<u8>,
}
impl BinarySerializable for OffsetIndex {
    /// Writes the (doc, offset) pair; returns total bytes written.
    fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
        let doc_bytes = try!(self.0.serialize(writer));
        let offset_bytes = try!(self.1.serialize(writer));
        Ok(doc_bytes + offset_bytes)
    }

    /// Reads back a (doc, offset) pair in the order it was written.
    fn deserialize(reader: &mut Read) -> io::Result<OffsetIndex> {
        let doc = try!(DocId::deserialize(reader));
        let offset = try!(u64::deserialize(reader));
        Ok(OffsetIndex(doc, offset))
    }
}
impl StoreWriter {
pub fn new(writer: WritePtr) -> StoreWriter {
StoreWriter {
doc: 0,
written: 0,
offsets: Vec::new(),
writer: writer,
intermediary_buffer: Vec::new(),
current_block: Vec::new(),
}
}
pub fn stack_reader(&mut self, reader: &StoreReader) -> io::Result<()> {
if self.current_block.len() > 0 {
try!(self.write_and_compress_block());
}
match reader.offsets.last() {
Some(&OffsetIndex(ref num_docs, ref body_size)) => {
try!(self.writer.write_all(&reader.data.as_slice()[0..*body_size as usize]));
for &OffsetIndex(doc, offset) in reader.offsets.iter() {
self.offsets.push(OffsetIndex(self.doc + doc, self.written + offset));
}
self.written += *body_size;
self.doc += *num_docs;
Ok(())
},
None => {
Err(io::Error::new(io::ErrorKind::Other, "No offset for reader"))
}
}
}
pub fn store<'a>(&mut self, field_values: &Vec<&'a TextFieldValue>) -> io::Result<()> {
self.intermediary_buffer.clear();
try!((field_values.len() as u32).serialize(&mut self.intermediary_buffer));
for field_value in field_values.iter() {
try!((*field_value).serialize(&mut self.intermediary_buffer));
}
try!((self.intermediary_buffer.len() as u32).serialize(&mut self.current_block));
try!(self.current_block.write_all(&self.intermediary_buffer[..]));
self.doc += 1;
if self.current_block.len() > BLOCK_SIZE {
try!(self.write_and_compress_block());
}
Ok(())
}
fn write_and_compress_block(&mut self,) -> io::Result<()> {
self.intermediary_buffer.clear();
{
let mut encoder = lz4::EncoderBuilder::new()
.build(&mut self.intermediary_buffer)
.unwrap();
try!(encoder.write_all(&self.current_block));
let (_, encoder_result) = encoder.finish();
try!(encoder_result);
}
let compressed_block_size = self.intermediary_buffer.len() as u64;
self.written += try!((compressed_block_size as u32).serialize(&mut self.writer)) as u64;
try!(self.writer.write_all(&self.intermediary_buffer));
self.written += compressed_block_size;
self.offsets.push(OffsetIndex(self.doc, self.written));
self.current_block.clear();
Ok(())
}
pub fn close(&mut self,) -> io::Result<()> {
if self.current_block.len() > 0 {
try!(self.write_and_compress_block());
}
let header_offset: u64 = self.written;
try!(self.offsets.serialize(&mut self.writer));
try!(header_offset.serialize(&mut self.writer));
self.writer.flush()
}
}